Appendix: Background

1 O notation

Throughout this chapter and the rest of the book, we will describe the asymptotic behavior of a function using O notation.

For two functions f(t) and g(t), we say that f(t) \le O(g(t)) if f is asymptotically upper bounded by g. Formally, this means that there exists some constant C > 0 such that f(t) \le C \cdot g(t) for all t past some point t_0.

We say f(t) < o(g(t)) if asymptotically f grows strictly slower than g. Formally, this means that for any scalar C > 0, there exists some t_0 such that f(t) \le C \cdot g(t) for all t > t_0. Equivalently, we say f(t) < o(g(t)) if \lim_{t \to \infty} f(t)/g(t) = 0.

f(t) = \Theta(g(t)) means that f and g grow at the same rate asymptotically. That is, f(t) \le O(g(t)) and g(t) \le O(f(t)).

Finally, we use f(t) \ge \Omega(g(t)) to mean that g(t) \le O(f(t)), and f(t) > \omega(g(t)) to mean that g(t) < o(f(t)).

We also use the notation \tilde O(g(t)) to hide logarithmic factors. That is, f(t) = \tilde O(g(t)) if there exists some constant C such that f(t) \le C \cdot g(t) \cdot \log^k(t) for some k and all t.
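
As a concrete check of the first definition, take f(t) = 3t^2 + 5t and g(t) = t^2: choosing C = 4 and t_0 = 5 gives 3t^2 + 5t \le 4t^2 for all t > 5 (since 5t \le t^2 once t \ge 5), so f(t) \le O(t^2). Since also t^2 \le f(t), we in fact have f(t) = \Theta(t^2).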

Occasionally, we will also use O(f(t)) (or one of the other symbols) as shorthand to manipulate function classes. For example, we might write O(f(t)) + O(g(t)) = O(f(t) + g(t)) to mean that the sum of two functions in O(f(t)) and O(g(t)) is in O(f(t) + g(t)).

2 Python

3 Multi-Armed Bandits

3.1 Introduction

The multi-armed bandits (MAB) setting is a simple framework for studying the basic challenges of sequential decision-making. In this setting, an agent repeatedly chooses from a fixed set of actions, called arms, each of which has an associated reward distribution. The agent's goal is to maximize the total reward it receives over some time period.

In particular, we’ll spend a lot of time discussing the Exploration-Exploitation Tradeoff: should the agent choose new actions to learn more about the environment, or should it choose actions that it already knows to be good?

In this chapter, we will introduce the multi-armed bandits setting, and discuss some of the challenges that arise when trying to solve problems in this setting. We will also introduce some of the key concepts that we will use throughout the book, such as regret and exploration-exploitation tradeoffs.

from jaxtyping import Float, Array
import numpy as np
import latexify
from typing import Callable, Union
# ... (further imports and the opening of the latexify configuration call are elided here) ...
    identifiers={"arm": "a_t", "reward": "r", "means": "mu"},
    use_math_symbols=True,
    escape_underscores=False,
)

Let K denote the number of arms. We'll label them 0, \dots, K-1 and use superscripts to indicate the arm index; since we seldom need to raise a number to a power, this won't cause much confusion. In this chapter, we'll consider the Bernoulli bandit setting from the examples above, where arm k either returns reward 1 with probability \mu^k or 0 otherwise. The agent gets to pull an arm T times in total. We can formalize the Bernoulli bandit in the following Python code:

class MAB:
     """
     The Bernoulli multi-armed bandit environment.
 
    """
    # ... (the constructor, which stores `means`, `T`, `K`, and `best_arm`, is elided in this excerpt) ...
     def pull(self, k: int) -> int:
         """Pull the `k`-th arm and sample from its (Bernoulli) reward distribution."""
         reward = np.random.rand() < self.means[k].item()
        return +reward  # unary + converts the Boolean outcome into an integer reward (0 or 1)
mab = MAB(means=np.array([0.1, 0.8, 0.4]), T=100)

In pseudocode, the agent's interaction with the MAB environment can be described by the following process:

@latex
def mab_loop(mab: MAB, agent: "Agent") -> int:
    for t in range(mab.T):
        arm = agent.choose_arm()  # in 0, ..., K-1
        reward = mab.pull(arm)
        agent.update_history(arm, reward)
 
 
mab_loop

The Agent class stores the pull history and uses it to decide which arm to pull next. Since we are working with Bernoulli bandits, we can summarize the pull history concisely in a \mathbb{N}^{K \times 2} array.
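
For example, history[1, 0] counts how many times arm 1 returned reward 0 and history[1, 1] how many times it returned reward 1, so history.sum(axis=1) recovers the number of pulls of each arm (see the update_history method below).
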
class Agent:
     def __init__(self, K: int, T: int):
         """The MAB agent that decides how to choose an arm given the past history."""
         self.K = K
        # ... (the rest of __init__, which sets up the reward/choice history, is elided here) ...
     def update_history(self, arm: int, reward: int):
         self.rewards.append(reward)
         self.choices.append(arm)
        self.history[arm, reward] += 1

What's the optimal strategy for the agent, i.e. the one that achieves the highest expected reward? Convince yourself that the agent should try to always pull the arm with the highest expected reward:

\mu^\star := \max_{k \in [K]} \mu^k.

The goal, then, can be rephrased as to minimize the regret, defined below:

def regret_per_step(mab: MAB, agent: Agent):
    """Get the difference from the average reward of the optimal arm. The sum of these is the regret."""
    return [mab.means[mab.best_arm] - mab.means[arm] for arm in agent.choices]

Note that this depends on the true means of the pulled arms, not the actual observed rewards. We typically think of this as a random variable where the randomness comes from the agent's strategy (i.e. the sequence of actions it takes). We will bound the regret of our algorithms in two different senses:

  1. Upper bound the expected regret, i.e. show \E[\text{Regret}_T] \le M_T.

  2. Find a high-probability upper bound on the regret, i.e. show \pr(\text{Regret}_T \le M_{T, \delta}) \ge 1-\delta.

Note that these two different approaches say very different things about the regret. The first approach says that the average regret is at most M_T. However, the agent might still achieve higher regret on many runs. The second approach says that, with high probability, the agent will achieve regret at most M_{T, \delta}. However, it doesn't say anything about the regret in the remaining δ fraction of runs, which might be arbitrarily high.

We'd like to achieve sublinear regret in expectation, i.e. \E[\text{Regret}_T] = o(T). That is, as we learn more about the environment, we'd like to be able to exploit that knowledge to take the optimal arm as often as possible.
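
To make these two notions concrete, here is a minimal sketch (not part of the original notebook) of estimating the expected regret by simulation. It assumes the MAB environment, mab_loop, and regret_per_step defined in this chapter, together with any Agent subclass (such as those introduced below):

def estimate_expected_regret(make_agent: Callable[[], Agent], mab: MAB, n_runs: int = 100) -> float:
    """Average the total regret over repeated runs to approximate the expected regret."""
    totals = []
    for _ in range(n_runs):
        agent = make_agent()          # fresh agent for each run
        mab_loop(mab, agent)          # interact with the environment for T steps
        totals.append(sum(regret_per_step(mab, agent)))
    return float(np.mean(totals))

Replacing the mean with, say, np.quantile(totals, 0.95) would give an empirical analogue of the second, high-probability style of bound.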

The rest of the chapter comprises a series of increasingly sophisticated MAB algorithms.

def plot_strategy(mab: MAB, agent: Agent):
     plt.figure(figsize=(10, 6))
 
     # plot reward and cumulative regret
    # ... (code plotting the per-timestep reward and cumulative regret is elided here) ...
     plt.xlabel("timestep")
     plt.legend()
     plt.title(f"{agent.__class__.__name__} reward and regret")
    plt.show()

3.2 Pure exploration (random guessing)

A trivial strategy is to always choose arms at random (i.e. "pure exploration").

class PureExploration(Agent):
     def choose_arm(self):
         """Choose an arm uniformly at random."""
        return solutions.pure_exploration_choose_arm(self)

Note that

\E_{a_t \sim \text{Unif}([K])}[\mu^{a_t}] = \bar \mu = \frac{1}{K} \sum_{k=1}^K \mu^k

so the expected regret is simply

\begin{aligned}
\E[\text{Regret}_T] &= \sum_{t=0}^{T-1} \E[\mu^\star - \mu^{a_t}] \\
&= T (\mu^\star - \bar \mu) > 0.
\end{aligned}

This scales as \Theta(T), i.e. linear in the number of timesteps T. There's no learning here: the agent doesn't use any information about the environment to improve its strategy. You can see that the distribution over its arm choices always appears "(uniformly) random".

agent = PureExploration(mab.K, mab.T)
mab_loop(mab, agent)
plot_strategy(mab, agent)

3.3 Pure greedy

How might we improve on pure exploration? Instead, we could try each arm once, and then commit to the one with the highest observed reward. We'll call this the pure greedy strategy.

class PureGreedy(Agent):
     def choose_arm(self):
         """Choose the arm with the highest observed reward on its first pull."""
        return solutions.pure_greedy_choose_arm(self)

Note we've used superscripts r^k during the exploration phase to indicate that we observe exactly one reward for each arm. Then we use subscripts r_t during the exploitation phase to indicate that we observe a sequence of rewards from the chosen greedy arm \hat k.

How does the expected regret of this strategy compare to that of pure exploration? For simplicity, suppose there are just two arms, with reward distributions with means \mu^0 > \mu^1.

Let's let r^0 be the random reward from the first arm and r^1 be the random reward from the second. If r^0 > r^1, then we achieve zero regret. Otherwise, we achieve regret T(\mu^0 - \mu^1). Thus, the expected regret is simply:

\begin{aligned}
\E[\text{Regret}_T] &= \pr(r^0 < r^1) \cdot T(\mu^0 - \mu^1) + c \\
&= (1 - \mu^0) \mu^1 \cdot T(\mu^0 - \mu^1) + c
\end{aligned}

Which is still \Theta(T), the same as pure exploration!

agent = PureGreedy(mab.K, mab.T)
mab_loop(mab, agent)
plot_strategy(mab, agent)

The cumulative regret is a straight line because the regret only depends on the arms chosen and not the actual reward observed. In fact, if the greedy algorithm happens to get lucky on the first set of pulls, it may act entirely optimally for that episode! But its average regret is what measures its effectiveness.

3.4 Explore-then-commit

We can improve the pure greedy algorithm as follows: let's reduce the variance of the reward estimates by pulling each arm N_{\text{explore}} > 1 times before committing. This is called the explore-then-commit strategy. Note that the "pure greedy" strategy above is just the special case where N_{\text{explore}} = 1.
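
As a rough illustration of this rule (an assumed implementation for exposition, not the notebook's hidden solution), explore-then-commit can be sketched as a function of the agent's state:

def etc_choose_arm_sketch(agent) -> int:
    """Illustrative only: pull arms round-robin for N_explore rounds, then commit to the best sample mean."""
    if len(agent.choices) < agent.N_explore * agent.K:
        return len(agent.choices) % agent.K  # exploration phase: cycle through the arms
    counts = agent.history.sum(axis=1)       # number of pulls of each arm
    return int(np.argmax(agent.history[:, 1] / counts))  # exploitation phase: best sample mean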

class ExploreThenCommit(Agent):
     def __init__(self, K: int, T: int, N_explore: int):
         super().__init__(K, T)
         self.N_explore = N_explore
 
     def choose_arm(self):
        return solutions.etc_choose_arm(self)
agent = ExploreThenCommit(mab.K, mab.T, mab.T // 15)
mab_loop(mab, agent)
plot_strategy(mab, agent)

Notice that now, the graphs are much more consistent, and the algorithm finds the true optimal arm and sticks with it much more frequently. We would expect ETC to then have a better (i.e. lower) average regret. Can we prove this?

3.4.1 ETC regret analysis

Let’s analyze the expected regret of the explore-then-commit strategy by splitting it up into the exploration and exploitation phases.

3.4.1.1 Exploration phase.

This phase takes N_{\text{explore}} K timesteps. Since at each step we incur at most 1 regret, the total regret is at most N_{\text{explore}} K.

3.4.1.2 Exploitation phase.

This will take a bit more effort. We'll prove that for any total time T, we can choose N_{\text{explore}} such that with arbitrarily high probability, the regret is sublinear.

Let \hat k denote the arm chosen after the exploration phase. We know the regret from the exploitation phase is

T_{\text{exploit}} (\mu^\star - \mu^{\hat k}) \qquad \text{where} \qquad T_{\text{exploit}} := T - N_{\text{explore}} K.

So we'd like to bound \mu^\star - \mu^{\hat k} = o(1) (as a function of T) in order to achieve sublinear regret. How can we do this?

Let's define \Delta^k = \hat \mu^k - \mu^k to denote how far the mean estimate for arm k is from the true mean. How can we bound this quantity? We'll use the following useful inequality for i.i.d. bounded random variables:
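
For reference, the statement being invoked is the standard (two-sided) Hoeffding bound: for n i.i.d. samples X_0, \dots, X_{n-1} taking values in [0, 1], with true mean \mu and sample average \bar X,

\pr\left( |\bar X - \mu| > \sqrt{\frac{\ln(2/\delta)}{2n}} \right) \le \delta.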

The proof of this inequality is beyond the scope of this book. See Vershynin (2018) Chapter 2.2.

We can apply this directly to the rewards for a given arm k, since the rewards from that arm are i.i.d.:

\pr\left(|\Delta^k| > \sqrt{\frac{\ln(2/\delta)}{2N_{\text{explore}}}} \right) \le \delta.

Then to apply this bound to \hat k in particular, we can apply the useful trick of "adding zero":

\begin{aligned}
\mu^{k^\star} - \mu^{\hat k} &= \mu^{k^\star} - \mu^{\hat k} + (\hat \mu^{k^\star} - \hat \mu^{k^\star}) + (\hat \mu^{\hat k} - \hat \mu^{\hat k}) \\
&= \Delta^{\hat k} - \Delta^{k^\star} + \underbrace{(\hat \mu^{k^\star} - \hat \mu^{\hat k})}_{\le 0 \text{ by definition of } \hat k} \\
&\le 2 \sqrt{\frac{\ln(2K/\delta')}{2N_{\text{explore}}}} \text{ with probability at least } 1-\delta'
\end{aligned}

where we've set \delta' = K\delta. Putting this all together, we've shown that, with probability 1 - \delta',

\text{Regret}_T \le N_{\text{explore}} K + T_{\text{exploit}} \cdot \sqrt{\frac{2\ln(2K/\delta')}{N_{\text{explore}}}}.

Note that it suffices for N_{\text{explore}} to be on the order of \sqrt{T} to achieve sublinear regret. In particular, we can find the optimal N_{\text{explore}} by setting the derivative of the r.h.s. to zero:

\begin{aligned}
0 &= K - T_{\text{exploit}} \cdot \frac{1}{2} \sqrt{\frac{2\ln(2K/\delta')}{N_{\text{explore}}^3}} \\
N_{\text{explore}} &= \left( T_{\text{exploit}} \cdot \frac{\sqrt{\ln(2K/\delta')/2}}{K} \right)^{2/3}
\end{aligned}

Plugging this into the expression for the regret, we have (still with probability 1-\delta')

\begin{aligned}
\text{Regret}_T &\le 3 T^{2/3} \sqrt[3]{K \ln(2K/\delta') / 2} \\
&= \tilde{O}(T^{2/3} K^{1/3}).
\end{aligned}
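
To get a feel for the scale of this bound, here is a small sketch (not part of the original notebook) that evaluates it for the example environment above, approximating T_exploit by T for simplicity:

K, T, delta_prime = 3, 100, 0.05
N_explore_opt = (T * np.sqrt(np.log(2 * K / delta_prime) / 2) / K) ** (2 / 3)  # optimal N_explore (with T_exploit ≈ T)
regret_bound = 3 * T ** (2 / 3) * np.cbrt(K * np.log(2 * K / delta_prime) / 2)
print(N_explore_opt, regret_bound)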

The ETC algorithm is rather "abrupt" in that it switches from exploration to exploitation after a fixed number of timesteps. In practice, it's often better to use a more gradual transition, which brings us to the epsilon-greedy algorithm.

3.5 Epsilon-greedy

Instead of doing all of the exploration and then all of the exploitation separately – which additionally requires knowing the time horizon beforehand – we can instead interleave exploration and exploitation by, at each timestep, choosing a random action with some probability. We call this the epsilon-greedy algorithm.

class EpsilonGreedy(Agent):
     def __init__(
         self,
         K: int,
        # ... (the remaining parameters and the start of __init__ are elided in this excerpt) ...
         self.ε_array = ε_array
 
     def choose_arm(self):
        return solutions.epsilon_greedy_choose_arm(self)
agent = EpsilonGreedy(mab.K, mab.T, np.full(mab.T, 0.1))
mab_loop(mab, agent)
plot_strategy(mab, agent)

Note that we let ε vary over time. In particular, we might want to gradually decrease ε as we learn more about the reward distributions and no longer need to spend time exploring.

It turns out that setting \epsilon_t = \sqrt[3]{K \ln(t)/t} also achieves a regret of \tilde O(t^{2/3} K^{1/3}) (ignoring the logarithmic factors). (We will not prove this here.) TODO ADD PROOF CITATION
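
As a rough sketch of this schedule (not from the original notebook), one could build the ε array directly from the formula above, clip it into [0, 1], and pass it to the EpsilonGreedy agent defined earlier:

t = np.arange(1, mab.T + 1)  # start at 1 to avoid log(0) and division by zero
eps_decay = np.clip(np.cbrt(mab.K * np.log(t) / t), 0.0, 1.0)
agent = EpsilonGreedy(mab.K, mab.T, eps_decay)
mab_loop(mab, agent)
plot_strategy(mab, agent)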

In ETC, we had to set N_{\text{explore}} based on the total number of timesteps T. But the epsilon-greedy algorithm actually handles the exploration automatically: the regret rate holds for any t, and doesn't depend on the final horizon T.

But the way these algorithms explore is rather naive: we've been exploring uniformly across all the arms. What if we could be smarter about it, and explore more for arms that we're less certain about?

3.6 Upper Confidence Bound (UCB)

To quantify how certain we are about the mean of each arm, we'll compute confidence intervals for our estimators, and then choose the arm with the highest upper confidence bound. This operates on the principle of the benefit of the doubt (i.e. optimism in the face of uncertainty). For this to work, we will need confidence bounds that hold uniformly across all timesteps and arms. Let's introduce some notation to discuss this.

Let N^k_t denote the (random) number of times arm k has been pulled within the first t timesteps, and \hat \mu^k_t denote the sample average of those pulls. That is,

\begin{aligned}
N^k_t &:= \sum_{\tau=0}^{t-1} \mathbf{1} \{ a_\tau = k \} \\
\hat \mu^k_t &:= \frac{1}{N^k_t} \sum_{\tau=0}^{t-1} \mathbf{1} \{ a_\tau = k \} r_\tau.
\end{aligned}

To achieve the “fixed sample size” assumption, we’ll need to shift our index from time to number of samples from each arm. In particular, we’ll define \tilde r^k_n to be the nth sample from arm k, and \tilde \mu^k_n to be the sample average of the first n samples from arm k. The difficulty is that N^k_t, the number of samples we actually have from arm k at time t, is itself random. However, we know N^k_t \le t (where equality would be the case if and only if we had pulled arm k every time), so we can apply the same trick as last time, where we uniform-ize across all possible values of N^k_t:

P(nt,μ~nkμkln(2/δ)2n)1tδ.\begin{aligned} \pr\left( \forall n \le t, |\tilde \mu^k_n - \mu^k | \le \sqrt{\frac{\ln(2/\delta)}{2n}} \right) &\ge 1-t\delta. \end{aligned}

In particular, since NtktN^k_t \le t, and μ~Ntkk=μ^tk\tilde \mu^k_{N^k_t} = \hat \mu^k_t by definition, we have

P(μ^tkμkln(2t/δ)2Ntk)1δ where δ:=tδ.\begin{aligned} \pr\left( |\hat \mu^k_t - \mu^k | \le \sqrt{\frac{\ln(2t/\delta')}{2N^k_t}} \right) &\ge 1-\delta' \text{ where } \delta' := t \delta. \end{aligned}

This bound would then suffice for applying the UCB algorithm! That is, the upper confidence bound for arm kk would be

Mtk:=μ^tk+ln(2t/δ)2Ntk,M^k_t := \hat \mu^k_t + \sqrt{\frac{\ln(2t/\delta')}{2N^k_t}},

where we can choose δ\delta' depending on how tight we want the interval to be.

  • A smaller δ\delta' would give us a larger and higher-confidence interval, emphasizing the exploration term.
  • A larger δ\delta' would give a tighter and lower-confidence interval, prioritizing the current sample averages.

We can now use this to define the UCB algorithm.

class UCB(Agent):
     def __init__(self, K: int, T: int, delta: float):
         super().__init__(K, T)
         self.delta = delta
 
     def choose_arm(self):
        return solutions.ucb_choose_arm(self)
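
As with the other agents, the actual implementation is left to the solutions module. A minimal sketch of the computation it needs to perform, assuming the K×2 history array of reward counts from Agent (forcing one pull of each arm first and the exact handling of t are choices made here, not the official solution):

def ucb_choose_arm_sketch(agent: UCB) -> int:
    pulls = agent.history.sum(axis=1)                  # N^k_t for each arm
    means = agent.history[:, 1] / np.maximum(pulls, 1)  # sample average per arm
    t = max(agent.count(), 1)
    bonus = np.sqrt(np.log(2 * t / agent.delta) / (2 * np.maximum(pulls, 1)))
    ucbs = np.where(pulls > 0, means + bonus, np.inf)   # unpulled arms get priority
    return random_argmax(ucbs)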

Intuitively, UCB prioritizes arms where:

  1. μ^tk\hat \mu^k_t is large, i.e. the arm has a high sample average, and we’d choose it for exploitation, and

  2. ln(2t/δ)2Ntk\sqrt{\frac{\ln(2t/\delta')}{2N^k_t}} is large, i.e. we’re still uncertain about the arm, and we’d choose it for exploration.

As desired, this explores in a smarter, adaptive way compared to the previous algorithms. Does it achieve lower regret?

agent = UCB(mab.K, mab.T, 0.9)
mab_loop(mab, agent)
plot_strategy(mab, agent)
<Figure size 1000x600 with 1 Axes>

3.6.1UCB regret analysis

First we’ll bound the regret incurred at each timestep. Then we’ll bound the total regret across timesteps.

For the sake of analysis, we’ll use a slightly looser bound that applies across the whole time horizon and across all arms. We’ll omit the derivation since it’s very similar to the above (walk through it yourself for practice).

P(kK,t<T.μ^tkμkBtk)1δwhereBtk:=ln(2TK/δ)2Ntk.\begin{aligned} \pr\left(\forall k \le K, t < T. |\hat \mu^k_t - \mu^k | \le B^k_t \right) &\ge 1-\delta'' \\ \text{where} \quad B^k_t &:= \sqrt{\frac{\ln(2TK/\delta'')}{2N^k_t}}. \end{aligned}

Intuitively, B^k_t denotes the width of the CI for arm k at time t. Then, assuming the above uniform bound holds (which occurs with probability 1-\delta''), we can bound the regret at each timestep as follows:

\begin{aligned}
\mu^\star - \mu^{a_t} &\le \hat \mu^{k^\star}_t + B_t^{k^\star} - \mu^{a_t} && \text{applying UCB to arm } k^\star \\
&\le \hat \mu^{a_t}_t + B^{a_t}_t - \mu^{a_t} && \text{since UCB chooses } a_t = \arg \max_{k \in [K]} \hat \mu^k_t + B_t^{k} \\
&\le 2 B^{a_t}_t && \text{since } \hat \mu^{a_t}_t - \mu^{a_t} \le B^{a_t}_t \text{ by definition of } B^{a_t}_t
\end{aligned}

Summing this across timesteps gives

\begin{aligned}
\text{Regret}_T &\le \sum_{t=0}^{T-1} 2 B^{a_t}_t \\
&= \sqrt{2\ln(2TK/\delta'')} \sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} \\
\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} &= \sum_{t=0}^{T-1} \sum_{k=1}^K \mathbf{1}\{ a_t = k \} (N^k_t)^{-1/2} \\
&= \sum_{k=1}^K \sum_{n=1}^{N_T^k} n^{-1/2} \\
&\le K \sum_{n=1}^{T} n^{-1/2} \\
\sum_{n=1}^{T} n^{-1/2} &\le 1 + \int_1^{T} x^{-1/2} \ dx \\
&= 1 + \left( 2 \sqrt{x} \right)_1^{T} \\
&= 2 \sqrt{T} - 1 \le 2 \sqrt{T}
\end{aligned}

Putting everything together gives

RegretT2K2Tln(2TK/δ)with probability 1δ=O~(KT)\begin{aligned} \text{Regret}_T &\le 2 K \sqrt{2T \ln(2TK/\delta'')} && \text{with probability } 1-\delta'' \\ &= \tilde O(K\sqrt{T}) \end{aligned}

In fact, we can do a more sophisticated analysis to trim off a factor of K\sqrt{K}.

3.6.2Lower bound on regret (intuition)

Is it possible to do better than \Omega(\sqrt{T}) in general? Intuitively, the answer is no: with only T pulls spread across the arms, we can only estimate each arm’s mean to within roughly 1/\sqrt{T}, so if two arms have means within about 1/\sqrt{T} of each other, we won’t be able to confidently tell them apart, and will sample them about equally. But then we’ll incur regret

Ω((T/2)(1/T))=Ω(T).\Omega((T/2) \cdot (1/\sqrt{T})) = \Omega(\sqrt{T}).

3.7Thompson sampling and Bayesian bandits

So far, we’ve treated the parameters \mu^0, \dots, \mu^{K-1} of the reward distributions as fixed. Instead, we can take a Bayesian approach where we treat them as random variables from some prior distribution. Then, upon pulling an arm and observing a reward, we can condition on that observation to update the posterior distribution over the parameters. This fully describes the information we gain about the parameters from observing the reward.

From this Bayesian perspective, the Thompson sampling algorithm follows naturally: just sample from the distribution of the optimal arm, given the observations!

class Distribution:
     def sample(self) -> Float[Array, " K"]:
         """Sample a vector of means for the K arms."""
         ...
 
     def update(self, arm: int, reward: float):
         """Condition on obtaining `reward` from the given arm."""
        ...

class ThompsonSampling(Agent):
     def __init__(self, K: int, T: int, prior: Distribution):
         super().__init__(K, T)
         self.distribution = prior

    def choose_arm(self):
        # Thompson sampling: draw a plausible mean for each arm from the current
        # posterior, then play the arm that looks best under that draw.
        sample = self.distribution.sample()
        return random_argmax(sample)
 
     def update_history(self, arm: int, reward: int):
         super().update_history(arm, reward)
        self.distribution.update(arm, reward)

In other words, we sample each arm proportionally to how likely we think it is to be optimal, given the observations so far. This strikes a good exploration-exploitation tradeoff: we explore more for arms that we’re less certain about, and exploit more for arms that we’re more certain about. Thompson sampling is a simple yet powerful algorithm that achieves state-of-the-art performance in many settings.

For Bernoulli rewards, a convenient prior is the Beta distribution, which is conjugate to the Bernoulli likelihood: conditioning on an observed reward simply increments one of its two parameters, so we never have to recompute the entire posterior distribution from scratch.

class Beta(Distribution):
     def __init__(self, K: int, alpha: int = 1, beta: int = 1):
         self.alphas = np.full(K, alpha)
         self.betas = np.full(K, beta)

    def sample(self) -> Float[Array, " K"]:
        # Draw an independent plausible mean for each arm from its Beta posterior.
        return np.random.beta(self.alphas, self.betas)
 
     def update(self, arm: int, reward: int):
         self.alphas[arm] += reward
        self.betas[arm] += 1 - reward

beta_distribution = Beta(mab.K)
 agent = ThompsonSampling(mab.K, mab.T, beta_distribution)
 mab_loop(mab, agent)
plot_strategy(mab, agent)
<Figure size 1000x600 with 1 Axes>
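
We can make the earlier claim, that Thompson sampling pulls each arm according to how likely it is to be optimal, concrete by Monte Carlo sampling from the learned posterior. This snippet is illustrative and not part of the original notebook:

posterior_draws = np.array([beta_distribution.sample() for _ in range(10_000)])
prob_optimal = np.bincount(posterior_draws.argmax(axis=1), minlength=mab.K) / len(posterior_draws)
print(prob_optimal)  # Thompson sampling pulls each arm with roughly these probabilities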

It turns out that asymptotically, Thompson sampling is optimal in the following sense. Lai & Robbins (1985) prove an instance-dependent lower bound that says for any bandit algorithm,

lim infTE[NTk]ln(T)1KL(μkμ)\liminf_{T \to \infty} \frac{\E[N_T^k]}{\ln(T)} \ge \frac{1}{\text{KL}(\mu^k \parallel \mu^\star)}

where

KL(μkμ)=μklnμkμ+(1μk)ln1μk1μ\text{KL}(\mu^k \parallel \mu^\star) = \mu^k \ln \frac{\mu^k}{\mu^\star} + (1 - \mu^k) \ln \frac{1 - \mu^k}{1 - \mu^\star}

measures the Kullback-Leibler divergence from the Bernoulli distribution with mean \mu^k to the Bernoulli distribution with mean \mu^\star. It turns out that Thompson sampling achieves this lower bound with equality! That is, not only is the error rate optimal, but the constant factor is optimal as well.
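
For reference, this divergence is simple to compute directly. A small helper (not from the original notes):

def bernoulli_kl(p: float, q: float) -> float:
    """KL divergence from Bernoulli(p) to Bernoulli(q), assuming 0 < p, q < 1."""
    return p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))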

3.8Contextual bandits

In the above MAB environment, the reward distributions of the arms remain constant. However, in many real-world settings, we might receive additional information that affects these distributions. For example, in the online advertising case where each arm corresponds to an ad we could show the user, we might also observe information about the user (say, their profile or browsing history) that changes how likely they are to click on each ad. We call this additional information the context x_t. At each timestep, the learner first gets to observe the context, and choose an action a_t according to some context-dependent policy \pi_t(x_t). Then, the learner observes the reward from the chosen arm r_t \sim \nu^{a_t}(x_t). The reward distribution also depends on the context.

Assuming our context is discrete, we can just perform the same algorithms, treating each context-arm pair as its own arm. This gives us an enlarged MAB of KXK |\mathcal{X}| arms.
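
One way to picture this reduction, as a hypothetical sketch rather than code from the notes, is to keep one ordinary MAB agent per discrete context and delegate each decision to the agent for the observed context:

class PerContextUCB:
    """Treats each (context, arm) pair as its own arm by running one UCB agent per context."""

    def __init__(self, num_contexts: int, K: int, T: int, delta: float):
        self.agents = [UCB(K, T, delta) for _ in range(num_contexts)]

    def choose_arm(self, context: int) -> int:
        return self.agents[context].choose_arm()

    def update_history(self, context: int, arm: int, reward: int):
        self.agents[context].update_history(arm, reward)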

Recall that running UCB for T timesteps on an MAB with K arms achieves a regret bound of \tilde{O}(\sqrt{TK}). Applying this to the enlarged MAB with K |\mathcal{X}| arms therefore gives a regret bound of \tilde{O}(\sqrt{T K |\mathcal{X}|}). But this treats the contexts as completely unrelated to each other, while in practice, often contexts are related to each other in some way: for example, we might want to advertise similar products to users with similar preferences. How can we incorporate this structure into our solution?

3.8.1Linear contextual bandits

We want to model the mean reward of arm k as a function of the context, i.e. \mu^k(x). One simple model is the linear one: \mu^k(x) = x^\top \theta^k, where x \in \mathcal{X} = \mathbb{R}^d and \theta^k \in \mathbb{R}^d describes a feature direction for arm k. Recall that supervised learning gives us a way to estimate a conditional expectation from samples: we learn a least squares estimator from the timesteps where arm k was selected:

θ^tk=argminθRd{i[t]:ai=k}(rixiθ)2.\hat \theta_t^k = \arg\min_{\theta \in \mathbb{R}^d} \sum_{\{ i \in [t] : a_i = k \}} (r_i - x_i^\top \theta)^2.

This has the closed-form solution known as the ordinary least squares (OLS) estimator:

\begin{aligned}
\hat \theta_t^k & = (A_t^k)^{-1} \sum_{\{ i \in [t] : a_i = k \}} x_i r_i \\
\text{where} \quad A_t^k & = \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top.
\end{aligned}

Recall that UCB relies on a high-probability bound on the deviation of the sample mean, our estimator, from the true mean. However, now our estimator is not a sample mean, but rather the OLS estimator above (3.30). Instead, we’ll use Chebyshev’s inequality to construct an upper confidence bound.

Chebyshev’s inequality says that for a random variable Y with \E Y = 0 and \E Y^2 = \sigma^2,

Yβσwith probability11β2|Y| \le \beta \sigma \quad \text{with probability} \ge 1 - \frac{1}{\beta^2}
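
As a quick sanity check of this inequality (illustrative only), we can compare the empirical frequency against the bound for a standard normal Y:

Y = np.random.randn(100_000)  # a random variable with mean 0 and second moment 1
b = 2.0
print((np.abs(Y) <= b).mean(), ">=", 1 - 1 / b**2)  # roughly 0.95 >= 0.75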

Since the OLS estimator is known to be unbiased (try proving this yourself), we can apply Chebyshev’s inequality to x_t^\top (\hat \theta_t^k - \theta^k):

xtθkxtθ^tk+βxt(Atk)1xtwith probability11β2\begin{aligned} x_t^\top \theta^k \le x_t^\top \hat \theta_t^k + \beta \sqrt{x_t^\top (A_t^k)^{-1} x_t} \quad \text{with probability} \ge 1 - \frac{1}{\beta^2} \end{aligned}

The first term is exactly our predicted reward \hat \mu^k_t(x_t). To interpret the second term, note that

xt(Atk)1xt=1Ntkxt(Σtk)1xt,x_t^\top (A_t^k)^{-1} x_t = \frac{1}{N_t^k} x_t^\top (\Sigma_t^k)^{-1} x_t,

where

Σtk=1Ntk{i[t]:ai=k}xixi\Sigma_t^k = \frac{1}{N_t^k} \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top

is the empirical covariance matrix of the contexts (assuming that the context has mean zero). That is, the learner is encouraged to choose arms when xtx_t is not aligned with the data seen so far, or if arm kk has not been explored much and so NtkN_t^k is small.

We can now substitute these quantities into UCB to get the LinUCB algorithm:

class LinUCBPseudocode(Agent):
     def __init__(
         self, K: int, T: int, D: int, lam: float, get_c: Callable[[int], float]
     ):
        super().__init__(K, T)
        self.get_c = get_c
        # One ridge-regularized design matrix, response vector, and weight vector per arm;
        # lam * I keeps A invertible before any data has been observed.
        self.A = np.array([lam * np.eye(D) for _ in range(K)])
        self.targets = np.zeros((K, D))
        self.w = np.zeros((K, D))

     def update_history(self, context: Float[Array, " D"], arm: int, reward: int):
         self.A[arm] += np.outer(context, context)
         self.targets[arm] += context * reward
        self.w[arm] = np.linalg.solve(self.A[arm], self.targets[arm])

ctc_t is similar to the log(2t/δ)\log (2t/\delta') term of UCB: It controls the width of the confidence interval. Here, we treat it as a tunable parameter, though in a theoretical analysis, it would depend on AtkA_t^k and the probability δ with which the bound holds.
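
To make the decision rule concrete, here is a hypothetical standalone scoring function (an illustration of the formula above, not the course’s reference implementation) that computes the linear UCB score for every arm given a context x and a confidence width c:

def lin_ucb_scores(agent: LinUCBPseudocode, x: Float[Array, " D"], c: float) -> Float[Array, " K"]:
    # Upper confidence bound for each arm k: x^T w_k + c * sqrt(x^T (A_k)^{-1} x).
    bonus = np.array([np.sqrt(x @ np.linalg.solve(agent.A[k], x)) for k in range(agent.K)])
    return agent.w @ x + c * bonus

The chosen arm would then be random_argmax(lin_ucb_scores(agent, x_t, c_t)).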

Using similar tools for UCB, we can also prove an O~(T)\tilde{O}(\sqrt{T}) regret bound. The full details of the analysis can be found in Section 3 of Agarwal et al. (2022).

3.9Summary

In this chapter, we explored the multi-armed bandit setting for analyzing sequential decision-making in an unknown environment.

References
  1. Vershynin, R. (2018). High-Dimensional Probability: An Introduction with Applications in Data Science. Cambridge University Press.
  2. Lai, T. L., & Robbins, H. (1985). Asymptotically Efficient Adaptive Allocation Rules. Advances in Applied Mathematics, 6(1), 4–22. 10.1016/0196-8858(85)90002-8
  3. Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). Reinforcement Learning: Theory and Algorithms.
\ No newline at end of file diff --git a/bandits.json b/bandits.json index 69cbce9..69af5b0 100644 --- a/bandits.json +++ b/bandits.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"cb8437494713e13080ce9e296ca5fbb4d04ebda213c523132d19db6324b795e6","slug":"bandits","location":"/bandits.md","dependencies":[],"frontmatter":{"title":"3 Multi-Armed Bandits","numbering":{"all":{"enabled":true},"enumerator":{"template":"3.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"bandits.md","url":"/build/bandits-edc5c0bbc4c299ec710273a0eb78717a.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"nWSeY0x6gC"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"3.1","key":"GUljouEZrf"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":21,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"GB2yISHlNa"},{"type":"strong","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"multi-armed bandits","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"EPmCrARmmc"}],"key":"Eoc4K7asrv"},{"type":"text","value":" (MAB) setting is a simple setting for studying the basic challenges of sequential decision-making.\nIn this setting, an agent repeatedly chooses from a fixed set of actions, called ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"ZDdZGnGukc"},{"type":"strong","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"arms","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"mHZbbwiqVp"}],"key":"wGvcOKRcV8"},{"type":"text","value":", each of which has an associated reward distribution. 
The agent’s goal is to maximize the total reward it receives over some time period.","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"CxqQJZjon6"}],"key":"HLoKqUVs3S"},{"type":"comment","value":" \n| States | Actions | Rewards |\n| :----: | :-----: | :---------------------------------: |\n| None | Finite | $\\mathcal{A} \\to \\triangle([0, 1])$ |\n","key":"aPZlUHuv99"},{"type":"paragraph","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"In particular, we’ll spend a lot of time discussing the ","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"WM2RGUd4Ue"},{"type":"strong","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Exploration-Exploitation Tradeoff","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"NuKWfSpfjD"}],"key":"rjO4ypYqUm"},{"type":"text","value":": should the agent choose new actions to learn more about the environment, or should it choose actions that it already knows to be good?","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"mg04WyXOQa"}],"key":"yiSo3Nfsy8"},{"type":"proof","kind":"example","label":"advertising","identifier":"advertising","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Online advertising","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"bCXq5PqUHS"}],"key":"EZSY9N9EVF"},{"type":"paragraph","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"Let’s suppose you, the agent, are an advertising company. You have ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"aoP6RJaHOI"},{"type":"inlineMath","value":"K","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"KKK","key":"VfyuHVwY6q"},{"type":"text","value":" different ads that you can show to users; For concreteness, let’s suppose there’s just a single user. You receive ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"dun5C6pUbw"},{"type":"text","value":"1","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"G8yjy3ECU5"},{"type":"text","value":" reward if the user clicks the ad, and ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"ZEPJhspxM1"},{"type":"text","value":"0","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"KHR196RuF7"},{"type":"text","value":" otherwise. Thus, the unknown ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"R3G9r0xz8c"},{"type":"emphasis","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"reward distribution","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"TS6QIkuAzL"}],"key":"K1zZ9GDvFh"},{"type":"text","value":" associated to each ad is a Bernoulli distribution defined by the probability that the user clicks on the ad. 
Your goal is to maximize the total number of clicks by the user.","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"ZO8G9DxDM2"}],"key":"Ndnxc9VBtZ"}],"enumerator":"3.1","html_id":"advertising","key":"Ss5xrFJ7ZY"},{"type":"proof","kind":"example","label":"clinical_trials","identifier":"clinical_trials","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Clinical trials","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"key":"gJyCZT7ymr"}],"key":"fEfTnS7Muv"},{"type":"paragraph","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"Suppose you’re a pharmaceutical company, and you’re testing a new drug. You have ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"axjjzNWrJa"},{"type":"inlineMath","value":"K","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"html":"KKK","key":"xzRnT8w3TZ"},{"type":"text","value":" different dosages of the drug that you can administer to patients. You receive ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"Ib4HaHOZjI"},{"type":"text","value":"1","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"YjQxOtv8aL"},{"type":"text","value":" reward if the patient recovers, and ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"qHV3meXXOW"},{"type":"text","value":"0","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"lFnKvx4wBa"},{"type":"text","value":" otherwise. Thus, the unknown ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"DNPeiBheaW"},{"type":"emphasis","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"reward distribution","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"AOZg3VWa3Q"}],"key":"mSAR5f3nkz"},{"type":"text","value":" associated to each dosage is a Bernoulli distribution defined by the probability that the patient recovers. Your goal is to maximize the total number of patients that recover.","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"wpusxwhV36"}],"key":"HrzzBQKOMG"}],"enumerator":"3.2","html_id":"clinical-trials","key":"qSFOf9nzGd"},{"type":"paragraph","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"children":[{"type":"text","value":"In this chapter, we will introduce the multi-armed bandits setting, and discuss some of the challenges that arise when trying to solve problems in this setting. 
We will also introduce some of the key concepts that we will use throughout the book, such as regret and exploration-exploitation tradeoffs.","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"key":"pqRvfB6puo"}],"key":"UtQqqz94w7"}],"key":"E4D8R3N1DD"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from jaxtyping import Float, Array\nimport numpy as np\nimport latexify\nfrom typing import Callable, Union\nimport matplotlib.pyplot as plt\n\nimport solutions.bandits as solutions\n\nnp.random.seed(184)\n\ndef random_argmax(ary: Array) -> int:\n \"\"\"Take an argmax and randomize between ties.\"\"\"\n max_idx = np.flatnonzero(ary == ary.max())\n return np.random.choice(max_idx).item()\n\n\n# used as decorator\nlatex = latexify.algorithmic(\n prefixes={\"mab\"},\n identifiers={\"arm\": \"a_t\", \"reward\": \"r\", \"means\": \"mu\"},\n use_math_symbols=True,\n escape_underscores=False,\n)","key":"GFHLFuZ7cC"},{"type":"output","id":"unLuOsRZ9M9HFnMHyjSZb","data":[],"key":"eX2DIfu9Wz"}],"data":{},"key":"CA3mKlHPID"},{"type":"block","position":{"start":{"line":72,"column":1},"end":{"line":72,"column":1}},"children":[{"type":"proof","kind":"remark","label":"multi-armed","identifier":"multi-armed","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Namesake","position":{"start":{"line":74,"column":1},"end":{"line":74,"column":1}},"key":"TUQ0THHfEI"}],"key":"IADLnxr9DT"},{"type":"paragraph","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"children":[{"type":"text","value":"The name “multi-armed bandits” comes from slot machines in casinos, which are often called “one-armed bandits” since they have one arm (the lever) and take money from the player.","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"phHGMkrwm7"}],"key":"dihXzZjlli"}],"enumerator":"3.1","html_id":"multi-armed","key":"DM1Ad47T3W"},{"type":"paragraph","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"hTFWm0J4kP"},{"type":"inlineMath","value":"K","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"KKK","key":"qJDrkAAVK7"},{"type":"text","value":" denote the number of arms. We’ll label them ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"x2IDbGZ9Gy"},{"type":"inlineMath","value":"0, \\dots, K-1","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"0,,K10, \\dots, K-10,,K1","key":"ufEeAl2WF3"},{"type":"text","value":" and use ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"Kajx0LhIHT"},{"type":"emphasis","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"superscripts","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"o8VE0Y5pVZ"}],"key":"WIkMatWtnL"},{"type":"text","value":" to indicate the arm index; since we seldom need to raise a number to a power, this won’t cause much confusion. 
In this chapter, we’ll consider the ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"acjvZdz4ea"},{"type":"strong","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"Bernoulli bandit","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"TZgZO9lqgC"}],"key":"e6NuKE2iNX"},{"type":"text","value":" setting from the examples above, where arm ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"GhMC6ziu7Z"},{"type":"inlineMath","value":"k","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"kkk","key":"SEAEpIJgv4"},{"type":"text","value":" either returns reward ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"kRPc8GbEfA"},{"type":"text","value":"1","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"iLeyoOknLH"},{"type":"text","value":" with probability ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"aAsP3KWItX"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"μk\\mu^kμk","key":"lso4pijEQ7"},{"type":"text","value":" or ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"iOOm5xXlf8"},{"type":"text","value":"0","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"l5Awnpgk7U"},{"type":"text","value":" otherwise. The agent gets to pull an arm ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"YNX6wZD4yJ"},{"type":"inlineMath","value":"T","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"TTT","key":"K5MloImxKQ"},{"type":"text","value":" times in total. 
We can formalize the Bernoulli bandit in the following Python code:","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"u9NkZUeoqb"}],"key":"TTvMZXmaVG"}],"key":"ve4kzOb67A"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class MAB:\n \"\"\"\n The Bernoulli multi-armed bandit environment.\n\n :param means: the means (success probabilities) of the reward distributions for each arm\n :param T: the time horizon\n \"\"\"\n\n def __init__(self, means: Float[Array, \" K\"], T: int):\n assert all(0 <= p <= 1 for p in means)\n self.means = means\n self.T = T\n self.K = self.means.size\n self.best_arm = random_argmax(self.means)\n\n def pull(self, k: int) -> int:\n \"\"\"Pull the `k`-th arm and sample from its (Bernoulli) reward distribution.\"\"\"\n reward = np.random.rand() < self.means[k].item()\n return +reward","key":"o0SYIBcrrb"},{"type":"output","id":"PXfSw9Q5kCjIAXu2hWz8U","data":[],"key":"BoxeOqKpcq"}],"data":{},"key":"GBl3Yuqx8A"},{"type":"block","children":[],"key":"X4HsQlXVlr"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"mab = MAB(means=np.array([0.1, 0.8, 0.4]), T=100)","key":"zHSCvDg7nu"},{"type":"output","id":"FjqH08-BJrKWcPaTfGNx8","data":[],"key":"Nrose8kX9g"}],"data":{},"key":"YitZpqIgG9"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":108,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"In pseudocode, the agent’s interaction with the MAB environment can be\ndescribed by the following process:","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"HY9b0uPTib"}],"key":"DgDDNfUClq"}],"key":"JSvpFCkUuU"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"@latex\ndef mab_loop(mab: MAB, agent: \"Agent\") -> int:\n for t in range(mab.T):\n arm = agent.choose_arm() # in 0, ..., K-1\n reward = mab.pull(arm)\n agent.update_history(arm, reward)\n\n\nmab_loop","key":"rDDcgp0PLI"},{"type":"output","id":"MrL1osEepjUJwt8n8Q2zi","data":[{"output_type":"execute_result","execution_count":4,"metadata":{},"data":{"text/plain":{"content":"","content_type":"text/plain"},"text/latex":{"content":"$ \\begin{array}{l} \\mathbf{function} \\ \\mathrm{mab\\_loop}(\\mathrm{mab}, \\mathrm{agent}) \\\\ \\hspace{1em} \\mathbf{for} \\ t \\in \\mathrm{range} \\mathopen{}\\left( T \\mathclose{}\\right) \\ \\mathbf{do} \\\\ \\hspace{2em} \\mathrm{a\\_t} \\gets \\mathrm{agent}.\\mathrm{choose\\_arm} \\mathopen{}\\left( \\mathclose{}\\right) \\\\ \\hspace{2em} r \\gets \\mathrm{pull} \\mathopen{}\\left( \\mathrm{a\\_t} \\mathclose{}\\right) \\\\ \\hspace{2em} \\mathrm{agent}.\\mathrm{update\\_history} \\mathopen{}\\left( \\mathrm{a\\_t}, r \\mathclose{}\\right) \\\\ \\hspace{1em} \\mathbf{end \\ for} \\\\ \\mathbf{end \\ function} \\end{array} $","content_type":"text/latex"}}}],"key":"WHSYzxuOVn"}],"data":{},"key":"ug3WYMcV6N"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"dtTOOAZNUY"},{"type":"inlineCode","value":"Agent","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"JdeJrxJBh6"},{"type":"text","value":" class stores the pull history and uses it to decide which arm to pull next. 
Since we are working with Bernoulli bandits, we can summarize the pull history concisely in a ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"p0W1LVF6Wj"},{"type":"inlineMath","value":"\\mathbb{N}^{K \\times 2}","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"html":"NK×2\\mathbb{N}^{K \\times 2}NK×2","key":"A1pya8l9Es"},{"type":"text","value":" array.","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"FC5uzHudiG"}],"key":"kz2o6tPrnK"}],"key":"VqnxzViznl"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class Agent:\n def __init__(self, K: int, T: int):\n \"\"\"The MAB agent that decides how to choose an arm given the past history.\"\"\"\n self.K = K\n self.T = T\n self.rewards = [] # for plotting\n self.choices = []\n self.history = np.zeros((K, 2), dtype=int)\n\n def choose_arm(self) -> int:\n \"\"\"Choose an arm of the MAB. Algorithm-specific.\"\"\"\n ...\n\n def count(self) -> int:\n \"\"\"The number of pulls made. Also the current step index.\"\"\"\n return len(self.rewards)\n\n def update_history(self, arm: int, reward: int):\n self.rewards.append(reward)\n self.choices.append(arm)\n self.history[arm, reward] += 1","key":"ncTznhLvDg"},{"type":"output","id":"IdwxoNojJm1q5Q0NkJAAz","data":[],"key":"NF5OsFs3dq"}],"data":{},"key":"jYH6CCA8vw"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":149,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"What’s the ","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"QIQLCWggzI"},{"type":"emphasis","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"optimal","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"bDqoxub2Zr"}],"key":"wkKhDkZZI2"},{"type":"text","value":" strategy for the agent, i.e. the one that achieves\nthe highest expected reward? 
Convince yourself that the agent should try\nto always pull the arm with the highest expected reward:","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"ZTXDuZDufz"}],"key":"SQYCTWILXW"},{"type":"math","value":"\\mu^\\star := \\max_{k \\in [K]} \\mu^k.","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"μ:=maxk[K]μk.\\mu^\\star := \\max_{k \\in [K]} \\mu^k.μ:=k[K]maxμk.","enumerator":"3.1","key":"VUmZeIJgzs"},{"type":"paragraph","position":{"start":{"line":155,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"text","value":"The goal, then, can be rephrased as to minimize the ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"QCqHYKFo3f"},{"type":"strong","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"text","value":"regret","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"wUDPqTnHJP"}],"key":"hZcXEAb6Pa"},{"type":"text","value":", defined\nbelow:","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"HV9YEwEfe1"}],"key":"UGt4tHskML"},{"type":"proof","kind":"definition","label":"regret","identifier":"regret","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Regret","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"key":"BPPMSSBy3R"}],"key":"u8tx3IlKgB"},{"type":"paragraph","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"The agent’s ","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"GogLfxwkG7"},{"type":"strong","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"regret","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"phaxTuWurx"}],"key":"S01Hl1tlTW"},{"type":"text","value":" after ","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"YsfC2I4Ob1"},{"type":"inlineMath","value":"T","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"html":"TTT","key":"MByZ0RVTJA"},{"type":"text","value":" timesteps is defined as","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"iP2Rstx9OK"}],"key":"T5OFimXKVf"},{"type":"math","value":"\\text{Regret}_T := \\sum_{t=0}^{T-1} \\mu^\\star - \\mu^{a_t}.","position":{"start":{"line":163,"column":1},"end":{"line":165,"column":1}},"html":"RegretT:=t=0T1μμat.\\text{Regret}_T := \\sum_{t=0}^{T-1} \\mu^\\star - \\mu^{a_t}.RegretT:=t=0T1μμat.","enumerator":"3.2","key":"itXINkXY8Y"}],"enumerator":"3.1","html_id":"regret","key":"VWqFsTMAoe"}],"key":"t3fGpB8EEU"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def regret_per_step(mab: MAB, agent: Agent):\n \"\"\"Get the difference from the average reward of the optimal arm. 
The sum of these is the regret.\"\"\"\n return [mab.means[mab.best_arm] - mab.means[arm] for arm in agent.choices]","key":"jvZ7j2E6eN"},{"type":"output","id":"yZAdfCijgwECGHGhITuM0","data":[],"key":"rJKniOdKkd"}],"data":{},"key":"xDih1GYShA"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":174,"column":1},"end":{"line":178,"column":1}},"children":[{"type":"text","value":"Note that this depends on the ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"NWWmhmqxTh"},{"type":"emphasis","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"true means","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"ZFMkHodDg4"}],"key":"gCZcJmXUXC"},{"type":"text","value":" of the pulled arms, ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"KBWfPcknt7"},{"type":"emphasis","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"lUQIyqrHfC"}],"key":"q4rEVxkomJ"},{"type":"text","value":" the actual\nobserved rewards.\nWe typically think of this as a random variable where\nthe randomness comes from the agent’s strategy (i.e. the sequence of\nactions ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"J5GHEb16Fd"},{"type":"inlineMath","value":"a_0, \\dots, a_{T-1}","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"html":"a0,,aT1a_0, \\dots, a_{T-1}a0,,aT1","key":"NiygrE8NKp"},{"type":"text","value":").","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"lsathjhhSW"}],"key":"CxNab0sk46"},{"type":"paragraph","position":{"start":{"line":180,"column":1},"end":{"line":181,"column":1}},"children":[{"type":"text","value":"Throughout the chapter, we will try to upper bound the regret of various\nalgorithms in two different senses:","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"ROA7i8HF7g"}],"key":"HPTy45MZ60"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":183,"column":1},"end":{"line":188,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":183,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":184,"column":1}},"children":[{"type":"text","value":"Upper bound the ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"m9bypTaZcu"},{"type":"emphasis","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"expected regret,","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"aNIJ10yEYF"}],"key":"KitGlFR1zQ"},{"type":"text","value":" i.e. 
show\n","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"L2WQhk5Ycc"},{"type":"inlineMath","value":"\\E[\\text{Regret}_T] \\le M_T","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"html":"E[RegretT]MT\\E[\\text{Regret}_T] \\le M_TE[RegretT]MT","key":"nRIrtgavAD"},{"type":"text","value":".","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"a6a9tI9CHr"}],"key":"oxfbyClNDV"}],"key":"Dmz5OxGM2L"},{"type":"listItem","spread":true,"position":{"start":{"line":186,"column":1},"end":{"line":188,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":186,"column":1},"end":{"line":187,"column":1}},"children":[{"type":"text","value":"Find a ","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"gX01am16OP"},{"type":"emphasis","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"children":[{"type":"text","value":"high-probability","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"LbzlYE8lIq"}],"key":"Ooi8SQ4UyU"},{"type":"text","value":" upper bound on the regret, i.e. show\n","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"AD0R1ng8RK"},{"type":"inlineMath","value":"\\pr(\\text{Regret}_T \\le M_{T, \\delta}) \\ge 1-\\delta","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"html":"P(RegretTMT,δ)1δ\\pr(\\text{Regret}_T \\le M_{T, \\delta}) \\ge 1-\\deltaP(RegretTMT,δ)1δ","key":"APh5ycwyX4"},{"type":"text","value":".","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"fwUw4rtivX"}],"key":"AshKXLm1L2"}],"key":"a8D7yYMDix"}],"key":"V9b6qKFrGC"},{"type":"paragraph","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"Note that these two different approaches say very different things about the regret. The first approach says that the ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"cocNsYnSS8"},{"type":"emphasis","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"average","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"mAqMIiRc4f"}],"key":"Wt2KJh7bh9"},{"type":"text","value":" regret is at most ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"kHrt4J0MGk"},{"type":"inlineMath","value":"M_T","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"html":"MTM_TMT","key":"hnZZjQSfoK"},{"type":"text","value":". However, the agent might still achieve higher regret on many runs. The second approach says that, ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"dFAwtNdjCD"},{"type":"emphasis","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"with high probability","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"KjpVxUEcEi"}],"key":"zGvG2ooJWq"},{"type":"text","value":", the agent will achieve regret at most ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"lujKcetkop"},{"type":"inlineMath","value":"M_{T, \\delta}","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"html":"MT,δM_{T, \\delta}MT,δ","key":"pDynFUe6Nb"},{"type":"text","value":". 
However, it doesn’t say anything about the regret in the remaining ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"opr2P9eN6P"},{"type":"text","value":"δ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"x3fhyquJlr"},{"type":"text","value":" fraction of runs, which might be arbitrarily high.","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"UAdKdp9qhc"}],"key":"nOBHywcfFA"},{"type":"paragraph","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"We’d like to achieve ","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"Qgseuiw2tR"},{"type":"strong","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"sublinear regret","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"qJJPQBc4yV"}],"key":"TQdFmKq5XL"},{"type":"text","value":" in expectation, i.e. ","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"bY0QDcq6QA"},{"type":"inlineMath","value":"\\E[\\text{Regret}_T] = o(T)","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"html":"E[RegretT]=o(T)\\E[\\text{Regret}_T] = o(T)E[RegretT]=o(T)","key":"T8Kk1Uey74"},{"type":"text","value":". That is, as we learn more about the environment, we’d like to be able to exploit that knowledge to take the optimal arm as often as possible.","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"L9ZuAnESqS"}],"key":"CetsxQ8MDf"},{"type":"paragraph","position":{"start":{"line":193,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"The rest of the chapter comprises a series of increasingly sophisticated\nMAB algorithms.","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"key":"LnVj6HBJXd"}],"key":"VzI4wjbjf1"}],"key":"opKBiFUd1n"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def plot_strategy(mab: MAB, agent: Agent):\n plt.figure(figsize=(10, 6))\n\n # plot reward and cumulative regret\n plt.plot(np.arange(mab.T), np.cumsum(agent.rewards), label=\"reward\")\n cum_regret = np.cumsum(regret_per_step(mab, agent))\n plt.plot(np.arange(mab.T), cum_regret, label=\"cumulative regret\")\n\n # draw colored circles for arm choices\n colors = [\"red\", \"green\", \"blue\"]\n color_array = [colors[k] for k in agent.choices]\n plt.scatter(np.arange(mab.T), np.zeros(mab.T), c=color_array, label=\"arm\")\n\n # labels and title\n plt.xlabel(\"timestep\")\n plt.legend()\n plt.title(f\"{agent.__class__.__name__} reward and regret\")\n plt.show()","visibility":"hide","key":"EG846uP9n9"},{"type":"output","id":"qNqZ7SHOwlBeuud3V2_V5","data":[],"visibility":"show","key":"gynNpomIRr"}],"data":{"tags":[]},"visibility":"show","key":"J75CkM3xtM"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"children":[{"type":"text","value":"Pure exploration (random guessing)","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"key":"l62w0ze1Co"}],"identifier":"pure-exploration-random-guessing","label":"Pure exploration (random 
guessing)","html_id":"pure-exploration-random-guessing","implicit":true,"enumerator":"3.2","key":"JHZSMmguhk"},{"type":"paragraph","position":{"start":{"line":221,"column":1},"end":{"line":222,"column":1}},"children":[{"type":"text","value":"A trivial strategy is to always choose arms at random (i.e. “pure\nexploration”).","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"tooJwPmUvt"}],"key":"R7t4HI4jUp"}],"key":"QI3SmAHpLS"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class PureExploration(Agent):\n def choose_arm(self):\n \"\"\"Choose an arm uniformly at random.\"\"\"\n return solutions.pure_exploration_choose_arm(self)","identifier":"pure_exploration-code","enumerator":"3.1","html_id":"pure-exploration-code","key":"IddyJpGmXh"},{"type":"output","id":"1S6BJkONUjDFzNcNE__-s","data":[],"identifier":"pure_exploration-output","enumerator":"3.1","html_id":"pure-exploration-output","key":"e9OdCKoxgX"}],"data":{},"label":"pure_exploration","identifier":"pure_exploration","enumerator":"3.1","html_id":"pure-exploration","key":"XfWdkvRCwS"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"children":[{"type":"text","value":"Note that","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"RrgpIXnx2u"}],"key":"DlRDVTnGkb"},{"type":"math","value":"\\E_{a_t \\sim \\text{Unif}([K])}[\\mu^{a_t}] = \\bar \\mu = \\frac{1}{K} \\sum_{k=1}^K \\mu^k","position":{"start":{"line":235,"column":1},"end":{"line":237,"column":1}},"html":"EatUnif([K])[μat]=μˉ=1Kk=1Kμk\\E_{a_t \\sim \\text{Unif}([K])}[\\mu^{a_t}] = \\bar \\mu = \\frac{1}{K} \\sum_{k=1}^K \\mu^kEatUnif([K])[μat]=μˉ=K1k=1Kμk","enumerator":"3.3","key":"yac5ujYbVe"},{"type":"paragraph","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"children":[{"type":"text","value":"so the expected regret is simply","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"key":"FotkWHF6kO"}],"key":"yDUVkS1Dx8"},{"type":"math","value":"\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\sum_{t=0}^{T-1} \\E[\\mu^\\star - \\mu^{a_t}] \\\\\n &= T (\\mu^\\star - \\bar \\mu) > 0.\n\\end{aligned}","position":{"start":{"line":241,"column":1},"end":{"line":246,"column":1}},"html":"E[RegretT]=t=0T1E[μμat]=T(μμˉ)>0.\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\sum_{t=0}^{T-1} \\E[\\mu^\\star - \\mu^{a_t}] \\\\\n &= T (\\mu^\\star - \\bar \\mu) > 0.\n\\end{aligned}E[RegretT]=t=0T1E[μμat]=T(μμˉ)>0.","enumerator":"3.4","key":"hThbtvbnV3"},{"type":"paragraph","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"This scales as ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"fUCfFQPYS1"},{"type":"inlineMath","value":"\\Theta(T)","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"Θ(T)\\Theta(T)Θ(T)","key":"o7layOA1GV"},{"type":"text","value":", i.e. 
","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"Sax1MHy8JO"},{"type":"emphasis","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"xNb6GlYNF0"}],"key":"eCDAYoFmPQ"},{"type":"text","value":" in the number of timesteps ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"W7pdeFTRyl"},{"type":"inlineMath","value":"T","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"TTT","key":"dJtzvCMWcA"},{"type":"text","value":". There’s no learning here: the agent doesn’t use any information about the environment to improve its strategy. You can see that the distribution over its arm choices always appears “(uniformly) random”.","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"ffca0nvUn6"}],"key":"Ww5s32J9xi"}],"key":"vwuGV6EIQy"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = PureExploration(mab.K, mab.T)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"mTwFP24Ue3"},{"type":"output","id":"U6lrNi3FYZONd1LZaXEmk","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"e018a4b689feff2c40f2483432d7c76f","path":"/build/e018a4b689feff2c40f2483432d7c76f.png"}}}],"key":"dl2Sh3mqRw"}],"data":{},"key":"ic1R0xd61w"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"text","value":"Pure greedy","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"wjIWkz1loC"}],"identifier":"pure-greedy","label":"Pure greedy","html_id":"pure-greedy","implicit":true,"enumerator":"3.3","key":"aa8mEPTevJ"},{"type":"paragraph","position":{"start":{"line":258,"column":1},"end":{"line":260,"column":1}},"children":[{"type":"text","value":"How might we improve on pure exploration? Instead, we could try each arm\nonce, and then commit to the one with the highest observed reward. We’ll\ncall this the ","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"CXTBjo7Dm1"},{"type":"strong","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"pure greedy","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"DbikD9Pn1m"}],"key":"ENrZbDb36n"},{"type":"text","value":" strategy.","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"AiN0qTOqZD"}],"key":"NA6r7KXWei"}],"key":"wmMXTJbWGT"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class PureGreedy(Agent):\n def choose_arm(self):\n \"\"\"Choose the arm with the highest observed reward on its first pull.\"\"\"\n return solutions.pure_greedy_choose_arm(self)","identifier":"pure_greedy-code","enumerator":"3.2","html_id":"pure-greedy-code","key":"fnmWsFQmSQ"},{"type":"output","id":"bPlu65MxQqmTMNe2SFOrY","data":[],"identifier":"pure_greedy-output","enumerator":"3.2","html_id":"pure-greedy-output","key":"PArUyguBQS"}],"data":{},"label":"pure_greedy","identifier":"pure_greedy","enumerator":"3.2","html_id":"pure-greedy","key":"Sjlon2Xl7D"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":271,"column":1},"end":{"line":274,"column":1}},"children":[{"type":"text","value":"Note we’ve used superscripts ","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"P0qCa84XUt"},{"type":"inlineMath","value":"r^k","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"html":"rkr^krk","key":"nurJeuHj7V"},{"type":"text","value":" during the exploration phase to\nindicate that we observe exactly one reward for each arm. 
Then we use\nsubscripts ","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"Lt5MwTyVsC"},{"type":"inlineMath","value":"r_t","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"html":"rtr_trt","key":"bXdsWw4IYj"},{"type":"text","value":" during the exploitation phase to indicate that we\nobserve a sequence of rewards from the chosen greedy arm ","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"azCOU7H8wt"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"html":"k^\\hat kk^","key":"T98bTQxjUq"},{"type":"text","value":".","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"vRueTVRUSE"}],"key":"xskttodLJl"},{"type":"paragraph","position":{"start":{"line":276,"column":1},"end":{"line":279,"column":1}},"children":[{"type":"text","value":"How does the expected regret of this strategy compare to that of pure\nexploration? We’ll do a more general analysis in the following section.\nNow, for intuition, suppose there’s just ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"Zzm20xg2FO"},{"type":"inlineMath","value":"K=2","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"html":"K=2K=2K=2","key":"FhXTBEAcQP"},{"type":"text","value":" arms, with Bernoulli\nreward distributions with means ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"HCK6gqPuLK"},{"type":"inlineMath","value":"\\mu^0 > \\mu^1","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"html":"μ0>μ1\\mu^0 > \\mu^1μ0>μ1","key":"EpUpMJ7HP9"},{"type":"text","value":".","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"QJdFEWkbMg"}],"key":"tuZcUzmdgM"},{"type":"paragraph","position":{"start":{"line":281,"column":1},"end":{"line":284,"column":1}},"children":[{"type":"text","value":"Let’s let ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"fGrtPP4Sdk"},{"type":"inlineMath","value":"r^0","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"r0r^0r0","key":"XfmrY1s9fn"},{"type":"text","value":" be the random reward from the first arm and ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"oIUVjfPKgw"},{"type":"inlineMath","value":"r^1","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"r1r^1r1","key":"Y0xU1taJWq"},{"type":"text","value":" be the\nrandom reward from the second. If ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"V7SFXUcqUv"},{"type":"inlineMath","value":"r^0 > r^1","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"r0>r1r^0 > r^1r0>r1","key":"eV5IAGNqEY"},{"type":"text","value":", then we achieve zero\nregret. Otherwise, we achieve regret ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"fNeOTnz8rl"},{"type":"inlineMath","value":"T(\\mu^0 - \\mu^1)","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"T(μ0μ1)T(\\mu^0 - \\mu^1)T(μ0μ1)","key":"A7x8fQcSTl"},{"type":"text","value":". 
Thus, the\nexpected regret is simply:","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"cXdjftRd8P"}],"key":"GvuSkOCbPv"},{"type":"math","value":"\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\pr(r^0 < r^1) \\cdot T(\\mu^0 - \\mu^1) + c \\\\\n &= (1 - \\mu^0) \\mu^1 \\cdot T(\\mu^0 - \\mu^1) + c\n\\end{aligned}","position":{"start":{"line":286,"column":1},"end":{"line":291,"column":1}},"html":"E[RegretT]=P(r0<r1)T(μ0μ1)+c=(1μ0)μ1T(μ0μ1)+c\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\pr(r^0 < r^1) \\cdot T(\\mu^0 - \\mu^1) + c \\\\\n &= (1 - \\mu^0) \\mu^1 \\cdot T(\\mu^0 - \\mu^1) + c\n\\end{aligned}E[RegretT]=P(r0<r1)T(μ0μ1)+c=(1μ0)μ1T(μ0μ1)+c","enumerator":"3.5","key":"ZgX2NbB2AZ"},{"type":"paragraph","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"children":[{"type":"text","value":"Which is still ","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"Tl2WqrfScj"},{"type":"inlineMath","value":"\\Theta(T)","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"html":"Θ(T)\\Theta(T)Θ(T)","key":"vbtUhPRFhj"},{"type":"text","value":", the same as pure exploration!","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"TzFAXhVKcW"}],"key":"wfDYNzK7ka"}],"key":"nIvQPxCncJ"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = PureGreedy(mab.K, mab.T)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"fOPybr5DjB"},{"type":"output","id":"tFQttZ4A4i6KhP5x7tiH4","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
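As a quick sanity check of the two-arm calculation above, we can estimate the pure greedy regret by simulation. This is our own illustrative sketch: the means, horizon, trial count, and the tie-breaking rule (toward arm 0) are arbitrary choices, and we only track the dominant term of the regret.

```python
import numpy as np

rng = np.random.default_rng(0)
mu0, mu1 = 0.7, 0.4      # Bernoulli means with mu0 > mu1
T, n_trials = 1_000, 10_000

total_regret = 0.0
for _ in range(n_trials):
    r0 = rng.binomial(1, mu0)          # one exploratory pull of arm 0
    r1 = rng.binomial(1, mu1)          # one exploratory pull of arm 1
    committed = 0 if r0 >= r1 else 1   # commit to the higher observed reward
    # The committed arm is pulled for the remaining T - 2 steps.
    total_regret += (T - 2) * (mu0 - (mu0, mu1)[committed])

print("empirical expected regret:", total_regret / n_trials)
print("predicted leading term:   ", (1 - mu0) * mu1 * T * (mu0 - mu1))
```

The two numbers should roughly agree, confirming that the expected regret of pure greedy grows linearly in the horizon.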
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"60449ce2034aedba8d659c77e97c9729","path":"/build/60449ce2034aedba8d659c77e97c9729.png"}}}],"key":"ynkZMg0YlM"}],"data":{},"key":"TBNJgdr5yL"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"The cumulative regret is a straight line because the regret only depends on the arms chosen and not the actual reward observed. In fact, if the greedy algorithm happens to get lucky on the first set of pulls, it may act entirely optimally for that episode! But its ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"Elu6Dqp54w"},{"type":"emphasis","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"average","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"iKc3fCEGff"}],"key":"F9Qn2SQ4T4"},{"type":"text","value":" regret is what measures its effectiveness.","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"bT1GewcZoY"}],"key":"IwEjeU3Ycs"}],"key":"FpEGhhUIJQ"},{"type":"block","position":{"start":{"line":303,"column":1},"end":{"line":303,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"children":[{"type":"text","value":"Explore-then-commit","position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"key":"yrN9U8YMdv"}],"label":"etc","identifier":"etc","html_id":"etc","enumerator":"3.4","key":"RLBWtnFZP4"},{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":309,"column":1}},"children":[{"type":"text","value":"We can improve the pure greedy algorithm as follows: let’s reduce the variance of the reward estimates by pulling each arm ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"aGjPnNY6Pr"},{"type":"inlineMath","value":"N_{\\text{explore}}> 1","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"Nexplore>1N_{\\text{explore}}> 1Nexplore>1","key":"SEXIU4wwPs"},{"type":"text","value":" times before committing. This is called the ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"E4mC336u6O"},{"type":"strong","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"explore-then-commit","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"ww5ESic92H"}],"key":"xIxnyeTCA8"},{"type":"text","value":" strategy. 
Note that the “pure greedy” strategy above is just the special case where\n","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"bGh239jBTw"},{"type":"inlineMath","value":"N_{\\text{explore}}= 1","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"Nexplore=1N_{\\text{explore}}= 1Nexplore=1","key":"vvPCYHaEeh"},{"type":"text","value":".","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"Y4WxZx0iyb"}],"key":"rrEkl3pGCn"}],"key":"sBzRvtB09g"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class ExploreThenCommit(Agent):\n def __init__(self, K: int, T: int, N_explore: int):\n super().__init__(K, T)\n self.N_explore = N_explore\n\n def choose_arm(self):\n return solutions.etc_choose_arm(self)","key":"Gg2cIObHOG"},{"type":"output","id":"zZ5SqXy4CJqspAAYekT4k","data":[],"key":"KXY9cc37Mp"}],"data":{},"key":"ZwqA6AokLN"},{"type":"block","children":[],"key":"qm46ncIJBy"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = ExploreThenCommit(mab.K, mab.T, mab.T // 15)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"cCEIgWiZKm"},{"type":"output","id":"83hjd2X7NUR4RdbV-7eZU","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
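The body of `solutions.etc_choose_arm` is not shown in these notes. For concreteness, here is one plausible, self-contained way to implement the explore-then-commit rule; the class name, attribute names, and round-robin exploration order are our own choices and need not match the hidden solution.

```python
import numpy as np

class ETCSketch:
    """Illustrative explore-then-commit agent (not the course's implementation)."""

    def __init__(self, K: int, N_explore: int):
        self.K = K
        self.N_explore = N_explore
        self.counts = np.zeros(K, dtype=int)   # pulls of each arm so far
        self.reward_sums = np.zeros(K)         # total reward from each arm
        self.t = 0

    def choose_arm(self) -> int:
        if self.t < self.N_explore * self.K:
            # Exploration phase: cycle through the arms round-robin.
            return self.t % self.K
        # Exploitation phase: commit to the arm with the highest sample mean.
        means = self.reward_sums / np.maximum(self.counts, 1)
        return int(np.argmax(means))

    def update(self, arm: int, reward: float) -> None:
        self.counts[arm] += 1
        self.reward_sums[arm] += reward
        self.t += 1
```

Setting `N_explore = 1` in this sketch recovers the pure greedy strategy.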
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"dde6263087532775cde0fb2de5a471cc","path":"/build/dde6263087532775cde0fb2de5a471cc.png"}}}],"key":"FWzid5dxLI"}],"data":{},"key":"KfUEkJN1Dl"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"children":[{"type":"text","value":"Notice that now, the graphs are much more consistent, and the algorithm finds the true optimal arm and sticks with it much more frequently. We would expect ETC to then have a better (i.e. lower) average regret. Can we prove this?","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"H9wWOeKfkJ"}],"key":"XwxEyLAURh"}],"key":"pNk3LEmWoA"},{"type":"block","position":{"start":{"line":329,"column":1},"end":{"line":329,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":332,"column":1},"end":{"line":332,"column":1}},"children":[{"type":"text","value":"ETC regret analysis","position":{"start":{"line":332,"column":1},"end":{"line":332,"column":1}},"key":"oeEyUS39dF"}],"label":"etc-regret-analysis","identifier":"etc-regret-analysis","html_id":"etc-regret-analysis","enumerator":"3.4.1","key":"yu5jADZxxo"},{"type":"paragraph","position":{"start":{"line":334,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"Let’s analyze the expected regret of the explore-then-commit strategy by splitting it up\ninto the exploration and exploitation phases.","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"dy7dMu5ab4"}],"key":"pgI94t26hT"},{"type":"heading","depth":4,"position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"Exploration phase.","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"LxEiXJLl8K"}],"identifier":"exploration-phase","label":"Exploration phase.","html_id":"exploration-phase","implicit":true,"enumerator":"3.4.1.1","key":"sZMeInJKz9"},{"type":"paragraph","position":{"start":{"line":339,"column":1},"end":{"line":341,"column":1}},"children":[{"type":"text","value":"This phase takes ","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"xfxobjeHxn"},{"type":"inlineMath","value":"N_{\\text{explore}}K","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"html":"NexploreKN_{\\text{explore}}KNexploreK","key":"guz1V4Uylz"},{"type":"text","value":" timesteps. 
Since at each step we\nincur at most ","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"YyVVAlJg5P"},{"type":"text","value":"1","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"m62xoqrTO0"},{"type":"text","value":" regret, the total regret is at most\n","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"YMPj9fIZWZ"},{"type":"inlineMath","value":"N_{\\text{explore}}K","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"html":"NexploreKN_{\\text{explore}}KNexploreK","key":"wT0DtzbSBj"},{"type":"text","value":".","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"sz7QyJw0JZ"}],"key":"A9a0t7btJ2"},{"type":"heading","depth":4,"position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"text","value":"Exploitation phase.","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"PYw6azBUur"}],"identifier":"exploitation-phase","label":"Exploitation phase.","html_id":"exploitation-phase","implicit":true,"enumerator":"3.4.1.2","key":"mDRDmSceoh"},{"type":"paragraph","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"children":[{"type":"text","value":"This will take a bit more effort. We’ll prove that for any total time ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"rfs2cJ8cZH"},{"type":"inlineMath","value":"T","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"TTT","key":"ON2v3eSKBN"},{"type":"text","value":", we can choose ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"S6fPJ1QtQR"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"Ie27CdlgVB"},{"type":"text","value":" such that with arbitrarily high probability, the regret is sublinear.","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"dxdDUKmtum"}],"key":"PQtNU9Uq6G"},{"type":"paragraph","position":{"start":{"line":347,"column":1},"end":{"line":348,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"key":"x5LhPkZ5Y0"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"html":"k^\\hat kk^","key":"hoAdEEzprt"},{"type":"text","value":" denote the arm chosen after the exploration phase. 
We know the regret from the\nexploitation phase is","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"key":"sDISPydKaT"}],"key":"wORFzZ3a82"},{"type":"math","value":"T_{\\text{exploit}} (\\mu^\\star - \\mu^{\\hat k}) \\qquad \\text{where} \\qquad T_{\\text{exploit}} := T - N_{\\text{explore}}K.","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"html":"Texploit(μμk^)whereTexploit:=TNexploreK.T_{\\text{exploit}} (\\mu^\\star - \\mu^{\\hat k}) \\qquad \\text{where} \\qquad T_{\\text{exploit}} := T - N_{\\text{explore}}K.Texploit(μμk^)whereTexploit:=TNexploreK.","enumerator":"3.6","key":"R0D86ImVCo"},{"type":"paragraph","position":{"start":{"line":352,"column":1},"end":{"line":353,"column":1}},"children":[{"type":"text","value":"So we’d like to bound ","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"NtqlmSeGqF"},{"type":"inlineMath","value":"\\mu^\\star - \\mu^{\\hat k} = o(1)","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"μμk^=o(1)\\mu^\\star - \\mu^{\\hat k} = o(1)μμk^=o(1)","key":"NviFPkgwvz"},{"type":"text","value":" (as a function\nof ","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"VAZMuju1uD"},{"type":"inlineMath","value":"T","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"TTT","key":"bgHXCiPB82"},{"type":"text","value":") in order to achieve sublinear regret. How can we do this?","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"ijIKU3PcnB"}],"key":"TfPxpkvcei"},{"type":"paragraph","position":{"start":{"line":355,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"text","value":"Let’s define ","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"VO7F6YnGUS"},{"type":"inlineMath","value":"\\Delta^k = \\hat \\mu^k - \\mu^k","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"html":"Δk=μ^kμk\\Delta^k = \\hat \\mu^k - \\mu^kΔk=μ^kμk","key":"J6sMNgIfdb"},{"type":"text","value":" to denote how far the mean\nestimate for arm ","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"L45Az9Tlah"},{"type":"inlineMath","value":"k","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"html":"kkk","key":"j3d6goXk7P"},{"type":"text","value":" is from the true mean. How can we bound this\nquantity? We’ll use the following useful inequality for i.i.d. bounded\nrandom variables:","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"C6py7gRI8C"}],"key":"R1bGHXehZh"},{"type":"proof","kind":"theorem","label":"hoeffding","identifier":"hoeffding","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Hoeffding’s inequality","position":{"start":{"line":360,"column":1},"end":{"line":360,"column":1}},"key":"ZoRJ91RljB"}],"key":"kmMziOzzSl"},{"type":"paragraph","position":{"start":{"line":363,"column":1},"end":{"line":365,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"hxMKCSdjfm"},{"type":"inlineMath","value":"X_0, \\dots, X_{n-1}","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"html":"X0,,Xn1X_0, \\dots, X_{n-1}X0,,Xn1","key":"oaFG5HjUcm"},{"type":"text","value":" be i.i.d. 
random variables with\n","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"a9ckb1HYoQ"},{"type":"inlineMath","value":"X_i \\in [0, 1]","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"html":"Xi[0,1]X_i \\in [0, 1]Xi[0,1]","key":"wK9Jqddl2C"},{"type":"text","value":" almost surely for each ","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"h07HGtIdcc"},{"type":"inlineMath","value":"i \\in [n]","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"html":"i[n]i \\in [n]i[n]","key":"JFF2ZSR6Mv"},{"type":"text","value":". Then for any\n","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"rm2udeY8nm"},{"type":"inlineMath","value":"\\delta > 0","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"html":"δ>0\\delta > 0δ>0","key":"viWFYLJVXO"},{"type":"text","value":",","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"J8e3m3ACp3"}],"key":"eExMc5V1KU"},{"type":"math","value":"\\pr\\left( \\left| \\frac{1}{n} \\sum_{i=1}^n (X_i - \\E[X_i]) \\right| > \\sqrt{\\frac{\\ln(2/\\delta)}{2n}} \\right) \\le \\delta.","position":{"start":{"line":367,"column":1},"end":{"line":367,"column":1}},"html":"P(1ni=1n(XiE[Xi])>ln(2/δ)2n)δ.\\pr\\left( \\left| \\frac{1}{n} \\sum_{i=1}^n (X_i - \\E[X_i]) \\right| > \\sqrt{\\frac{\\ln(2/\\delta)}{2n}} \\right) \\le \\delta.P(n1i=1n(XiE[Xi])>2nln(2/δ))δ.","enumerator":"3.7","key":"sfj6lPFibl"}],"enumerator":"3.1","html_id":"hoeffding","key":"G88ISqfVEo"},{"type":"paragraph","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"children":[{"type":"text","value":"The proof of this inequality is beyond the scope of this book. 
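The bound is easy to probe numerically. The following sketch draws Bernoulli(0.5) samples (an arbitrary choice of bounded distribution, as are n, δ, and the number of trials) and checks that the deviation probability is indeed at most δ:

```python
import numpy as np

rng = np.random.default_rng(0)
n, delta, n_trials = 50, 0.05, 100_000
threshold = np.sqrt(np.log(2 / delta) / (2 * n))

samples = rng.binomial(1, 0.5, size=(n_trials, n))   # X_i in [0, 1] with mean 0.5
deviations = np.abs(samples.mean(axis=1) - 0.5)
print("empirical P(deviation > threshold):", (deviations > threshold).mean())
print("Hoeffding guarantee: at most", delta)
```

In practice the empirical probability falls well below δ: Hoeffding's inequality is a worst-case bound over all distributions supported on [0, 1].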
See ","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"Tz9uyKNopI"},{"type":"cite","kind":"narrative","label":"vershynin_high-dimensional_2018","identifier":"vershynin_high-dimensional_2018","children":[{"type":"text","value":"Vershynin (2018)","key":"kyRqw0hyVJ"}],"enumerator":"1","key":"irSPTTY6aD"},{"type":"text","value":" Chapter 2.2.","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"jpyFBM8ewf"}],"key":"GJOWdDFDJR"},{"type":"paragraph","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"children":[{"type":"text","value":"We can apply this directly to the rewards for a given arm ","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"key":"qcerJkJDYq"},{"type":"inlineMath","value":"k","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"html":"kkk","key":"uM6unro1UY"},{"type":"text","value":", since the rewards from that arm are i.i.d.:","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"key":"fbhcvASxvJ"}],"key":"r3meg7hpIO"},{"type":"math","value":"\\pr\\left(|\\Delta^k | > \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) \\le \\delta.","label":"hoeffding-etc","identifier":"hoeffding-etc","html":"P(Δk>ln(2/δ)2Nexplore)δ.\\pr\\left(|\\Delta^k | > \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) \\le \\delta.P(Δk>2Nexploreln(2/δ))δ.","enumerator":"3.8","html_id":"hoeffding-etc","key":"xbSGLmHalO"},{"type":"paragraph","position":{"start":{"line":380,"column":1},"end":{"line":384,"column":1}},"children":[{"type":"text","value":"But note that we can’t apply this to arm ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"jcVQl8pGBQ"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"k^\\hat kk^","key":"FPKz2WFC3K"},{"type":"text","value":" directly since\n","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"RFT4uwY99C"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"k^\\hat kk^","key":"KAyAWRiUEO"},{"type":"text","value":" is itself a random variable. Instead, we need to “uniform-ize”\nthis bound across ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"kn5wYMmAIX"},{"type":"emphasis","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"text","value":"all","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"BtswJekHi2"}],"key":"bx89JXnNeB"},{"type":"text","value":" the arms, i.e. 
bound the error across all the\narms simultaneously, so that the resulting bound will apply ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"ypmZinRbw2"},{"type":"emphasis","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"text","value":"no matter\nwhat","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"dmkybx1kgm"}],"key":"uTJgvsd5RQ"},{"type":"text","value":" ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"CbOMAIrmkw"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"k^\\hat kk^","key":"Y8FogyitIJ"},{"type":"text","value":" “crystallizes” to.","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"VYaXekCCiO"}],"key":"IVCqO1CmjR"},{"type":"paragraph","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"TWStIFhm6K"},{"type":"strong","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"children":[{"type":"text","value":"union bound","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"EJwDLBPIdD"}],"key":"oir8Bf2g6k"},{"type":"text","value":" provides a simple way to do this:","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"tfLYXIoEPS"}],"key":"JkUnydWxl9"},{"type":"proof","kind":"theorem","label":"union_bound","identifier":"union_bound","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Union bound","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"a9ziuDAa7D"}],"key":"HPoIdkUaF2"},{"type":"paragraph","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"children":[{"type":"text","value":"Consider a set of events ","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"F6nseJ2DqY"},{"type":"inlineMath","value":"A_0, \\dots, A_{n-1}","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"html":"A0,,An1A_0, \\dots, A_{n-1}A0,,An1","key":"Jbo3hTAZfr"},{"type":"text","value":". Then","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"Kbq5xPquLp"}],"key":"hsgTL2WclF"},{"type":"math","value":"\\pr(\\exists i \\in [n]. A_i) \\le \\sum_{i=0}^{n-1} \\pr(A_i).","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"html":"P(i[n].Ai)i=0n1P(Ai).\\pr(\\exists i \\in [n]. 
A_i) \\le \\sum_{i=0}^{n-1} \\pr(A_i).P(i[n].Ai)i=0n1P(Ai).","enumerator":"3.9","key":"dzdDA2WMGA"},{"type":"paragraph","position":{"start":{"line":395,"column":1},"end":{"line":396,"column":1}},"children":[{"type":"text","value":"In\nparticular, if ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"a8LCgIA67k"},{"type":"inlineMath","value":"\\pr(A_i) \\ge 1 - \\delta","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"P(Ai)1δ\\pr(A_i) \\ge 1 - \\deltaP(Ai)1δ","key":"eAWjVn9dF7"},{"type":"text","value":" for each ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"BfP0dB08fL"},{"type":"inlineMath","value":"i \\in [n]","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"i[n]i \\in [n]i[n]","key":"Qc8TzEXdC1"},{"type":"text","value":", we have","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"CJsgQvgXXi"}],"key":"Av6ld36v5p"},{"type":"math","value":"\\pr(\\forall i \\in [n]. A_i) \\ge 1 - n \\delta.","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"P(i[n].Ai)1nδ.\\pr(\\forall i \\in [n]. A_i) \\ge 1 - n \\delta.P(i[n].Ai)1nδ.","enumerator":"3.10","key":"mDMfgoEQiH"}],"enumerator":"3.2","html_id":"union-bound","key":"REgQuxUbOG"},{"type":"paragraph","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"children":[{"type":"strong","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"key":"CtNmXOZqhf"}],"key":"jG3urHpctQ"},{"type":"text","value":" Prove the second statement above.","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"key":"VxPq2Fbowy"}],"key":"eO3CtZw1Ya"},{"type":"paragraph","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"Applying the union bound across the arms for the l.h.s. 
event of ","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"th7UFGalmm"},{"type":"crossReference","kind":"equation","identifier":"hoeffding-etc","label":"hoeffding-etc","children":[{"type":"text","value":"(","key":"c8GyvmSWNI"},{"type":"text","value":"3.8","key":"VL8nr9Zz7X"},{"type":"text","value":")","key":"rFdd7xlMeJ"}],"template":"(%s)","enumerator":"3.8","resolved":true,"html_id":"hoeffding-etc","key":"qOVYbI7zsA"},{"type":"text","value":", we have","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"mQMA9ri4eH"}],"key":"ELDVaLnzdZ"},{"type":"math","value":"\\begin{aligned}\n \\pr\\left( \\forall k \\in [K], |\\Delta^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) &\\ge 1-K\\delta\n\\end{aligned}","position":{"start":{"line":405,"column":1},"end":{"line":409,"column":1}},"html":"P(k[K],Δkln(2/δ)2Nexplore)1Kδ\\begin{aligned}\n \\pr\\left( \\forall k \\in [K], |\\Delta^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) &\\ge 1-K\\delta\n\\end{aligned}P(k[K],Δk2Nexploreln(2/δ))1","enumerator":"3.11","key":"FW1TYqqliE"},{"type":"paragraph","position":{"start":{"line":411,"column":1},"end":{"line":412,"column":1}},"children":[{"type":"text","value":"Then to apply this bound to ","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"j4QM9rEpz3"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"html":"k^\\hat kk^","key":"Ie7etmAvQL"},{"type":"text","value":" in particular, we\ncan apply the useful trick of “adding zero”:","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"P9mVpUzhVB"}],"key":"MSEQgT67mK"},{"type":"math","value":"\\begin{aligned}\n \\mu^{k^\\star} - \\mu^{\\hat k} &= \\mu^{k^\\star} - \\mu^{\\hat k} + (\\hat \\mu^{k^\\star} - \\hat \\mu^{k^\\star}) + (\\hat \\mu^{\\hat k} - \\hat \\mu^{\\hat k}) \\\\\n &= \\Delta^{\\hat k} - \\Delta^{k^*} + \\underbrace{(\\hat \\mu^{k^\\star} - \\hat \\mu^{\\hat k})}_{\\le 0 \\text{ by definition of } \\hat k} \\\\\n &\\le 2 \\sqrt{\\frac{\\ln(2K/\\delta')}{2N_{\\text{explore}}}} \\text{ with probability at least } 1-\\delta'\n\\end{aligned}","position":{"start":{"line":414,"column":1},"end":{"line":420,"column":1}},"html":"μkμk^=μkμk^+(μ^kμ^k)+(μ^k^μ^k^)=Δk^Δk+(μ^kμ^k^)0 by definition of k^2ln(2K/δ)2Nexplore with probability at least 1δ\\begin{aligned}\n \\mu^{k^\\star} - \\mu^{\\hat k} &= \\mu^{k^\\star} - \\mu^{\\hat k} + (\\hat \\mu^{k^\\star} - \\hat \\mu^{k^\\star}) + (\\hat \\mu^{\\hat k} - \\hat \\mu^{\\hat k}) \\\\\n &= \\Delta^{\\hat k} - \\Delta^{k^*} + \\underbrace{(\\hat \\mu^{k^\\star} - \\hat \\mu^{\\hat k})}_{\\le 0 \\text{ by definition of } \\hat k} \\\\\n &\\le 2 \\sqrt{\\frac{\\ln(2K/\\delta')}{2N_{\\text{explore}}}} \\text{ with probability at least } 1-\\delta'\n\\end{aligned}μkμk^=μkμk^+(μ^kμ^k)+(μ^k^μ^k^)=Δk^Δk+0 by definition of k^(μ^kμ^k^)22Nexploreln(2K/δ) with probability at least 1δ","enumerator":"3.12","key":"P4VQwIIUwR"},{"type":"paragraph","position":{"start":{"line":422,"column":1},"end":{"line":423,"column":1}},"children":[{"type":"text","value":"where we’ve set ","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"NuTq3aEvSf"},{"type":"inlineMath","value":"\\delta' = K\\delta","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"html":"δ=Kδ\\delta' = K\\deltaδ=","key":"eWCV3SkPIe"},{"type":"text","value":". 
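We can also check this gap bound empirically. In the sketch below, the arm means, N_explore, δ′, and the number of trials are arbitrary choices of ours; the point is simply that the committed arm exceeds the derived gap bound with frequency at most δ′.

```python
import numpy as np

rng = np.random.default_rng(0)
K, N_explore, delta_p, n_trials = 5, 100, 0.05, 20_000
mu = np.array([0.9, 0.8, 0.7, 0.6, 0.5])             # true Bernoulli means
bound = 2 * np.sqrt(np.log(2 * K / delta_p) / (2 * N_explore))

violations = 0
for _ in range(n_trials):
    mu_hat = rng.binomial(N_explore, mu) / N_explore  # sample mean of each arm
    k_hat = int(np.argmax(mu_hat))                    # arm we would commit to
    violations += (mu.max() - mu[k_hat]) > bound

print("empirical violation probability:", violations / n_trials, "(guaranteed ≤", delta_p, ")")
```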
Putting this all\ntogether, we’ve shown that, with probability ","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"y9YNdJTtJy"},{"type":"inlineMath","value":"1 - \\delta'","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"html":"1δ1 - \\delta'1δ","key":"aSFpuTtxrT"},{"type":"text","value":",","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"N82zhkpTXr"}],"key":"oTf1RtCmtK"},{"type":"math","value":"\\text{Regret}_T \\le N_{\\text{explore}}K + T_{\\text{exploit}} \\cdot \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}}}.","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"html":"RegretTNexploreK+Texploit2ln(2K/δ)Nexplore.\\text{Regret}_T \\le N_{\\text{explore}}K + T_{\\text{exploit}} \\cdot \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}}}.RegretTNexploreK+TexploitNexplore2ln(2K/δ).","enumerator":"3.13","key":"epYZsPhw9r"},{"type":"paragraph","position":{"start":{"line":427,"column":1},"end":{"line":430,"column":1}},"children":[{"type":"text","value":"Note that it suffices for ","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"uNqBEojYIQ"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"OpLj7k9EvT"},{"type":"text","value":" to be on the order of\n","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"YsYpf7WaTX"},{"type":"inlineMath","value":"\\sqrt{T}","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"html":"T\\sqrt{T}T","key":"bV44dDvpi2"},{"type":"text","value":" to achieve sublinear regret. In particular, we can find the\noptimal ","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"SItNT6y0Pg"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"JsRPqamD8p"},{"type":"text","value":" by setting the derivative of the r.h.s. 
to\nzero:","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"Tvmu2VlGuU"}],"key":"swDlBTLFRp"},{"type":"math","value":"\\begin{aligned}\n 0 &= K - T_{\\text{exploit}} \\cdot \\frac{1}{2} \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}^3}} \\\\\n N_{\\text{explore}}&= \\left( T_{\\text{exploit}} \\cdot \\frac{\\sqrt{\\ln(2K/\\delta')/2}}{K} \\right)^{2/3}\n\\end{aligned}","position":{"start":{"line":432,"column":1},"end":{"line":437,"column":1}},"html":"0=KTexploit122ln(2K/δ)Nexplore3Nexplore=(Texploitln(2K/δ)/2K)2/3\\begin{aligned}\n 0 &= K - T_{\\text{exploit}} \\cdot \\frac{1}{2} \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}^3}} \\\\\n N_{\\text{explore}}&= \\left( T_{\\text{exploit}} \\cdot \\frac{\\sqrt{\\ln(2K/\\delta')/2}}{K} \\right)^{2/3}\n\\end{aligned}0Nexplore=KTexploit21Nexplore32ln(2K/δ)=(TexploitKln(2K/δ)/2)2/3","enumerator":"3.14","key":"id5azWmM4v"},{"type":"paragraph","position":{"start":{"line":439,"column":1},"end":{"line":440,"column":1}},"children":[{"type":"text","value":"Plugging this into the expression for the regret, we\nhave (still with probability ","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"key":"RWzIEXCb5J"},{"type":"inlineMath","value":"1-\\delta'","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"html":"1δ1-\\delta'1δ","key":"AN2nONDihe"},{"type":"text","value":")","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"key":"vFiRtfClNZ"}],"key":"B06h9sHA6F"},{"type":"math","value":"\\begin{aligned}\n \\text{Regret}_T &\\le 3 T^{2/3} \\sqrt[3]{K \\ln(2K/\\delta') / 2} \\\\\n &= \\tilde{O}(T^{2/3} K^{1/3}).\n\\end{aligned}","position":{"start":{"line":442,"column":1},"end":{"line":447,"column":1}},"html":"RegretT3T2/3Kln(2K/δ)/23=O~(T2/3K1/3).\\begin{aligned}\n \\text{Regret}_T &\\le 3 T^{2/3} \\sqrt[3]{K \\ln(2K/\\delta') / 2} \\\\\n &= \\tilde{O}(T^{2/3} K^{1/3}).\n\\end{aligned}RegretT3T2/33Kln(2K/δ)/2=O~(T2/3K1/3).","enumerator":"3.15","key":"MqxhpcwHrH"},{"type":"paragraph","position":{"start":{"line":449,"column":1},"end":{"line":452,"column":1}},"children":[{"type":"text","value":"The ETC algorithm is rather “abrupt” in that it switches from\nexploration to exploitation after a fixed number of timesteps. 
In\npractice, it’s often better to use a more gradual transition, which\nbrings us to the ","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"key":"yonGcvWZ5A"},{"type":"emphasis","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"children":[{"type":"text","value":"epsilon-greedy","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"key":"BKS7SR8j2l"}],"key":"WYHcFQq2MP"},{"type":"text","value":" algorithm.","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"key":"eUZUpGXJFy"}],"key":"P2vYT0I0Bj"}],"key":"cjBeXopxsP"},{"type":"block","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":456,"column":1},"end":{"line":456,"column":1}},"children":[{"type":"text","value":"Epsilon-greedy","position":{"start":{"line":456,"column":1},"end":{"line":456,"column":1}},"key":"ji8XNPkVJU"}],"identifier":"epsilon-greedy","label":"Epsilon-greedy","html_id":"epsilon-greedy","implicit":true,"enumerator":"3.5","key":"NLvCUx9LtI"},{"type":"paragraph","position":{"start":{"line":458,"column":1},"end":{"line":462,"column":1}},"children":[{"type":"text","value":"Instead of doing all of the exploration and then all of the exploitation\nseparately – which additionally requires knowing the time horizon\nbeforehand – we can instead interleave exploration and exploitation by,\nat each timestep, choosing a random action with some probability. We\ncall this the ","position":{"start":{"line":458,"column":1},"end":{"line":458,"column":1}},"key":"tqSmwAsEAc"},{"type":"strong","position":{"start":{"line":458,"column":1},"end":{"line":458,"column":1}},"children":[{"type":"text","value":"epsilon-greedy","position":{"start":{"line":458,"column":1},"end":{"line":458,"column":1}},"key":"wJTMvhdOq9"}],"key":"WdKUkLjhIj"},{"type":"text","value":" algorithm.","position":{"start":{"line":458,"column":1},"end":{"line":458,"column":1}},"key":"WjXgw7FJ3C"}],"key":"SuAKAjb3ZB"}],"key":"S3Ng9bz0aa"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class EpsilonGreedy(Agent):\n def __init__(\n self,\n K: int,\n T: int,\n ε_array: Float[Array, \" T\"],\n ):\n super().__init__(K, T)\n self.ε_array = ε_array\n\n def choose_arm(self):\n return solutions.epsilon_greedy_choose_arm(self)","key":"x5ULJq8IZg"},{"type":"output","id":"DDgEyKtxzNkrVJwR4bLkY","data":[],"key":"aNEiLPnumS"}],"data":{},"key":"sEaNrmuOPP"},{"type":"block","children":[],"key":"J2L3LTuMjT"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = EpsilonGreedy(mab.K, mab.T, np.full(mab.T, 0.1))\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"Zsne9effF7"},{"type":"output","id":"ifd9Tm1uOL39NkNTliiN6","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"6ad1018e4c18668300eb6bbe80bdc84f","path":"/build/6ad1018e4c18668300eb6bbe80bdc84f.png"}}}],"key":"tnmncr89k2"}],"data":{},"key":"eQxevAZP4A"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"Note that we let ","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"hogdLhI4W7"},{"type":"text","value":"ε","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"zhOf537OVW"},{"type":"text","value":" vary over time. In particular, we might want to gradually ","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"YAQ0O39pTp"},{"type":"emphasis","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"decrease","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"NRCcrWwqys"}],"key":"InOXzFmY4I"},{"type":"text","value":" ","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"n4uXxssn8N"},{"type":"text","value":"ε","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"HxFa0y7X2k"},{"type":"text","value":" as we learn more about the reward distributions and no longer need to spend time exploring.","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"DrBTDeA5Ig"}],"key":"BpEOOygwXG"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"ruk9aoyi6Q"}],"key":"ndxjkKiCsq"},{"type":"paragraph","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"children":[{"type":"text","value":"What is the expected regret of the algorithm if we set ","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"iS1qgE4jzx"},{"type":"text","value":"ε","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"NrhlBNORpE"},{"type":"text","value":" to be a constant?","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"a7OogPimfw"}],"key":"MjZ5AgC2Ju"}],"key":"jcp6nacDlz"},{"type":"paragraph","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"children":[{"type":"text","value":"It turns out that setting ","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"key":"Cn4w0N2irW"},{"type":"inlineMath","value":"\\epsilon_t = \\sqrt[3]{K \\ln(t)/t}","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"html":"ϵt=Kln(t)/t3\\epsilon_t = \\sqrt[3]{K \\ln(t)/t}ϵt=3Kln(t)/t","key":"S0baTq2yeK"},{"type":"text","value":" also achieves a regret of ","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"key":"dFaTXlN8AJ"},{"type":"inlineMath","value":"\\tilde O(t^{2/3} K^{1/3})","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"html":"O~(t2/3K1/3)\\tilde O(t^{2/3} K^{1/3})O~(t2/3K1/3)","key":"g4Ur928bg7"},{"type":"text","value":" (ignoring the logarithmic factors). (We will not prove this here.) 
TODO ADD PROOF CITATION","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"key":"eUVlBfTcSf"}],"key":"YIqL4M6Jo8"},{"type":"paragraph","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"In ETC, we had to set ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"lPlaIuNwsP"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"iYPPxnpxjE"},{"type":"text","value":" based on the total number of timesteps ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"lzOs9b3DLl"},{"type":"inlineMath","value":"T","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"TTT","key":"BmJBjJDvOR"},{"type":"text","value":". But the epsilon-greedy algorithm actually handles the exploration ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"LohbUKVxzr"},{"type":"emphasis","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"automatically","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"PnFKzgc9YC"}],"key":"xXwvBZ300j"},{"type":"text","value":": the regret rate holds for ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"ytw6yr8Z4P"},{"type":"emphasis","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"uvpO4DdwPp"}],"key":"argQq60ENl"},{"type":"text","value":" ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"YkYR7OMr2D"},{"type":"inlineMath","value":"t","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"ttt","key":"xNBBZrR1mG"},{"type":"text","value":", and doesn’t depend on the final horizon ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"ZwvCfLWM4f"},{"type":"inlineMath","value":"T","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"TTT","key":"UK0pNL5826"},{"type":"text","value":".","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"BlnQ8B0M95"}],"key":"PndiD3ygdz"},{"type":"paragraph","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"children":[{"type":"text","value":"But the way these algorithms explore is rather naive: we’ve been exploring ","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"aUjskVDCmK"},{"type":"emphasis","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"children":[{"type":"text","value":"uniformly","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"JlbuK0IpZB"}],"key":"YCcPkCcLLn"},{"type":"text","value":" across all the arms. 
But what if we could be smarter about it, and explore ","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"QkfNWXBRi3"},{"type":"emphasis","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"children":[{"type":"text","value":"more","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"j1HlEu0hOZ"}],"key":"glEi9Iqeps"},{"type":"text","value":" for arms that we’re less certain about?","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"XM73HMF8Fp"}],"key":"pRsdDeMzuX"}],"key":"ZIml5tYJiI"},{"type":"block","position":{"start":{"line":497,"column":1},"end":{"line":497,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"children":[{"type":"text","value":"Upper Confidence Bound (UCB)","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"r0DdkjfUzw"}],"label":"ucb","identifier":"ucb","html_id":"ucb","enumerator":"3.6","key":"xX0xm17eu8"},{"type":"paragraph","position":{"start":{"line":502,"column":1},"end":{"line":506,"column":1}},"children":[{"type":"text","value":"To quantify how ","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"AhSGwQ18iC"},{"type":"emphasis","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"certain","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"HjcYr7fNJ8"}],"key":"EDetClK3Ui"},{"type":"text","value":" we are about the mean of each arm, we’ll\ncompute ","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"OEPqktHlYI"},{"type":"emphasis","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"confidence intervals","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"Ji5F0WczcD"}],"key":"LiRtyHQbDd"},{"type":"text","value":" for our estimators, and then choose the\narm with the highest ","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"CatXuG8nzI"},{"type":"emphasis","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"upper confidence bound","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"UzPVy9GBRw"}],"key":"LPNZce77OL"},{"type":"text","value":". This operates on the\nprinciple of ","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"t5MRFSz92i"},{"type":"strong","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"the benefit of the doubt (i.e. 
optimism in the face of\nuncertainty)","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"jaFOmKCl2U"}],"key":"SNiwWSfsbs"},{"type":"text","value":": we’ll choose the arm that we’re most optimistic about.","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"U4aON9bDtZ"}],"key":"K8J9jVIPae"},{"type":"paragraph","position":{"start":{"line":508,"column":1},"end":{"line":511,"column":1}},"children":[{"type":"text","value":"In particular, for each arm ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"lNdVzWCuES"},{"type":"inlineMath","value":"k","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"kkk","key":"WYfaE8DpSv"},{"type":"text","value":" at time ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"phnNjHBhDA"},{"type":"inlineMath","value":"t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"ttt","key":"chr0KhiPaW"},{"type":"text","value":", we’d like to compute some\nupper confidence bound ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"UMaoDrUotO"},{"type":"inlineMath","value":"M^k_t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"MtkM^k_tMtk","key":"Q9AKptXrvF"},{"type":"text","value":" such that ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"AFdEGxT0OV"},{"type":"inlineMath","value":"\\hat \\mu^k_t \\le M^k_t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"μ^tkMtk\\hat \\mu^k_t \\le M^k_tμ^tkMtk","key":"ts5k7Hc8pp"},{"type":"text","value":" with\nhigh probability, and then choose ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"NlhYTK5lEr"},{"type":"inlineMath","value":"a_t := \\arg \\max_{k \\in [K]} M^k_t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"at:=argmaxk[K]Mtka_t := \\arg \\max_{k \\in [K]} M^k_tat:=argmaxk[K]Mtk","key":"ctnV6SJMpS"},{"type":"text","value":".\nBut how should we compute ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"Up4aa7O8Uh"},{"type":"inlineMath","value":"M^k_t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"MtkM^k_tMtk","key":"O4GMlOEd5p"},{"type":"text","value":"?","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"PMR7MDY91v"}],"key":"DJzTAGiC9i"},{"type":"paragraph","position":{"start":{"line":513,"column":1},"end":{"line":519,"column":1}},"children":[{"type":"text","value":"In ","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"wFQMfTCu48"},{"type":"crossReference","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"children":[{"type":"text","value":"Section ","key":"jsAVbowuqP"},{"type":"text","value":"3.4.1","key":"TF5KNouHJd"}],"identifier":"etc-regret-analysis","label":"etc-regret-analysis","kind":"heading","template":"Section %s","enumerator":"3.4.1","resolved":true,"html_id":"etc-regret-analysis","key":"CbdKZnf06t"},{"type":"text","value":", we were able to compute this bound\nusing Hoeffding’s inequality, which assumes that the number of samples\nis 
","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"w6nbaKNFLJ"},{"type":"emphasis","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"children":[{"type":"text","value":"fixed","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"XdvFmJtEXs"}],"key":"j3ZSYyz26c"},{"type":"text","value":". This was the case in ETC (where we pull each arm\n","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"uA19jJe8JT"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"aKMA68hoQn"},{"type":"text","value":" times), but in UCB, the number of times we pull\neach arm depends on the agent’s actions, which in turn depend on the\nrandom rewards and are therefore stochastic. So we ","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"WWo3SpiTFx"},{"type":"emphasis","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"children":[{"type":"text","value":"can’t","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"X3rxeonKB3"}],"key":"l4iCImMmnj"},{"type":"text","value":" use\nHoeffding’s inequality directly.","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"TD2aPjz168"}],"key":"MlOQy339GN"},{"type":"paragraph","position":{"start":{"line":521,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"text","value":"Instead, we’ll apply the same trick we used in the ETC analysis: we’ll\nuse the ","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"ov1xPRhphh"},{"type":"strong","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"union bound","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"xoWGLvbJ9P"}],"key":"ymTB12J70F"},{"type":"text","value":" to compute a ","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"iR3dkEDFrk"},{"type":"emphasis","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"looser","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"BCJmpQkiEj"}],"key":"axaxABl4HY"},{"type":"text","value":" bound that holds\n","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"hcZMIbR3pE"},{"type":"emphasis","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"uniformly","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"iDLisPFrn8"}],"key":"UsEKe8vsKh"},{"type":"text","value":" across all timesteps and arms. 
Let’s introduce some notation\nto discuss this.","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"DVtUVKgdE7"}],"key":"ibAlrWItpu"},{"type":"paragraph","position":{"start":{"line":526,"column":1},"end":{"line":528,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"IoehmXWf4r"},{"type":"inlineMath","value":"N^k_t","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"NtkN^k_tNtk","key":"ZJQloC55ea"},{"type":"text","value":" denote the (random) number of times arm ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"Iz629Dr7dX"},{"type":"inlineMath","value":"k","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"kkk","key":"QWXmRssCJJ"},{"type":"text","value":" has been pulled\nwithin the first ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"L7GZ5Lj5lh"},{"type":"inlineMath","value":"t","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"ttt","key":"mimNbsOyHf"},{"type":"text","value":" timesteps, and ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"r1c8eGiVkO"},{"type":"inlineMath","value":"\\hat \\mu^k_t","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"μ^tk\\hat \\mu^k_tμ^tk","key":"A6DpWgkrtB"},{"type":"text","value":" denote the sample\naverage of those pulls. That is,","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"aTI6AX3nen"}],"key":"yBDMG739Lp"},{"type":"math","value":"\\begin{aligned}\n N^k_t &:= \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} \\\\\n \\hat \\mu^k_t &:= \\frac{1}{N^k_t} \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} r_\\tau.\n\\end{aligned}","position":{"start":{"line":530,"column":1},"end":{"line":535,"column":1}},"html":"Ntk:=τ=0t11{aτ=k}μ^tk:=1Ntkτ=0t11{aτ=k}rτ.\\begin{aligned}\n N^k_t &:= \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} \\\\\n \\hat \\mu^k_t &:= \\frac{1}{N^k_t} \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} r_\\tau.\n\\end{aligned}Ntkμ^tk:=τ=0t11{aτ=k}:=Ntk1τ=0t11{aτ=k}rτ.","enumerator":"3.16","key":"aXBnJAEni9"},{"type":"paragraph","position":{"start":{"line":537,"column":1},"end":{"line":543,"column":1}},"children":[{"type":"text","value":"To achieve the “fixed sample size” assumption, we’ll\nneed to shift our index from ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"bUMM8MQdiP"},{"type":"emphasis","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"children":[{"type":"text","value":"time","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"EHpH6DsdRn"}],"key":"QTjTnzagE2"},{"type":"text","value":" to ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"NLWopHvGOq"},{"type":"emphasis","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"children":[{"type":"text","value":"number of samples from each\narm","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"w2FRaaW1NJ"}],"key":"mLbmcxgNqx"},{"type":"text","value":". 
In particular, we'll define $\tilde r^k_n$ to be the $n$th sample
from arm $k$, and $\tilde \mu^k_n$ to be the sample average of the first
$n$ samples from arm $k$. Then, for a fixed $n$, this satisfies the
"fixed sample size" assumption, and we can apply Hoeffding's inequality
to get a bound on $\tilde \mu^k_n$.

So how can we extend our bound on $\tilde\mu^k_n$ to $\hat \mu^k_t$?
Well, we know $N^k_t \le t$ (where equality would be the case if and
only if we had pulled arm $k$ every time). So we can apply the same
trick as last time, where we uniform-ize across all possible values of
$N^k_t$:

$$
\begin{aligned}
    \pr\left( \forall n \le t, |\tilde \mu^k_n - \mu^k | \le \sqrt{\frac{\ln(2/\delta)}{2n}} \right) &\ge 1-t\delta.
\end{aligned}
$$

In particular, since $N^k_t \le t$, and $\tilde \mu^k_{N^k_t} = \hat \mu^k_t$ by definition, we have

$$
\begin{aligned}
    \pr\left( |\hat \mu^k_t - \mu^k | \le \sqrt{\frac{\ln(2t/\delta')}{2N^k_t}} \right) &\ge 1-\delta' \text{ where } \delta' := t \delta.
\end{aligned}
$$

This bound would then suffice for applying the UCB algorithm! That is, the upper confidence bound for arm $k$ would be

$$
M^k_t := \hat \mu^k_t + \sqrt{\frac{\ln(2t/\delta')}{2N^k_t}},
$$

where we can choose $\delta'$ depending on how tight we want the interval to be.

- A smaller $\delta'$ would give us a larger and higher-confidence interval, emphasizing the exploration term.
- A larger $\delta'$ would give a tighter and lower-confidence interval, prioritizing the current sample averages.

We can now use this to define the UCB algorithm.

```python
class UCB(Agent):
    def __init__(self, K: int, T: int, delta: float):
        super().__init__(K, T)
        self.delta = delta

    def choose_arm(self):
        return solutions.ucb_choose_arm(self)
```

Intuitively, UCB prioritizes arms where:

1. $\hat \mu^k_t$ is large, i.e. the arm has a high sample average, and
   we'd choose it for *exploitation*, and
2. $\sqrt{\frac{\ln(2t/\delta')}{2N^k_t}}$ is large, i.e. we're still
   uncertain about the arm, and we'd choose it for *exploration*.

As desired, this explores in a smarter, *adaptive* way compared to the
previous algorithms.
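The call to `solutions.ucb_choose_arm` above is a stub for the course's reference solution, which isn't shown here. As a rough, self-contained sketch of the rule it should implement (my own illustrative code, not the official solution), the score $M^k_t$ can be computed per arm and the highest-scoring arm chosen:

```python
import numpy as np

def ucb_scores(sample_means, pull_counts, t, delta):
    """Upper confidence bounds M^k_t for each arm, following the formula above."""
    counts = np.maximum(pull_counts, 1)  # avoid dividing by zero for arms not yet pulled
    bonus = np.sqrt(np.log(2 * t / delta) / (2 * counts))
    return sample_means + bonus

# Toy example (made-up numbers): 3 arms at timestep t = 10 with delta' = 0.05
means = np.array([0.4, 0.6, 0.5])
counts = np.array([3, 4, 3])
print(np.argmax(ucb_scores(means, counts, t=10, delta=0.05)))
```

In practice, ties and never-pulled arms need a bit of care; the course code breaks ties with `random_argmax`.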
Does it achieve lower regret?

```python
agent = UCB(mab.K, mab.T, 0.9)
mab_loop(mab, agent)
plot_strategy(mab, agent)
```

*(Output: a plot of the UCB agent's arm choices and regret over time.)*

### UCB regret analysis

First we'll bound the regret incurred at each timestep. Then we'll bound
the *total* regret across timesteps.

For the sake of analysis, we'll use a slightly looser bound that applies
across the whole time horizon and across all arms. We'll omit the
derivation since it's very similar to the above (walk through it
yourself for practice).

$$
\begin{aligned}
    \pr\left(\forall k \le K, t < T. |\hat \mu^k_t - \mu^k | \le B^k_t \right) &\ge 1-\delta'' \\
    \text{where} \quad B^k_t &:= \sqrt{\frac{\ln(2TK/\delta'')}{2N^k_t}}.
\end{aligned}
$$

Intuitively, $B^k_t$ denotes the *width* of the CI for arm $k$ at time
$t$. Then, assuming the above uniform bound holds (which occurs with
probability $1-\delta''$), we can bound the regret at each timestep as
follows:

$$
\begin{aligned}
    \mu^\star - \mu^{a_t} &\le \hat \mu^{k^*}_t + B_t^{k^*} - \mu^{a_t} && \text{applying UCB to arm } k^\star \\
    &\le \hat \mu^{a_t}_t + B^{a_t}_t - \mu^{a_t} && \text{since UCB chooses } a_t = \arg \max_{k \in [K]} \hat \mu^k_t + B_t^{k} \\
    &\le 2 B^{a_t}_t && \text{since } \hat \mu^{a_t}_t - \mu^{a_t} \le B^{a_t}_t \text{ by definition of } B^{a_t}_t
\end{aligned}
$$

Summing this across timesteps gives

$$
\begin{aligned}
    \text{Regret}_T &\le \sum_{t=0}^{T-1} 2 B^{a_t}_t \\
    &= \sqrt{2\ln(2TK/\delta'')} \sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} \\
    \sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} &= \sum_{t=0}^{T-1} \sum_{k=1}^K \mathbf{1}\{ a_t = k \} (N^k_t)^{-1/2} \\
    &= \sum_{k=1}^K \sum_{n=1}^{N_T^k} n^{-1/2} \\
    &\le K \sum_{n=1}^T n^{-1/2} \\
    \sum_{n=1}^T n^{-1/2} &\le 1 + \int_1^T x^{-1/2} \ \mathrm{d}x \\
    &= 1 + (2 \sqrt{x})_1^T \\
    &= 2 \sqrt{T} - 1 \\
    &\le 2 \sqrt{T}
\end{aligned}
$$

Putting everything together gives

$$
\begin{aligned}
    \text{Regret}_T &\le 2 K \sqrt{2T \ln(2TK/\delta'')} && \text{with probability } 1-\delta'' \\
    &= \tilde O(K\sqrt{T})
\end{aligned}
$$

In fact, we can do a more sophisticated analysis to trim off a factor of $\sqrt{K}$
and show $\text{Regret}_T = \tilde O(\sqrt{TK})$.
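To get a feel for the scale of this bound (with made-up numbers of my own, not from the text): for $K = 10$ arms, $T = 10^6$ timesteps, and $\delta'' = 0.05$, the guarantee is already far below the trivial worst case of $T$:

```python
import numpy as np

K, T, delta = 10, 1_000_000, 0.05
bound = 2 * K * np.sqrt(2 * T * np.log(2 * T * K / delta))
print(bound, bound / T)  # ~1.3e5 total regret, i.e. average per-step regret ~0.13
```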
### Lower bound on regret (intuition)

Is it possible to do better than $\Omega(\sqrt{T})$ in general? In fact,
no! We can show that any algorithm must incur $\Omega(\sqrt{T})$ regret
in the worst case. We won't rigorously prove this here, but the
intuition is as follows.

The Central Limit Theorem tells us that with $T$ i.i.d. samples from
some distribution, we can only learn the mean of the distribution to
within $\Omega(1/\sqrt{T})$ (the standard deviation). Then, since we get
$T$ samples spread out across the arms, we can only learn each arm's
mean to an even looser degree.

That is, if two arms have means that are within about $1/\sqrt{T}$, we
won't be able to confidently tell them apart, and will sample them about
equally. But then we'll incur regret

$$
\Omega((T/2) \cdot (1/\sqrt{T})) = \Omega(\sqrt{T}).
$$

(thompson_sampling)=
## Thompson sampling and Bayesian bandits

So far, we've treated the parameters $\mu^0, \dots, \mu^{K-1}$ of the
reward distributions as *fixed*. Instead, we can take a **Bayesian**
approach where we treat them as random variables from some **prior
distribution**. Then, upon pulling an arm and observing a reward, we can
simply *condition* on this observation to exactly describe the
**posterior distribution** over the parameters. This fully describes the
information we gain about the parameters from observing the reward.

From this Bayesian perspective, the **Thompson sampling** algorithm
follows naturally: just sample from the distribution of the optimal arm,
given the observations!

```python
class Distribution:
    def sample(self) -> Float[Array, " K"]:
        """Sample a vector of means for the K arms."""
        ...

    def update(self, arm: int, reward: float):
        """Condition on obtaining `reward` from the given arm."""
        ...
```

```python
class ThompsonSampling(Agent):
    def __init__(self, K: int, T: int, prior: Distribution):
        super().__init__(K, T)
        self.distribution = prior

    def choose_arm(self):
        means = self.distribution.sample()
        return random_argmax(means)

    def update_history(self, arm: int, reward: int):
        super().update_history(arm, reward)
        self.distribution.update(arm, reward)
```
In other words, we sample each arm proportionally to how likely we think
it is to be optimal, given the observations so far. This strikes a good
exploration-exploitation tradeoff: we explore more for arms that we're
less certain about, and exploit more for arms that we're more certain
about. Thompson sampling is a simple yet powerful algorithm that
achieves state-of-the-art performance in many settings.

```{prf:example} Bayesian Bernoulli bandit
:label: bayesian_bernoulli

We've been working in the Bernoulli bandit setting, where arm $k$ yields a reward of 1 with probability $\mu^k$ and no reward otherwise. The vector of success probabilities $\boldsymbol{\mu} = (\mu^1, \dots, \mu^K)$ thus describes the entire MAB.

Under the Bayesian perspective, we think of $\boldsymbol{\mu}$ as a *random* vector drawn from some prior distribution $\pi(\boldsymbol{\mu})$. For example, we might have $\pi$ be the Uniform distribution over the unit hypercube $[0, 1]^K$, that is,

$$
\pi(\boldsymbol{\mu}) = \begin{cases}
    1 & \text{if } \boldsymbol{\mu}\in [0, 1]^K \\
    0 & \text{otherwise}
\end{cases}
$$

In this case, upon viewing some reward, we can exactly calculate the **posterior** distribution of $\boldsymbol{\mu}$ using Bayes's rule (i.e. the definition of conditional probability):

$$
\begin{aligned}
    \pr(\boldsymbol{\mu} \mid a_0, r_0) &\propto \pr(r_0 \mid a_0, \boldsymbol{\mu}) \pr(a_0 \mid \boldsymbol{\mu}) \pr(\boldsymbol{\mu}) \\
    &\propto (\mu^{a_0})^{r_0} (1 - \mu^{a_0})^{1-r_0}.
\end{aligned}
$$

This is the PDF of the $\text{Beta}(1 + r_0, 1 + (1 - r_0))$ distribution, which is a conjugate prior for the Bernoulli distribution. That is, if we start with a Beta prior on $\mu^k$ (note that $\text{Unif}([0, 1]) = \text{Beta}(1, 1)$), then the posterior, after conditioning on samples from $\text{Bern}(\mu^k)$, will also be Beta. This is a very convenient property, since it means we can simply update the parameters of the Beta distribution upon observing a reward, rather than having to recompute the entire posterior distribution from scratch.
```

```python
class Beta(Distribution):
    def __init__(self, K: int, alpha: int = 1, beta: int = 1):
        self.alphas = np.full(K, alpha)
        self.betas = np.full(K, beta)

    def sample(self):
        return np.random.beta(self.alphas, self.betas)

    def update(self, arm: int, reward: int):
        self.alphas[arm] += reward
        self.betas[arm] += 1 - reward
```

```python
beta_distribution = Beta(mab.K)
agent = ThompsonSampling(mab.K, mab.T, beta_distribution)
mab_loop(mab, agent)
plot_strategy(mab, agent)
```

*(Output: a plot of the Thompson sampling agent's arm choices and regret over time.)*
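To make the conjugate update in the `Beta` class above concrete, here is a small illustrative run (the arm index and rewards are invented for the example): starting from the uniform $\text{Beta}(1, 1)$ prior, three 1-rewards and one 0-reward from arm 0 leave that arm with a $\text{Beta}(4, 2)$ posterior.

```python
dist = Beta(K=3)            # uniform Beta(1, 1) prior over each of 3 arms
for r in [1, 1, 0, 1]:      # hypothetical rewards observed from arm 0
    dist.update(arm=0, reward=r)

print(dist.alphas[0], dist.betas[0])                      # 4 2  ->  posterior Beta(4, 2)
print(dist.alphas[0] / (dist.alphas[0] + dist.betas[0]))  # posterior mean ~0.667
```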
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"991419959ab213822fb1c34db8883adb","path":"/build/991419959ab213822fb1c34db8883adb.png"}}}],"key":"a29HhM9n8K"}],"data":{},"key":"AKhyoq64A7"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":784,"column":1},"end":{"line":786,"column":1}},"children":[{"type":"text","value":"It turns out that asymptotically, Thompson sampling is optimal in the\nfollowing sense. ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"foUHGuvXsJ"},{"type":"cite","kind":"narrative","label":"lai_asymptotically_1985","identifier":"lai_asymptotically_1985","children":[{"type":"text","value":"Lai & Robbins (1985)","key":"uInCl56ItK"}],"enumerator":"2","key":"UYtgLULFqW"},{"type":"text","value":" prove an\n","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"MIO2MAjKPM"},{"type":"emphasis","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"instance-dependent","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"WNfPaXusDp"}],"key":"KLAxTId0pj"},{"type":"text","value":" lower bound that says for ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"QGvjqhfeho"},{"type":"emphasis","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"B5L9alCxhz"}],"key":"Dc22SkIrcS"},{"type":"text","value":" bandit algorithm,","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"pZJfMIobZn"}],"key":"lcpFRXf1nO"},{"type":"math","value":"\\liminf_{T \\to \\infty} \\frac{\\E[N_T^k]}{\\ln(T)} \\ge \\frac{1}{\\text{KL}(\\mu^k \\parallel \\mu^\\star)}","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"html":"lim infTE[NTk]ln(T)1KL(μkμ)\\liminf_{T \\to \\infty} \\frac{\\E[N_T^k]}{\\ln(T)} \\ge \\frac{1}{\\text{KL}(\\mu^k \\parallel \\mu^\\star)}Tliminfln(T)E[NTk]KL(μkμ)1","enumerator":"3.27","key":"xMUVppssr5"},{"type":"paragraph","position":{"start":{"line":790,"column":1},"end":{"line":790,"column":1}},"children":[{"type":"text","value":"where","position":{"start":{"line":790,"column":1},"end":{"line":790,"column":1}},"key":"sOBlHwpS7q"}],"key":"bpg9yswn47"},{"type":"math","value":"\\text{KL}(\\mu^k \\parallel \\mu^\\star) = \\mu^k \\ln \\frac{\\mu^k}{\\mu^\\star} + (1 - \\mu^k) \\ln \\frac{1 - \\mu^k}{1 - \\mu^\\star}","position":{"start":{"line":792,"column":1},"end":{"line":792,"column":1}},"html":"KL(μkμ)=μklnμkμ+(1μk)ln1μk1μ\\text{KL}(\\mu^k \\parallel \\mu^\\star) = \\mu^k \\ln \\frac{\\mu^k}{\\mu^\\star} + (1 - \\mu^k) \\ln \\frac{1 - \\mu^k}{1 - \\mu^\\star}KL(μkμ)=μklnμμk+(1μk)ln1μ1μk","enumerator":"3.28","key":"DhrS8PFGYD"},{"type":"paragraph","position":{"start":{"line":794,"column":1},"end":{"line":798,"column":1}},"children":[{"type":"text","value":"measures the ","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"zSKRzUc6Ho"},{"type":"strong","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"Kullback-Leibler divergence","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"Cc8ePqw2Sy"}],"key":"ac95ZhcwyS"},{"type":"text","value":" from the Bernoulli\ndistribution with mean 
","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"LEhFKkiEUV"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"html":"μk\\mu^kμk","key":"jnXP9Vx2OD"},{"type":"text","value":" to the Bernoulli distribution with mean\n","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"NWwPMEJfrD"},{"type":"inlineMath","value":"\\mu^\\star","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"html":"μ\\mu^\\starμ","key":"Gp9rDDHdA7"},{"type":"text","value":". It turns out that Thompson sampling achieves this lower\nbound with equality! That is, not only is the error ","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"Gp4CqBuDgu"},{"type":"emphasis","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"rate","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"reZjZNIp13"}],"key":"OcQiTjRXbS"},{"type":"text","value":" optimal, but\nthe ","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"JEdXCM5AUV"},{"type":"emphasis","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"constant factor","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"T0OwhBNmaT"}],"key":"RlzPJoFoJi"},{"type":"text","value":" is optimal as well.","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"VnTmBmWDL2"}],"key":"SFgAnv7YC4"}],"key":"FGUzP6T2EO"},{"type":"block","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"children":[{"type":"text","value":"Contextual bandits","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"lJzt0mXggJ"}],"identifier":"contextual-bandits","label":"Contextual bandits","html_id":"contextual-bandits","implicit":true,"enumerator":"3.8","key":"e9LV0cjy5O"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"Xthvt7g9jC"}],"key":"MhiVZRDEpx"},{"type":"paragraph","position":{"start":{"line":805,"column":1},"end":{"line":805,"column":1}},"children":[{"type":"text","value":"This content is advanced material taught at the end of the course.","position":{"start":{"line":805,"column":1},"end":{"line":805,"column":1}},"key":"sYT68wQzZe"}],"key":"lA0uV3nhcA"}],"key":"WkfA2QFEPT"},{"type":"paragraph","position":{"start":{"line":808,"column":1},"end":{"line":814,"column":1}},"children":[{"type":"text","value":"In the above MAB environment, the reward distributions of the arms\nremain constant. However, in many real-world settings, we might receive\nadditional information that affects these distributions. For example, in\nthe online advertising case where each arm corresponds to an ad we could\nshow the user, we might receive information about the user’s preferences\nthat changes how likely they are to click on a given ad. 
We can model\nsuch environments using ","position":{"start":{"line":808,"column":1},"end":{"line":808,"column":1}},"key":"kXsMuYKuM2"},{"type":"strong","position":{"start":{"line":808,"column":1},"end":{"line":808,"column":1}},"children":[{"type":"text","value":"contextual bandits","position":{"start":{"line":808,"column":1},"end":{"line":808,"column":1}},"key":"BAzL5z5tJ0"}],"key":"xlMDDcE6CB"},{"type":"text","value":".","position":{"start":{"line":808,"column":1},"end":{"line":808,"column":1}},"key":"M2T8cZEKKY"}],"key":"JgRKJX3gvL"},{"type":"proof","kind":"definition","label":"contextual_bandit","identifier":"contextual_bandit","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Contextual bandit","position":{"start":{"line":816,"column":1},"end":{"line":816,"column":1}},"key":"LwptsEeM7C"}],"key":"tM6PW1kmXU"},{"type":"paragraph","position":{"start":{"line":819,"column":1},"end":{"line":824,"column":1}},"children":[{"type":"text","value":"At each timestep ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"bPafTm7FTY"},{"type":"inlineMath","value":"t","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"ttt","key":"uaSCMZH63Q"},{"type":"text","value":", a new ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"jUIRnWdWEv"},{"type":"emphasis","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"children":[{"type":"text","value":"context","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"UM6olXeLGe"}],"key":"eJ69BBWYkS"},{"type":"text","value":"\n","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"uRutDObvDf"},{"type":"inlineMath","value":"x_t","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"xtx_txt","key":"whrzxKYp1s"},{"type":"text","value":" is drawn from some distribution ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"RXyVII9dk6"},{"type":"inlineMath","value":"\\nu_{\\text{x}}","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"νx\\nu_{\\text{x}}νx","key":"NnLaqdmygz"},{"type":"text","value":". The learner gets\nto observe the context, and choose an action ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"FYs8IdqzBt"},{"type":"inlineMath","value":"a_t","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"ata_tat","key":"tFjQZgaDTe"},{"type":"text","value":" according to some\ncontext-dependent policy ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"b9Z49FmpZS"},{"type":"inlineMath","value":"\\pi_t(x_t)","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"πt(xt)\\pi_t(x_t)πt(xt)","key":"sP8UdVElnn"},{"type":"text","value":". Then, the learner observes the\nreward from the chosen arm ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"zSl8oLICro"},{"type":"inlineMath","value":"r_t \\sim \\nu^{a_t}(x_t)","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"rtνat(xt)r_t \\sim \\nu^{a_t}(x_t)rtνat(xt)","key":"uPyxfseIGY"},{"type":"text","value":". 
The reward\ndistribution also depends on the context.","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"hLwXuhM51i"}],"key":"lpw802SR6J"}],"enumerator":"3.2","html_id":"contextual-bandit","key":"AE1nKi4eWZ"}],"key":"h2YJFxwif9"},{"type":"block","position":{"start":{"line":827,"column":1},"end":{"line":827,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":829,"column":1},"end":{"line":831,"column":1}},"children":[{"type":"text","value":"Assuming our context is ","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"key":"M4PpaTPVKv"},{"type":"emphasis","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"children":[{"type":"text","value":"discrete","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"key":"ikOXlLHsdT"}],"key":"ssZZP7WlTA"},{"type":"text","value":", we can just perform the same\nalgorithms, treating each context-arm pair as its own arm. This gives us\nan enlarged MAB of ","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"key":"EF1zSkMvkP"},{"type":"inlineMath","value":"K |\\mathcal{X}|","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"html":"KXK |\\mathcal{X}|KX","key":"jtvW5Uj4cm"},{"type":"text","value":" arms.","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"key":"DST73qnaPW"}],"key":"tAXRp9Hnlx"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"Vn95InZN3f"}],"key":"DQYKTsnyrQ"},{"type":"paragraph","position":{"start":{"line":834,"column":1},"end":{"line":835,"column":1}},"children":[{"type":"text","value":"Write down the UCB algorithm for this enlarged MAB. That is, write an\nexpression for ","position":{"start":{"line":834,"column":1},"end":{"line":834,"column":1}},"key":"bUPHh9hb2H"},{"type":"inlineMath","value":"\\pi_t(x_t) = \\arg\\max_a \\dots","position":{"start":{"line":834,"column":1},"end":{"line":834,"column":1}},"html":"πt(xt)=argmaxa\\pi_t(x_t) = \\arg\\max_a \\dotsπt(xt)=argmaxa","key":"d6jrDfnNt2"},{"type":"text","value":".","position":{"start":{"line":834,"column":1},"end":{"line":834,"column":1}},"key":"tcKHz6ZprY"}],"key":"hpzv4ThQZV"}],"key":"yBLTOGTMSn"},{"type":"paragraph","position":{"start":{"line":838,"column":1},"end":{"line":844,"column":1}},"children":[{"type":"text","value":"Recall that running UCB for ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"RlnaTOLlWU"},{"type":"inlineMath","value":"T","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"TTT","key":"z1kxMlLDO0"},{"type":"text","value":" timesteps on an MAB with ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"WEkKgufg1u"},{"type":"inlineMath","value":"K","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"KKK","key":"mOmi9vtVLW"},{"type":"text","value":" arms\nachieves a regret bound of ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"CmOyqWInBJ"},{"type":"inlineMath","value":"\\tilde{O}(\\sqrt{TK})","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"O~(TK)\\tilde{O}(\\sqrt{TK})O~(TK)","key":"kF0GSK3Iv4"},{"type":"text","value":". 
So in this problem,\nwe would achieve regret ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"okswnaAuBn"},{"type":"inlineMath","value":"\\tilde{O}(\\sqrt{TK|\\mathcal{X}|})","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"O~(TKX)\\tilde{O}(\\sqrt{TK|\\mathcal{X}|})O~(TKX)","key":"TNSFrADcer"},{"type":"text","value":" in the\ncontextual MAB, which has a polynomial dependence on ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"KvX52lZTRy"},{"type":"inlineMath","value":"|\\mathcal{X}|","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"X|\\mathcal{X}|X","key":"CmvURs45Ni"},{"type":"text","value":".\nBut in a situation where we have large, or even infinitely many\ncontexts, e.g. in the case where our context is a continuous value, this\nbecomes intractable.","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"Kn2Id4WkJG"}],"key":"H4uCi4dzXE"},{"type":"paragraph","position":{"start":{"line":846,"column":1},"end":{"line":850,"column":1}},"children":[{"type":"text","value":"Note that this “enlarged MAB” treats the different contexts as entirely\nunrelated to each other, while in practice, often contexts are ","position":{"start":{"line":846,"column":1},"end":{"line":846,"column":1}},"key":"n2FkcjLKnj"},{"type":"emphasis","position":{"start":{"line":846,"column":1},"end":{"line":846,"column":1}},"children":[{"type":"text","value":"related","position":{"start":{"line":846,"column":1},"end":{"line":846,"column":1}},"key":"w5u7diShor"}],"key":"wiCZjmXhKi"},{"type":"text","value":"\nto each other in some way: for example, we might want to advertise\nsimilar products to users with similar preferences. How can we\nincorporate this structure into our solution?","position":{"start":{"line":846,"column":1},"end":{"line":846,"column":1}},"key":"c088eLTrXK"}],"key":"zHKdQP8ORL"}],"key":"nppRzYxkG8"},{"type":"block","position":{"start":{"line":852,"column":1},"end":{"line":852,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"children":[{"type":"text","value":"Linear contextual bandits","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"hb09FpEFde"}],"label":"lin_ucb","identifier":"lin_ucb","html_id":"lin-ucb","enumerator":"3.8.1","key":"cRfex1pJWm"},{"type":"paragraph","position":{"start":{"line":857,"column":1},"end":{"line":866,"column":1}},"children":[{"type":"text","value":"We want to model the ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"ve24fROls5"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"mean reward","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"zjLjzAy8tm"}],"key":"tZWVCVcnAo"},{"type":"text","value":" of arm ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"jDJmb1S5wT"},{"type":"inlineMath","value":"k","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"kkk","key":"An3fFNMxw3"},{"type":"text","value":" as a function of the\ncontext, i.e. 
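For completeness, here is a minimal sketch of the naive reduction just described. The per-context interface is hypothetical (the course's `mab_loop` does not pass contexts), but the idea is simply one independent copy of `UCB` per discrete context, i.e. $K|\mathcal{X}|$ arms in total.

```python
class PerContextUCB:
    """Naive reduction: run an independent UCB agent for each discrete context (illustrative sketch)."""

    def __init__(self, K: int, T: int, n_contexts: int, delta: float):
        self.agents = [UCB(K, T, delta) for _ in range(n_contexts)]

    def choose_arm(self, context: int):
        return self.agents[context].choose_arm()

    def update_history(self, context: int, arm: int, reward: float):
        self.agents[context].update_history(arm, reward)
```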
","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"qvSJaPhxiG"},{"type":"inlineMath","value":"\\mu^k(x)","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"μk(x)\\mu^k(x)μk(x)","key":"miqkBjvqZa"},{"type":"text","value":". One simple model is the ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"PRMRGv5sZz"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"mNqZvI0OSA"}],"key":"itRyoAjrAa"},{"type":"text","value":" one:\n","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"njN9ODxIqh"},{"type":"inlineMath","value":"\\mu^k(x) = x^\\top \\theta^k","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"μk(x)=xθk\\mu^k(x) = x^\\top \\theta^kμk(x)=xθk","key":"UdkTDDarOn"},{"type":"text","value":", where ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"r35xLOkWMs"},{"type":"inlineMath","value":"x \\in \\mathcal{X} = \\mathbb{R}^d","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"xX=Rdx \\in \\mathcal{X} = \\mathbb{R}^dxX=Rd","key":"xjLz5YOmxg"},{"type":"text","value":" and\n","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"ir0Mzed20c"},{"type":"inlineMath","value":"\\theta^k \\in \\mathbb{R}^d","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"θkRd\\theta^k \\in \\mathbb{R}^dθkRd","key":"eOXnRwoCvK"},{"type":"text","value":" describes a ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"lMorcBP94o"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"feature direction","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"aABITnPRTv"}],"key":"gewx4ChjPB"},{"type":"text","value":" for arm ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"wBJ06JbEEj"},{"type":"inlineMath","value":"k","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"kkk","key":"Sb97viNR2o"},{"type":"text","value":". 
Recall\nthat ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"F7INHUuF9t"},{"type":"strong","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"xFkfhnqQnD"}],"key":"nbOHImGU4T"},{"type":"text","value":" gives us a way to estimate a conditional\nexpectation from samples: We learn a ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"xkn1eVUZ0G"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"least squares","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"Qmxxumm4Pc"}],"key":"Li71WVnmSa"},{"type":"text","value":" estimator from the\ntimesteps where arm ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"tDcWbUgEvw"},{"type":"inlineMath","value":"k","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"kkk","key":"mfKiFEvrkM"},{"type":"text","value":" was selected:","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"b4LKyKHMne"}],"key":"oFk60JAAia"},{"type":"math","value":"\\hat \\theta_t^k = \\arg\\min_{\\theta \\in \\mathbb{R}^d} \\sum_{\\{ i \\in [t] : a_i = k \\}} (r_i - x_i^\\top \\theta)^2.","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"tight":true,"html":"θ^tk=argminθRd{i[t]:ai=k}(rixiθ)2.\\hat \\theta_t^k = \\arg\\min_{\\theta \\in \\mathbb{R}^d} \\sum_{\\{ i \\in [t] : a_i = k \\}} (r_i - x_i^\\top \\theta)^2.θ^tk=argθRdmin{i[t]:ai=k}(rixiθ)2.","enumerator":"3.29","key":"xnV667nxQ7"},{"type":"paragraph","position":{"start":{"line":857,"column":1},"end":{"line":866,"column":1}},"children":[{"type":"text","value":"This has the closed-form solution known as the ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"ZDCZ3C9TKj"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"ordinary least squares","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"blZrYwum2C"}],"key":"RLFgfbNre7"},{"type":"text","value":"\n(OLS) estimator:","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"jG1Et8MCch"}],"key":"la6FzH4bg6"},{"type":"math","value":"\\begin{aligned}\n \\hat \\theta_t^k & = (A_t^k)^{-1} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i r_i \\\\\n \\text{where} \\quad A_t^k & = \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\top.\n\\end{aligned}","label":"ols_bandit","identifier":"ols_bandit","html":"θ^tk=(Atk)1{i[t]:ai=k}xiriwhereAtk={i[t]:ai=k}xixi.\\begin{aligned}\n \\hat \\theta_t^k & = (A_t^k)^{-1} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i r_i \\\\\n \\text{where} \\quad A_t^k & = \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\top.\n\\end{aligned}θ^tkwhereAtk=(Atk)1{i[t]:ai=k}xiri={i[t]:ai=k}xixi.","enumerator":"3.30","html_id":"ols-bandit","key":"Ma5P8EDgFw"},{"type":"paragraph","position":{"start":{"line":877,"column":1},"end":{"line":884,"column":1}},"children":[{"type":"text","value":"We can now apply the UCB algorithm in this environment in order to\nbalance 
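As an illustrative sketch of {eq}`ols_bandit` (my own code, not from the course), the per-arm statistics can be accumulated online and the estimate recovered with a linear solve; the small ridge term is purely my addition to keep $A^k_t$ invertible before arm $k$ has seen $d$ linearly independent contexts, which the text does not discuss.

```python
import numpy as np

class ArmOLS:
    """Running per-arm statistics for the OLS estimate of theta^k (illustrative sketch)."""

    def __init__(self, d: int, reg: float = 1e-6):
        self.A = reg * np.eye(d)  # approximates A_t^k = sum_i x_i x_i^T
        self.b = np.zeros(d)      # sum_i x_i r_i

    def update(self, x: np.ndarray, r: float):
        self.A += np.outer(x, x)
        self.b += r * x

    def estimate(self) -> np.ndarray:
        return np.linalg.solve(self.A, self.b)  # hat theta_t^k

# Tiny usage example with made-up data:
ols = ArmOLS(d=2)
ols.update(np.array([1.0, 0.0]), r=0.5)
ols.update(np.array([0.0, 1.0]), r=0.8)
print(ols.estimate())  # approximately [0.5, 0.8]
```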
","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"jZIJOEZsWk"},{"type":"emphasis","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"exploration","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"aGWBVXnpi5"}],"key":"Wy6ThxSGs5"},{"type":"text","value":" of new arms and ","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"eFjAORzs5F"},{"type":"emphasis","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"exploitation","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"vOReg9XrVb"}],"key":"wWe3CWvljY"},{"type":"text","value":" of arms that we\nbelieve to have high reward. But how should we construct the upper\nconfidence bound? Previously, we treated the pulls of an arm as i.i.d.\nsamples and used Hoeffding’s inequality to bound the distance of the\nsample mean, our estimator, from the true mean. However, now our\nestimator is not a sample mean, but rather the OLS estimator above ","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"Dh7F9Jc32a"},{"type":"crossReference","kind":"equation","identifier":"ols_bandit","label":"ols_bandit","children":[{"type":"text","value":"(","key":"LCOtBmyjUF"},{"type":"text","value":"3.30","key":"S4lS9iAAtg"},{"type":"text","value":")","key":"dbDh0is9MZ"}],"template":"(%s)","enumerator":"3.30","resolved":true,"html_id":"ols-bandit","key":"ANkIn4K9rR"},{"type":"text","value":". Instead, we’ll use ","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"iAuonfjPje"},{"type":"strong","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"Chebyshev’s\ninequality","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"n0XgWHGKRR"}],"key":"jv4iRFVLdT"},{"type":"text","value":" to construct an upper confidence bound.","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"u1VqdumhE8"}],"key":"wRL74C3LJ6"},{"type":"proof","kind":"theorem","label":"chebyshev","identifier":"chebyshev","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Chebyshev’s inequality","position":{"start":{"line":886,"column":1},"end":{"line":886,"column":1}},"key":"VBNwDKJLbS"}],"key":"UTv9ihzauh"},{"type":"paragraph","position":{"start":{"line":889,"column":1},"end":{"line":891,"column":1}},"children":[{"type":"text","value":"For a random variable ","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"Qpazz15kr8"},{"type":"inlineMath","value":"Y","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"html":"YYY","key":"vumxns3IK8"},{"type":"text","value":" such that\n","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"ySVv7PD6uM"},{"type":"inlineMath","value":"\\E Y = 0","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"html":"EY=0\\E Y = 0EY=0","key":"WOrOK9ZgMU"},{"type":"text","value":" and ","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"YyHLKOTH1i"},{"type":"inlineMath","value":"\\E Y^2 = \\sigma^2","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"html":"EY2=σ2\\E Y^2 = 
\\sigma^2EY2=σ2","key":"btXYLoKaDG"},{"type":"text","value":",","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"TOtZ1MOilc"}],"key":"EDoy2kp3tJ"},{"type":"math","value":"|Y| \\le \\beta \\sigma \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"tight":"before","html":"Yβσwith probability11β2|Y| \\le \\beta \\sigma \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}Yβσwith probability1β21","enumerator":"3.31","key":"Uo4yJJFNVo"}],"enumerator":"3.3","html_id":"chebyshev","key":"ptlq6L9ySn"},{"type":"paragraph","position":{"start":{"line":894,"column":1},"end":{"line":896,"column":1}},"children":[{"type":"text","value":"Since the OLS estimator is known to be unbiased (try proving this\nyourself), we can apply Chebyshev’s inequality to\n","position":{"start":{"line":894,"column":1},"end":{"line":894,"column":1}},"key":"kapI4qpWxT"},{"type":"inlineMath","value":"x_t^\\top (\\hat \\theta_t^k - \\theta^k)","position":{"start":{"line":894,"column":1},"end":{"line":894,"column":1}},"html":"xt(θ^tkθk)x_t^\\top (\\hat \\theta_t^k - \\theta^k)xt(θ^tkθk)","key":"T8T5LHXUJ8"},{"type":"text","value":":","position":{"start":{"line":894,"column":1},"end":{"line":894,"column":1}},"key":"wGZtG4CYz1"}],"key":"xkvq6eRzf9"},{"type":"math","value":"\\begin{aligned}\n x_t^\\top \\theta^k \\le x_t^\\top \\hat \\theta_t^k + \\beta \\sqrt{x_t^\\top (A_t^k)^{-1} x_t} \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}\n\\end{aligned}","position":{"start":{"line":898,"column":1},"end":{"line":900,"column":1}},"html":"xtθkxtθ^tk+βxt(Atk)1xtwith probability11β2\\begin{aligned}\n x_t^\\top \\theta^k \\le x_t^\\top \\hat \\theta_t^k + \\beta \\sqrt{x_t^\\top (A_t^k)^{-1} x_t} \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}\n\\end{aligned}xtθkxtθ^tk+βxt(Atk)1xtwith probability1β21","enumerator":"3.32","key":"NHKzsY2KCm"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"srTkGGrq7z"}],"key":"ntrnkwjaKX"},{"type":"paragraph","position":{"start":{"line":903,"column":1},"end":{"line":905,"column":1}},"children":[{"type":"text","value":"We haven’t explained why ","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"p92h3nQ0eU"},{"type":"inlineMath","value":"x_t^\\top (A_t^k)^{-1} x_t","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"html":"xt(Atk)1xtx_t^\\top (A_t^k)^{-1} x_txt(Atk)1xt","key":"TCLC0WUOyJ"},{"type":"text","value":" is the correct\nexpression for the variance of ","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"OhhxElnrzs"},{"type":"inlineMath","value":"x_t^\\top \\hat \\theta_t^k","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"html":"xtθ^tkx_t^\\top \\hat \\theta_t^kxtθ^tk","key":"bjyxBbMfT3"},{"type":"text","value":". 
This result\nfollows from some algebra on the definition of the OLS estimator ","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"vTeNkaEFp6"},{"type":"crossReference","kind":"equation","identifier":"ols_bandit","label":"ols_bandit","children":[{"type":"text","value":"(","key":"YhLK6i0l24"},{"type":"text","value":"3.30","key":"UsFN0lJgr0"},{"type":"text","value":")","key":"iiWBD4mJBO"}],"template":"(%s)","enumerator":"3.30","resolved":true,"html_id":"ols-bandit","key":"MC8Ne1BpgG"},{"type":"text","value":".","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"Zs2vxGECZ7"}],"key":"cdFP8etNXC"}],"key":"lOCFbMwec8"},{"type":"paragraph","position":{"start":{"line":908,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"The first term is exactly our predicted reward ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"JRXtbpOktn"},{"type":"inlineMath","value":"\\hat \\mu^k_t(x_t)","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"μ^tk(xt)\\hat \\mu^k_t(x_t)μ^tk(xt)","key":"DD3yskHvU3"},{"type":"text","value":". To\ninterpret the second term, note that","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"Gxo7770hbF"}],"key":"R0fL6bj7JZ"},{"type":"math","value":"x_t^\\top (A_t^k)^{-1} x_t = \\frac{1}{N_t^k} x_t^\\top (\\Sigma_t^k)^{-1} x_t,","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"tight":true,"html":"xt(Atk)1xt=1Ntkxt(Σtk)1xt,x_t^\\top (A_t^k)^{-1} x_t = \\frac{1}{N_t^k} x_t^\\top (\\Sigma_t^k)^{-1} x_t,xt(Atk)1xt=Ntk1xt(Σtk)1xt,","enumerator":"3.33","key":"EboiwS09Ua"},{"type":"paragraph","position":{"start":{"line":908,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"where","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"p46G01VkwQ"}],"key":"ee68TE6hXm"},{"type":"math","value":"\\Sigma_t^k = \\frac{1}{N_t^k} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\top","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"tight":true,"html":"Σtk=1Ntk{i[t]:ai=k}xixi\\Sigma_t^k = \\frac{1}{N_t^k} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\topΣtk=Ntk1{i[t]:ai=k}xixi","enumerator":"3.34","key":"n9xEpDed9b"},{"type":"paragraph","position":{"start":{"line":908,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"is the empirical covariance matrix of the contexts (assuming that the\ncontext has mean zero). 
That is, the learner is encouraged to choose\narms when ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"abqsGDhEi8"},{"type":"inlineMath","value":"x_t","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"xtx_txt","key":"kz4vcex6pW"},{"type":"text","value":" is ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"I6UC5CU9TH"},{"type":"emphasis","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"children":[{"type":"text","value":"not aligned","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"Qf6Z3bQJN1"}],"key":"A9STEmWXLd"},{"type":"text","value":" with the data seen so far, or if arm\n","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"mZ45YKycvP"},{"type":"inlineMath","value":"k","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"kkk","key":"Wbh7WXvo4Z"},{"type":"text","value":" has not been explored much and so ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"u3B7SmSA6z"},{"type":"inlineMath","value":"N_t^k","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"NtkN_t^kNtk","key":"APkWQ0rePX"},{"type":"text","value":" is small.","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"cdasE8hSzk"}],"key":"Uw7uAvteKH"},{"type":"paragraph","position":{"start":{"line":918,"column":1},"end":{"line":919,"column":1}},"children":[{"type":"text","value":"We can now substitute these quantities into UCB to get the ","position":{"start":{"line":918,"column":1},"end":{"line":918,"column":1}},"key":"BJZiKd17mM"},{"type":"strong","position":{"start":{"line":918,"column":1},"end":{"line":918,"column":1}},"children":[{"type":"text","value":"LinUCB","position":{"start":{"line":918,"column":1},"end":{"line":918,"column":1}},"key":"kWKlVH7wBE"}],"key":"rzZr9b5lAT"},{"type":"text","value":"\nalgorithm:","position":{"start":{"line":918,"column":1},"end":{"line":918,"column":1}},"key":"EvorEYXAJc"}],"key":"MAsGvsE4sQ"}],"key":"NDTARNZui2"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class LinUCBPseudocode(Agent):\n def __init__(\n self, K: int, T: int, D: int, lam: float, get_c: Callable[[int], float]\n ):\n super().__init__(K, T)\n self.lam = lam\n self.get_c = get_c\n self.contexts = [None for _ in range(K)]\n self.A = np.repeat(lam * np.eye(D)[...], K)\n self.targets = np.zeros(K, D)\n self.w = np.zeros(K, D)\n\n def choose_arm(self, context: Float[Array, \" D\"]):\n c = self.get_c(self.count)\n scores = self.w @ context + c * np.sqrt(\n context.T @ np.linalg.solve(self.A, context)\n )\n return random_argmax(scores)\n\n def update_history(self, context: Float[Array, \" D\"], arm: int, reward: int):\n self.A[arm] += np.outer(context, context)\n self.targets[arm] += context * reward\n self.w[arm] = np.linalg.solve(self.A[arm], self.targets[arm])","key":"SGSkwlt0AN"},{"type":"output","id":"3MX4RSRKv0TYiDnlhuby4","data":[],"key":"Et4Hra0reA"}],"data":{},"key":"Ywe0yOM2Ii"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"mpSPm2eNzz"}],"key":"ppCNsjHqju"},{"type":"paragraph","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"Note that the matrix 
","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"bXjsA7i185"},{"type":"inlineMath","value":"A_t^k","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"html":"AtkA_t^kAtk","key":"E4vtYMvYUZ"},{"type":"text","value":" above might not be invertible. When does this occur? One way to address this is to include a ","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"QAJ6RaMswc"},{"type":"inlineMath","value":"\\lambda I","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"html":"λI\\lambda IλI","key":"J2cizETQ0J"},{"type":"text","value":" regularization term to ensure that ","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"okSZgvBPQX"},{"type":"inlineMath","value":"A_t^k","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"html":"AtkA_t^kAtk","key":"mDLA0gHBXo"},{"type":"text","value":" is invertible. This is equivalent to solving a ","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"PDeLcaI375"},{"type":"emphasis","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"ridge regression","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"o7My5osfmn"}],"key":"YXb3QqJK4J"},{"type":"text","value":" problem instead of the unregularized least squares problem. Implement this solution. TODO SOLUTION CURRENTLY SHOWN","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"yz3rgrCCWw"}],"key":"E8FrPUh7gl"}],"key":"zjPW5CLmND"}],"key":"Zb4Ga5IDJJ"},{"type":"block","position":{"start":{"line":951,"column":1},"end":{"line":951,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":953,"column":1},"end":{"line":956,"column":1}},"children":[{"type":"inlineMath","value":"c_t","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"html":"ctc_tct","key":"WquyfIPyJc"},{"type":"text","value":" is similar to the ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"AkJeWgn6Uv"},{"type":"inlineMath","value":"\\log (2t/\\delta')","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"html":"log(2t/δ)\\log (2t/\\delta')log(2t/δ)","key":"wuohpaqCul"},{"type":"text","value":" term of UCB: It controls the\nwidth of the confidence interval. 
Here, we treat it as a tunable\nparameter, though in a theoretical analysis, it would depend on ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"AxGhoLBwTb"},{"type":"inlineMath","value":"A_t^k","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"html":"AtkA_t^kAtk","key":"C9t27xhFN6"},{"type":"text","value":"\nand the probability ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"IWfCuGYcta"},{"type":"text","value":"δ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"QeSHtoLiBx"},{"type":"text","value":" with which the bound holds.","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"zrb25ANtD9"}],"key":"wnj7ViXpSm"},{"type":"paragraph","position":{"start":{"line":958,"column":1},"end":{"line":959,"column":1}},"children":[{"type":"text","value":"Using similar tools for UCB, we can also prove an ","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"cl6NB8zuCT"},{"type":"inlineMath","value":"\\tilde{O}(\\sqrt{T})","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"html":"O~(T)\\tilde{O}(\\sqrt{T})O~(T)","key":"vWekOLjtiP"},{"type":"text","value":"\nregret bound. The full details of the analysis can be found in Section 3 of ","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"HqB4I5mNzj"},{"type":"cite","kind":"narrative","label":"agarwal_reinforcement_2022","identifier":"agarwal_reinforcement_2022","children":[{"type":"text","value":"Agarwal ","key":"ZX1hNQCndA"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"TwAs0Ox69Y"}],"key":"TE1KamsOjP"},{"type":"text","value":" (2022)","key":"lx0vbXC6lm"}],"enumerator":"3","key":"zbQ4Ck40zp"},{"type":"text","value":".","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"jBo4gHK5yN"}],"key":"kNMI29Lkd9"},{"type":"heading","depth":2,"position":{"start":{"line":961,"column":1},"end":{"line":961,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":961,"column":1},"end":{"line":961,"column":1}},"key":"XsmHUm3rEW"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"3.9","key":"nGSZbnfj42"},{"type":"paragraph","position":{"start":{"line":963,"column":1},"end":{"line":964,"column":1}},"children":[{"type":"text","value":"In this chapter,\nwe explored the ","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"key":"SEwmDvuTlE"},{"type":"strong","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"children":[{"type":"text","value":"multi-armed bandit","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"key":"J5DViuH2YD"}],"key":"OkeTSZ3q4b"},{"type":"text","value":" setting for analyzing sequential decision-making in an unknown environment.","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"key":"qOwc71TLeN"}],"key":"OVSSmOHPRx"}],"key":"DGLY7twmMf"}],"key":"fqhCm5iabt"},"references":{"cite":{"order":["vershynin_high-dimensional_2018","lai_asymptotically_1985","agarwal_reinforcement_2022"],"data":{"vershynin_high-dimensional_2018":{"label":"vershynin_high-dimensional_2018","enumerator":"1","html":"Vershynin, R. (2018). High-Dimensional Probability: An Introduction with Applications in Data Science. 
Cambridge University Press."},"lai_asymptotically_1985":{"label":"lai_asymptotically_1985","enumerator":"2","doi":"10.1016/0196-8858(85)90002-8","html":"Lai, T. L., & Robbins, H. (1985). Asymptotically Efficient Adaptive Allocation Rules. Advances in Applied Mathematics, 6(1), 4–22. 10.1016/0196-8858(85)90002-8","url":"https://doi.org/10.1016/0196-8858(85)90002-8"},"agarwal_reinforcement_2022":{"label":"agarwal_reinforcement_2022","enumerator":"3","html":"Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). Reinforcement Learning: Theory and Algorithms."}}}},"footer":{"navigation":{"prev":{"title":"2 Linear Quadratic Regulators","url":"/control","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"4 Supervised learning","url":"/supervised-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"cb8437494713e13080ce9e296ca5fbb4d04ebda213c523132d19db6324b795e6","slug":"bandits","location":"/bandits.md","dependencies":[],"frontmatter":{"title":"3 Multi-Armed Bandits","numbering":{"all":{"enabled":true},"enumerator":{"template":"3.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"bandits.md","url":"/build/bandits-edc5c0bbc4c299ec710273a0eb78717a.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"e5Qf5lahRq"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"3.1","key":"DCUSLJ99at"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":21,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"JtMfMdwb60"},{"type":"strong","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"multi-armed bandits","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"GtR5BDmXyz"}],"key":"tzG7pvxhWl"},{"type":"text","value":" (MAB) setting is a simple setting for studying the basic challenges of sequential decision-making.\nIn this setting, an agent repeatedly chooses from a fixed set of actions, called ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"yaMhSZ10N2"},{"type":"strong","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"arms","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"H7nkiJPXqR"}],"key":"SW7Xka4i78"},{"type":"text","value":", each of which 
has an associated reward distribution. The agent’s goal is to maximize the total reward it receives over some time period.","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"FNZHHVQkPP"}],"key":"UjR9JeCykR"},{"type":"comment","value":" \n| States | Actions | Rewards |\n| :----: | :-----: | :---------------------------------: |\n| None | Finite | $\\mathcal{A} \\to \\triangle([0, 1])$ |\n","key":"daALMH4b1M"},{"type":"paragraph","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"In particular, we’ll spend a lot of time discussing the ","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"pdNBe09fIf"},{"type":"strong","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Exploration-Exploitation Tradeoff","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"Qh5LQgyQtk"}],"key":"Q4uEoAjGbM"},{"type":"text","value":": should the agent choose new actions to learn more about the environment, or should it choose actions that it already knows to be good?","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"gqDeCseIu7"}],"key":"YhFl9OwBcb"},{"type":"proof","kind":"example","label":"advertising","identifier":"advertising","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Online advertising","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"uii9N6q1jp"}],"key":"exxjhgqNZL"},{"type":"paragraph","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"Let’s suppose you, the agent, are an advertising company. You have ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"pfoiGRP8bw"},{"type":"inlineMath","value":"K","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"KKK","key":"w3EMNDY23I"},{"type":"text","value":" different ads that you can show to users; For concreteness, let’s suppose there’s just a single user. You receive ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"D6mkMH5WKL"},{"type":"text","value":"1","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"UK0HENksXR"},{"type":"text","value":" reward if the user clicks the ad, and ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"DgRQXmBuBF"},{"type":"text","value":"0","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"t77XR7xrnb"},{"type":"text","value":" otherwise. Thus, the unknown ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"pOsKOWtVh7"},{"type":"emphasis","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"reward distribution","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"I6CDp2EPOt"}],"key":"s7UMGmWaYZ"},{"type":"text","value":" associated to each ad is a Bernoulli distribution defined by the probability that the user clicks on the ad. 
Your goal is to maximize the total number of clicks by the user.","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"iD6RSL9fUU"}],"key":"zGqcZtWwL3"}],"enumerator":"3.1","html_id":"advertising","key":"upXxEXpK4c"},{"type":"proof","kind":"example","label":"clinical_trials","identifier":"clinical_trials","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Clinical trials","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"key":"p0nAA80Puc"}],"key":"zMi4mr56KN"},{"type":"paragraph","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"Suppose you’re a pharmaceutical company, and you’re testing a new drug. You have ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"R7PGktbNdO"},{"type":"inlineMath","value":"K","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"html":"KKK","key":"ECKSRChtYr"},{"type":"text","value":" different dosages of the drug that you can administer to patients. You receive ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"xCp6Gvkjea"},{"type":"text","value":"1","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"jhRebCAEM7"},{"type":"text","value":" reward if the patient recovers, and ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"jHERp2I6sP"},{"type":"text","value":"0","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"MQE4PpdVoT"},{"type":"text","value":" otherwise. Thus, the unknown ","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"ia88lBzAXx"},{"type":"emphasis","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"reward distribution","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"VBKI0PNVqe"}],"key":"dJRMIS9RcC"},{"type":"text","value":" associated to each dosage is a Bernoulli distribution defined by the probability that the patient recovers. Your goal is to maximize the total number of patients that recover.","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"PZ9I7b8Twi"}],"key":"EeMpRIqlnn"}],"enumerator":"3.2","html_id":"clinical-trials","key":"tSP5ChMfjT"},{"type":"paragraph","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"children":[{"type":"text","value":"In this chapter, we will introduce the multi-armed bandits setting, and discuss some of the challenges that arise when trying to solve problems in this setting. 
We will also introduce some of the key concepts that we will use throughout the book, such as regret and exploration-exploitation tradeoffs.","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"key":"ZAAgCNd93Y"}],"key":"ujsEa21LHa"}],"key":"q7NUnXVRAW"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from jaxtyping import Float, Array\nimport numpy as np\nimport latexify\nfrom typing import Callable, Union\nimport matplotlib.pyplot as plt\n\nimport solutions.bandits as solutions\n\nnp.random.seed(184)\n\ndef random_argmax(ary: Array) -> int:\n \"\"\"Take an argmax and randomize between ties.\"\"\"\n max_idx = np.flatnonzero(ary == ary.max())\n return np.random.choice(max_idx).item()\n\n\n# used as decorator\nlatex = latexify.algorithmic(\n prefixes={\"mab\"},\n identifiers={\"arm\": \"a_t\", \"reward\": \"r\", \"means\": \"mu\"},\n use_math_symbols=True,\n escape_underscores=False,\n)","key":"EiG9LSCA2a"},{"type":"output","id":"DXdkOjdZu84h6vZJyWFP7","data":[],"key":"dSCrYWab1z"}],"data":{},"key":"EmV6nORQSj"},{"type":"block","position":{"start":{"line":72,"column":1},"end":{"line":72,"column":1}},"children":[{"type":"proof","kind":"remark","label":"multi-armed","identifier":"multi-armed","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Namesake","position":{"start":{"line":74,"column":1},"end":{"line":74,"column":1}},"key":"c0HGe3u0ru"}],"key":"QtbQBjXCsP"},{"type":"paragraph","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"children":[{"type":"text","value":"The name “multi-armed bandits” comes from slot machines in casinos, which are often called “one-armed bandits” since they have one arm (the lever) and take money from the player.","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"wS24YHBTG1"}],"key":"z6bL8yVx6Q"}],"enumerator":"3.1","html_id":"multi-armed","key":"hY1cJHepDr"},{"type":"paragraph","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"OTVFuLKd1m"},{"type":"inlineMath","value":"K","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"KKK","key":"qkE3F2pAJz"},{"type":"text","value":" denote the number of arms. We’ll label them ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"kL9p1Kb8An"},{"type":"inlineMath","value":"0, \\dots, K-1","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"0,,K10, \\dots, K-10,,K1","key":"ELoucNMLSi"},{"type":"text","value":" and use ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"NaBA9XSnpi"},{"type":"emphasis","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"superscripts","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"iAhvj7usqW"}],"key":"O4gyx2QtV6"},{"type":"text","value":" to indicate the arm index; since we seldom need to raise a number to a power, this won’t cause much confusion. 
In this chapter, we’ll consider the ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"j8Cx76KMh9"},{"type":"strong","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"Bernoulli bandit","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"BXozqSAeMJ"}],"key":"jsGE0igqJV"},{"type":"text","value":" setting from the examples above, where arm ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"FkImyQ2GpN"},{"type":"inlineMath","value":"k","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"kkk","key":"DL2z6IW0Il"},{"type":"text","value":" either returns reward ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"BYIv71MR5p"},{"type":"text","value":"1","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"dPTW2DKayy"},{"type":"text","value":" with probability ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"UywukGUE2t"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"μk\\mu^kμk","key":"tQk4ToZozR"},{"type":"text","value":" or ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"kaIKJVrpfx"},{"type":"text","value":"0","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"kqHb6HjQ4b"},{"type":"text","value":" otherwise. The agent gets to pull an arm ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"TR74tgyK9I"},{"type":"inlineMath","value":"T","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"TTT","key":"mIP549YOQm"},{"type":"text","value":" times in total. 
We can formalize the Bernoulli bandit in the following Python code:","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"d3gindTBop"}],"key":"bIFSyxcyTo"}],"key":"LnCxfoYYoi"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class MAB:\n \"\"\"\n The Bernoulli multi-armed bandit environment.\n\n :param means: the means (success probabilities) of the reward distributions for each arm\n :param T: the time horizon\n \"\"\"\n\n def __init__(self, means: Float[Array, \" K\"], T: int):\n assert all(0 <= p <= 1 for p in means)\n self.means = means\n self.T = T\n self.K = self.means.size\n self.best_arm = random_argmax(self.means)\n\n def pull(self, k: int) -> int:\n \"\"\"Pull the `k`-th arm and sample from its (Bernoulli) reward distribution.\"\"\"\n reward = np.random.rand() < self.means[k].item()\n return +reward","key":"qzSHzLTiTL"},{"type":"output","id":"CFgQOeOyO1gh3MsiAbpZy","data":[],"key":"TTIFk1YWER"}],"data":{},"key":"E1WSlBr3X1"},{"type":"block","children":[],"key":"reEJZUF9Au"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"mab = MAB(means=np.array([0.1, 0.8, 0.4]), T=100)","key":"Xt8D5KpSy0"},{"type":"output","id":"-e_89j3QRLjCbMV33QiRu","data":[],"key":"ifA9UBXiwK"}],"data":{},"key":"Xdva9SNrfY"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":108,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"In pseudocode, the agent’s interaction with the MAB environment can be\ndescribed by the following process:","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"nHfugz5o1y"}],"key":"QgqoC2gRed"}],"key":"hx1oZpq79w"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"@latex\ndef mab_loop(mab: MAB, agent: \"Agent\") -> int:\n for t in range(mab.T):\n arm = agent.choose_arm() # in 0, ..., K-1\n reward = mab.pull(arm)\n agent.update_history(arm, reward)\n\n\nmab_loop","key":"ecisB55QN8"},{"type":"output","id":"iW4HYM7Il4EGnAXsWTAJb","data":[{"output_type":"execute_result","execution_count":4,"metadata":{},"data":{"text/plain":{"content":"","content_type":"text/plain"},"text/latex":{"content":"$ \\begin{array}{l} \\mathbf{function} \\ \\mathrm{mab\\_loop}(\\mathrm{mab}, \\mathrm{agent}) \\\\ \\hspace{1em} \\mathbf{for} \\ t \\in \\mathrm{range} \\mathopen{}\\left( T \\mathclose{}\\right) \\ \\mathbf{do} \\\\ \\hspace{2em} \\mathrm{a\\_t} \\gets \\mathrm{agent}.\\mathrm{choose\\_arm} \\mathopen{}\\left( \\mathclose{}\\right) \\\\ \\hspace{2em} r \\gets \\mathrm{pull} \\mathopen{}\\left( \\mathrm{a\\_t} \\mathclose{}\\right) \\\\ \\hspace{2em} \\mathrm{agent}.\\mathrm{update\\_history} \\mathopen{}\\left( \\mathrm{a\\_t}, r \\mathclose{}\\right) \\\\ \\hspace{1em} \\mathbf{end \\ for} \\\\ \\mathbf{end \\ function} \\end{array} $","content_type":"text/latex"}}}],"key":"hu6owtpWWB"}],"data":{},"key":"hd3dTt47Yn"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"UZgiDJju0t"},{"type":"inlineCode","value":"Agent","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"EoKs1AMTsJ"},{"type":"text","value":" class stores the pull history and uses it to decide which arm to pull next. 
Since we are working with Bernoulli bandits, we can summarize the pull history concisely in a ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"H0VML6CrNd"},{"type":"inlineMath","value":"\\mathbb{N}^{K \\times 2}","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"html":"NK×2\\mathbb{N}^{K \\times 2}NK×2","key":"Z1zJZEmhCJ"},{"type":"text","value":" array.","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"ursxZw6P8H"}],"key":"MX4QpEN2tf"}],"key":"bx2INuER7n"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class Agent:\n def __init__(self, K: int, T: int):\n \"\"\"The MAB agent that decides how to choose an arm given the past history.\"\"\"\n self.K = K\n self.T = T\n self.rewards = [] # for plotting\n self.choices = []\n self.history = np.zeros((K, 2), dtype=int)\n\n def choose_arm(self) -> int:\n \"\"\"Choose an arm of the MAB. Algorithm-specific.\"\"\"\n ...\n\n def count(self) -> int:\n \"\"\"The number of pulls made. Also the current step index.\"\"\"\n return len(self.rewards)\n\n def update_history(self, arm: int, reward: int):\n self.rewards.append(reward)\n self.choices.append(arm)\n self.history[arm, reward] += 1","key":"Urmw8Aomaq"},{"type":"output","id":"tsVsGK6D-2CHGEnm22-fC","data":[],"key":"OLcnlho8R8"}],"data":{},"key":"b1lfnDSfkW"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":149,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"What’s the ","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"eryIM7Sl0o"},{"type":"emphasis","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"optimal","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"OTXjzUFBTe"}],"key":"AVuyjd96Mv"},{"type":"text","value":" strategy for the agent, i.e. the one that achieves\nthe highest expected reward? 
Convince yourself that the agent should try\nto always pull the arm with the highest expected reward:","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"rorFssr7DM"}],"key":"rNYby33JH1"},{"type":"math","value":"\\mu^\\star := \\max_{k \\in [K]} \\mu^k.","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"μ:=maxk[K]μk.\\mu^\\star := \\max_{k \\in [K]} \\mu^k.μ:=k[K]maxμk.","enumerator":"3.1","key":"i9pviZ8QbX"},{"type":"paragraph","position":{"start":{"line":155,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"text","value":"The goal, then, can be rephrased as to minimize the ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"ufnF0jO7nM"},{"type":"strong","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"text","value":"regret","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"pVuSiUe45u"}],"key":"JyXmUli3mH"},{"type":"text","value":", defined\nbelow:","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"Ef8p9IIL05"}],"key":"C59iJWNfAJ"},{"type":"proof","kind":"definition","label":"regret","identifier":"regret","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Regret","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"key":"YLx0XN2xd0"}],"key":"kGOVGo8hqX"},{"type":"paragraph","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"The agent’s ","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"kVTaz7AWVB"},{"type":"strong","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"regret","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"ffV1v2a6Cx"}],"key":"h1ICVWP77w"},{"type":"text","value":" after ","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"zbOI61uYgN"},{"type":"inlineMath","value":"T","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"html":"TTT","key":"qhcAbnf3wt"},{"type":"text","value":" timesteps is defined as","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"ZmSn7F2dVf"}],"key":"leyp9g5uP1"},{"type":"math","value":"\\text{Regret}_T := \\sum_{t=0}^{T-1} \\mu^\\star - \\mu^{a_t}.","position":{"start":{"line":163,"column":1},"end":{"line":165,"column":1}},"html":"RegretT:=t=0T1μμat.\\text{Regret}_T := \\sum_{t=0}^{T-1} \\mu^\\star - \\mu^{a_t}.RegretT:=t=0T1μμat.","enumerator":"3.2","key":"QSIqC84n0Y"}],"enumerator":"3.1","html_id":"regret","key":"GtpuDPYRTz"}],"key":"XX7qNlrmpK"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def regret_per_step(mab: MAB, agent: Agent):\n \"\"\"Get the difference from the average reward of the optimal arm. 
The sum of these is the regret.\"\"\"\n return [mab.means[mab.best_arm] - mab.means[arm] for arm in agent.choices]","key":"oC95HlLlZ4"},{"type":"output","id":"yhHe-YFkNO9lJlU2l8CoZ","data":[],"key":"R8DfPfwS1H"}],"data":{},"key":"wukx9KMHpn"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":174,"column":1},"end":{"line":178,"column":1}},"children":[{"type":"text","value":"Note that this depends on the ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"auec50Nls9"},{"type":"emphasis","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"true means","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"YP5rfcNTcJ"}],"key":"wkXHO9g790"},{"type":"text","value":" of the pulled arms, ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"THDhxKuMNN"},{"type":"emphasis","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"uf3CWyCDto"}],"key":"ZEI2hdAAoc"},{"type":"text","value":" the actual\nobserved rewards.\nWe typically think of this as a random variable where\nthe randomness comes from the agent’s strategy (i.e. the sequence of\nactions ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"jiDRQtZ80A"},{"type":"inlineMath","value":"a_0, \\dots, a_{T-1}","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"html":"a0,,aT1a_0, \\dots, a_{T-1}a0,,aT1","key":"nBqGL8eoso"},{"type":"text","value":").","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"k7XUpkhxqC"}],"key":"lHKu8j30Y7"},{"type":"paragraph","position":{"start":{"line":180,"column":1},"end":{"line":181,"column":1}},"children":[{"type":"text","value":"Throughout the chapter, we will try to upper bound the regret of various\nalgorithms in two different senses:","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"hBrx4FcAzr"}],"key":"A9bxNbRDyf"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":183,"column":1},"end":{"line":188,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":183,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":184,"column":1}},"children":[{"type":"text","value":"Upper bound the ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"kQVByqyraS"},{"type":"emphasis","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"expected regret,","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"lV2RxL5Rev"}],"key":"LFWmJ4eLlG"},{"type":"text","value":" i.e. 
show\n","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"FvLtE3Ljyu"},{"type":"inlineMath","value":"\\E[\\text{Regret}_T] \\le M_T","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"html":"E[RegretT]MT\\E[\\text{Regret}_T] \\le M_TE[RegretT]MT","key":"ueqDVxYapL"},{"type":"text","value":".","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"VrYCOeHuYo"}],"key":"KpPkVar55A"}],"key":"m62sn83vRc"},{"type":"listItem","spread":true,"position":{"start":{"line":186,"column":1},"end":{"line":188,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":186,"column":1},"end":{"line":187,"column":1}},"children":[{"type":"text","value":"Find a ","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"gIm6Da81AT"},{"type":"emphasis","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"children":[{"type":"text","value":"high-probability","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"Kdr1lY6UMB"}],"key":"I9A4NBqqTH"},{"type":"text","value":" upper bound on the regret, i.e. show\n","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"H6PKyZ5nCk"},{"type":"inlineMath","value":"\\pr(\\text{Regret}_T \\le M_{T, \\delta}) \\ge 1-\\delta","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"html":"P(RegretTMT,δ)1δ\\pr(\\text{Regret}_T \\le M_{T, \\delta}) \\ge 1-\\deltaP(RegretTMT,δ)1δ","key":"ZGuwMF0xBq"},{"type":"text","value":".","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"OaLJhHOPQY"}],"key":"WOTZiB5dUc"}],"key":"cX5fbUhvY3"}],"key":"prDRXPecpx"},{"type":"paragraph","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"Note that these two different approaches say very different things about the regret. The first approach says that the ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"JhFywTs9sD"},{"type":"emphasis","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"average","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"yXQS80PnPo"}],"key":"dLuDetzR6J"},{"type":"text","value":" regret is at most ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"dbSVVctRRx"},{"type":"inlineMath","value":"M_T","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"html":"MTM_TMT","key":"esDzW4csUO"},{"type":"text","value":". However, the agent might still achieve higher regret on many runs. The second approach says that, ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"th3Q8sAlcp"},{"type":"emphasis","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"with high probability","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"slaIe9mxPQ"}],"key":"MA69dLTPxh"},{"type":"text","value":", the agent will achieve regret at most ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"AwtPYyzmS2"},{"type":"inlineMath","value":"M_{T, \\delta}","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"html":"MT,δM_{T, \\delta}MT,δ","key":"anYXxEs2sB"},{"type":"text","value":". 
However, it doesn’t say anything about the regret in the remaining ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"XpcB3Tg7hW"},{"type":"text","value":"δ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"soh2sPZK5C"},{"type":"text","value":" fraction of runs, which might be arbitrarily high.","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"dncXfSS6cm"}],"key":"w8dJLTkVtr"},{"type":"paragraph","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"We’d like to achieve ","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"fTKmVY5iBb"},{"type":"strong","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"sublinear regret","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"bEOnOSAMwU"}],"key":"AvBBkSE9mz"},{"type":"text","value":" in expectation, i.e. ","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"YcmnZPLPuF"},{"type":"inlineMath","value":"\\E[\\text{Regret}_T] = o(T)","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"html":"E[RegretT]=o(T)\\E[\\text{Regret}_T] = o(T)E[RegretT]=o(T)","key":"IxrPJmScoI"},{"type":"text","value":". That is, as we learn more about the environment, we’d like to be able to exploit that knowledge to take the optimal arm as often as possible.","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"VsyOCQ5PbD"}],"key":"oQc4jmvM3U"},{"type":"paragraph","position":{"start":{"line":193,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"The rest of the chapter comprises a series of increasingly sophisticated\nMAB algorithms.","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"key":"xnh50GuHRj"}],"key":"OvD1atuvzN"}],"key":"rwNCCKdpyj"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def plot_strategy(mab: MAB, agent: Agent):\n plt.figure(figsize=(10, 6))\n\n # plot reward and cumulative regret\n plt.plot(np.arange(mab.T), np.cumsum(agent.rewards), label=\"reward\")\n cum_regret = np.cumsum(regret_per_step(mab, agent))\n plt.plot(np.arange(mab.T), cum_regret, label=\"cumulative regret\")\n\n # draw colored circles for arm choices\n colors = [\"red\", \"green\", \"blue\"]\n color_array = [colors[k] for k in agent.choices]\n plt.scatter(np.arange(mab.T), np.zeros(mab.T), c=color_array, label=\"arm\")\n\n # labels and title\n plt.xlabel(\"timestep\")\n plt.legend()\n plt.title(f\"{agent.__class__.__name__} reward and regret\")\n plt.show()","visibility":"hide","key":"PnkYiiAPmE"},{"type":"output","id":"Os1cKFskOziNeR44xl2Y2","data":[],"visibility":"show","key":"GVm5jO8PWY"}],"data":{"tags":[]},"visibility":"show","key":"zNVlfLP0Jx"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"children":[{"type":"text","value":"Pure exploration (random guessing)","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"key":"z4olLO3bwq"}],"identifier":"pure-exploration-random-guessing","label":"Pure exploration (random 
guessing)","html_id":"pure-exploration-random-guessing","implicit":true,"enumerator":"3.2","key":"fSL7VE5SQ2"},{"type":"paragraph","position":{"start":{"line":221,"column":1},"end":{"line":222,"column":1}},"children":[{"type":"text","value":"A trivial strategy is to always choose arms at random (i.e. “pure\nexploration”).","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"gdMCDid2kH"}],"key":"g0Se1ZWG10"}],"key":"fiYitiFEyp"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class PureExploration(Agent):\n def choose_arm(self):\n \"\"\"Choose an arm uniformly at random.\"\"\"\n return solutions.pure_exploration_choose_arm(self)","identifier":"pure_exploration-code","enumerator":"3.1","html_id":"pure-exploration-code","key":"nxNQBDGeUB"},{"type":"output","id":"CACMwKczZGCjYYfXJ0zyZ","data":[],"identifier":"pure_exploration-output","enumerator":"3.1","html_id":"pure-exploration-output","key":"XunaqXTuAG"}],"data":{},"label":"pure_exploration","identifier":"pure_exploration","enumerator":"3.1","html_id":"pure-exploration","key":"BvFYZAJbtV"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"children":[{"type":"text","value":"Note that","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"t9QFi1n5cY"}],"key":"Gk0Sm3vqjr"},{"type":"math","value":"\\E_{a_t \\sim \\text{Unif}([K])}[\\mu^{a_t}] = \\bar \\mu = \\frac{1}{K} \\sum_{k=1}^K \\mu^k","position":{"start":{"line":235,"column":1},"end":{"line":237,"column":1}},"html":"EatUnif([K])[μat]=μˉ=1Kk=1Kμk\\E_{a_t \\sim \\text{Unif}([K])}[\\mu^{a_t}] = \\bar \\mu = \\frac{1}{K} \\sum_{k=1}^K \\mu^kEatUnif([K])[μat]=μˉ=K1k=1Kμk","enumerator":"3.3","key":"zfUMml5P2u"},{"type":"paragraph","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"children":[{"type":"text","value":"so the expected regret is simply","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"key":"Ew2BOV6OzF"}],"key":"dbwf0XAcvE"},{"type":"math","value":"\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\sum_{t=0}^{T-1} \\E[\\mu^\\star - \\mu^{a_t}] \\\\\n &= T (\\mu^\\star - \\bar \\mu) > 0.\n\\end{aligned}","position":{"start":{"line":241,"column":1},"end":{"line":246,"column":1}},"html":"E[RegretT]=t=0T1E[μμat]=T(μμˉ)>0.\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\sum_{t=0}^{T-1} \\E[\\mu^\\star - \\mu^{a_t}] \\\\\n &= T (\\mu^\\star - \\bar \\mu) > 0.\n\\end{aligned}E[RegretT]=t=0T1E[μμat]=T(μμˉ)>0.","enumerator":"3.4","key":"ORWKQwGAFN"},{"type":"paragraph","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"This scales as ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"pSsbGUGTO0"},{"type":"inlineMath","value":"\\Theta(T)","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"Θ(T)\\Theta(T)Θ(T)","key":"HOBf62taLa"},{"type":"text","value":", i.e. 
","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"FiD48KlWYv"},{"type":"emphasis","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"mI1OAreaU8"}],"key":"erjkj8NAu0"},{"type":"text","value":" in the number of timesteps ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"A1eNEUaSz7"},{"type":"inlineMath","value":"T","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"TTT","key":"Ac1LX8Bh96"},{"type":"text","value":". There’s no learning here: the agent doesn’t use any information about the environment to improve its strategy. You can see that the distribution over its arm choices always appears “(uniformly) random”.","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"YnZtn6RF46"}],"key":"XLgdfqrUQl"}],"key":"WQiu3X129b"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = PureExploration(mab.K, mab.T)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"meXYC58sCS"},{"type":"output","id":"YtfHMP0AeN16YXcUIjdv5","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"e018a4b689feff2c40f2483432d7c76f","path":"/build/e018a4b689feff2c40f2483432d7c76f.png"}}}],"key":"kLcjprnMdo"}],"data":{},"key":"Ngky92gvbf"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"text","value":"Pure greedy","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"I0xdkQ2HRV"}],"identifier":"pure-greedy","label":"Pure greedy","html_id":"pure-greedy","implicit":true,"enumerator":"3.3","key":"dcsSawEXsu"},{"type":"paragraph","position":{"start":{"line":258,"column":1},"end":{"line":260,"column":1}},"children":[{"type":"text","value":"How might we improve on pure exploration? Instead, we could try each arm\nonce, and then commit to the one with the highest observed reward. We’ll\ncall this the ","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"mGbwoVulJ7"},{"type":"strong","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"pure greedy","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"x3YgdUeZgH"}],"key":"CzFn3hNvTJ"},{"type":"text","value":" strategy.","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"idS2rbxV6D"}],"key":"DCj011vLB0"}],"key":"Gv00o0GSSK"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class PureGreedy(Agent):\n def choose_arm(self):\n \"\"\"Choose the arm with the highest observed reward on its first pull.\"\"\"\n return solutions.pure_greedy_choose_arm(self)","identifier":"pure_greedy-code","enumerator":"3.2","html_id":"pure-greedy-code","key":"ExVWCQoq3F"},{"type":"output","id":"1mBnnjMwTJvhgr_2W15_i","data":[],"identifier":"pure_greedy-output","enumerator":"3.2","html_id":"pure-greedy-output","key":"S1PW0o70sb"}],"data":{},"label":"pure_greedy","identifier":"pure_greedy","enumerator":"3.2","html_id":"pure-greedy","key":"Yg8VH47BzH"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":271,"column":1},"end":{"line":274,"column":1}},"children":[{"type":"text","value":"Note we’ve used superscripts ","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"dfZaJZ2WvD"},{"type":"inlineMath","value":"r^k","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"html":"rkr^krk","key":"wfAsf5XD53"},{"type":"text","value":" during the exploration phase to\nindicate that we observe exactly one reward for each arm. 
Then we use\nsubscripts ","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"JIk4Zf7lyS"},{"type":"inlineMath","value":"r_t","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"html":"rtr_trt","key":"SBuwkF1ohr"},{"type":"text","value":" during the exploitation phase to indicate that we\nobserve a sequence of rewards from the chosen greedy arm ","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"HQ9jB5DKbL"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"html":"k^\\hat kk^","key":"N4eFHSxaSg"},{"type":"text","value":".","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"cNUsxcRpX2"}],"key":"wnuJBoh3wJ"},{"type":"paragraph","position":{"start":{"line":276,"column":1},"end":{"line":279,"column":1}},"children":[{"type":"text","value":"How does the expected regret of this strategy compare to that of pure\nexploration? We’ll do a more general analysis in the following section.\nNow, for intuition, suppose there’s just ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"A7F50swRZP"},{"type":"inlineMath","value":"K=2","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"html":"K=2K=2K=2","key":"tN3bvCZT19"},{"type":"text","value":" arms, with Bernoulli\nreward distributions with means ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"iMV9dsa4OR"},{"type":"inlineMath","value":"\\mu^0 > \\mu^1","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"html":"μ0>μ1\\mu^0 > \\mu^1μ0>μ1","key":"j1Rtb4BCwO"},{"type":"text","value":".","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"Dgx3kGE2Er"}],"key":"jSWMD74gzS"},{"type":"paragraph","position":{"start":{"line":281,"column":1},"end":{"line":284,"column":1}},"children":[{"type":"text","value":"Let’s let ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"qLHXMKNGvN"},{"type":"inlineMath","value":"r^0","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"r0r^0r0","key":"OxpmJ3hNJD"},{"type":"text","value":" be the random reward from the first arm and ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"ej1CU1wNf9"},{"type":"inlineMath","value":"r^1","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"r1r^1r1","key":"kMczA5JeVB"},{"type":"text","value":" be the\nrandom reward from the second. If ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"jh0hwcqroS"},{"type":"inlineMath","value":"r^0 > r^1","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"r0>r1r^0 > r^1r0>r1","key":"CKXe1Cc431"},{"type":"text","value":", then we achieve zero\nregret. Otherwise, we achieve regret ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"AkG9hS8q9F"},{"type":"inlineMath","value":"T(\\mu^0 - \\mu^1)","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"T(μ0μ1)T(\\mu^0 - \\mu^1)T(μ0μ1)","key":"vkTCK19THl"},{"type":"text","value":". 
Thus, the\nexpected regret is simply:","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"kmEylHFBjj"}],"key":"DcUnt2npmf"},{"type":"math","value":"\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\pr(r^0 < r^1) \\cdot T(\\mu^0 - \\mu^1) + c \\\\\n &= (1 - \\mu^0) \\mu^1 \\cdot T(\\mu^0 - \\mu^1) + c\n\\end{aligned}","position":{"start":{"line":286,"column":1},"end":{"line":291,"column":1}},"html":"E[RegretT]=P(r0<r1)T(μ0μ1)+c=(1μ0)μ1T(μ0μ1)+c\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\pr(r^0 < r^1) \\cdot T(\\mu^0 - \\mu^1) + c \\\\\n &= (1 - \\mu^0) \\mu^1 \\cdot T(\\mu^0 - \\mu^1) + c\n\\end{aligned}E[RegretT]=P(r0<r1)T(μ0μ1)+c=(1μ0)μ1T(μ0μ1)+c","enumerator":"3.5","key":"mq3cO7Bzc3"},{"type":"paragraph","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"children":[{"type":"text","value":"Which is still ","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"Fy46Ddsjti"},{"type":"inlineMath","value":"\\Theta(T)","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"html":"Θ(T)\\Theta(T)Θ(T)","key":"NFuLHPnBPn"},{"type":"text","value":", the same as pure exploration!","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"rvCqgKunwR"}],"key":"QljI4jbEvL"}],"key":"qt6Od3HaNA"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = PureGreedy(mab.K, mab.T)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"aJ205G4ieG"},{"type":"output","id":"etTMPkjFrFLaPxg2Gy1UL","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"60449ce2034aedba8d659c77e97c9729","path":"/build/60449ce2034aedba8d659c77e97c9729.png"}}}],"key":"JsWmum6yJt"}],"data":{},"key":"wHrV6MIsTv"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"The cumulative regret is a straight line because the regret only depends on the arms chosen and not the actual reward observed. In fact, if the greedy algorithm happens to get lucky on the first set of pulls, it may act entirely optimally for that episode! But its ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"eDSIKIlqx5"},{"type":"emphasis","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"average","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"k8kZUtXTbO"}],"key":"EkLt4RolRe"},{"type":"text","value":" regret is what measures its effectiveness.","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"YSpA12AbGZ"}],"key":"uKSxOuaq8v"}],"key":"BtcWWQHOW3"},{"type":"block","position":{"start":{"line":303,"column":1},"end":{"line":303,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"children":[{"type":"text","value":"Explore-then-commit","position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"key":"SRbyJZeia3"}],"label":"etc","identifier":"etc","html_id":"etc","enumerator":"3.4","key":"YSSlmtfUQZ"},{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":309,"column":1}},"children":[{"type":"text","value":"We can improve the pure greedy algorithm as follows: let’s reduce the variance of the reward estimates by pulling each arm ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"juinOta9Qh"},{"type":"inlineMath","value":"N_{\\text{explore}}> 1","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"Nexplore>1N_{\\text{explore}}> 1Nexplore>1","key":"Tt4AzUA67t"},{"type":"text","value":" times before committing. This is called the ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"ESJHoRRFqx"},{"type":"strong","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"explore-then-commit","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"ofE0oMQHvQ"}],"key":"Tk7JM1fWxo"},{"type":"text","value":" strategy. 
Note that the “pure greedy” strategy above is just the special case where\n","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"FxubxYx7x8"},{"type":"inlineMath","value":"N_{\\text{explore}}= 1","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"Nexplore=1N_{\\text{explore}}= 1Nexplore=1","key":"JwbBXBVxR6"},{"type":"text","value":".","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"Mxo4ulHFlD"}],"key":"N9lsppmoRI"}],"key":"Kqx3arRxvM"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class ExploreThenCommit(Agent):\n def __init__(self, K: int, T: int, N_explore: int):\n super().__init__(K, T)\n self.N_explore = N_explore\n\n def choose_arm(self):\n return solutions.etc_choose_arm(self)","key":"xIXJpRAFiw"},{"type":"output","id":"zRHGWDKSFXi5ARk8ZqhUX","data":[],"key":"j6YwoEshBu"}],"data":{},"key":"X3YQW3GQoR"},{"type":"block","children":[],"key":"CbjzPjuEP4"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = ExploreThenCommit(mab.K, mab.T, mab.T // 15)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"kvKsrB5K6l"},{"type":"output","id":"wG8M5KHgjQEifBrc_Y5SS","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"dde6263087532775cde0fb2de5a471cc","path":"/build/dde6263087532775cde0fb2de5a471cc.png"}}}],"key":"lXoAVA1dIJ"}],"data":{},"key":"StreBKdWRu"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"children":[{"type":"text","value":"Notice that now, the graphs are much more consistent, and the algorithm finds the true optimal arm and sticks with it much more frequently. We would expect ETC to then have a better (i.e. lower) average regret. Can we prove this?","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"DEC1ovmrrW"}],"key":"VrvUXZ5LiJ"}],"key":"fl9OhDqECa"},{"type":"block","position":{"start":{"line":329,"column":1},"end":{"line":329,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":332,"column":1},"end":{"line":332,"column":1}},"children":[{"type":"text","value":"ETC regret analysis","position":{"start":{"line":332,"column":1},"end":{"line":332,"column":1}},"key":"DABUwWR63d"}],"label":"etc-regret-analysis","identifier":"etc-regret-analysis","html_id":"etc-regret-analysis","enumerator":"3.4.1","key":"ZkuGj9VXDg"},{"type":"paragraph","position":{"start":{"line":334,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"Let’s analyze the expected regret of the explore-then-commit strategy by splitting it up\ninto the exploration and exploitation phases.","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"P05gfDsoMF"}],"key":"SdrNX0f7Ak"},{"type":"heading","depth":4,"position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"Exploration phase.","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"dy1JHSC2xJ"}],"identifier":"exploration-phase","label":"Exploration phase.","html_id":"exploration-phase","implicit":true,"enumerator":"3.4.1.1","key":"nlXSKZ3Nio"},{"type":"paragraph","position":{"start":{"line":339,"column":1},"end":{"line":341,"column":1}},"children":[{"type":"text","value":"This phase takes ","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"Q2SA3hTKPr"},{"type":"inlineMath","value":"N_{\\text{explore}}K","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"html":"NexploreKN_{\\text{explore}}KNexploreK","key":"sFVlN93VsU"},{"type":"text","value":" timesteps. 
Since at each step we\nincur at most ","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"Wvo94NbHCi"},{"type":"text","value":"1","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"gKcHOVO6Dq"},{"type":"text","value":" regret, the total regret is at most\n","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"Yh2c8fARMh"},{"type":"inlineMath","value":"N_{\\text{explore}}K","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"html":"NexploreKN_{\\text{explore}}KNexploreK","key":"jlnvBxEFHL"},{"type":"text","value":".","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"Xpi3ZufilD"}],"key":"m4Igbbjbuq"},{"type":"heading","depth":4,"position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"text","value":"Exploitation phase.","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"SpCo5EB03v"}],"identifier":"exploitation-phase","label":"Exploitation phase.","html_id":"exploitation-phase","implicit":true,"enumerator":"3.4.1.2","key":"fKTEBmalEF"},{"type":"paragraph","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"children":[{"type":"text","value":"This will take a bit more effort. We’ll prove that for any total time ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"KCOp7AQab6"},{"type":"inlineMath","value":"T","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"TTT","key":"C6H35B2cJz"},{"type":"text","value":", we can choose ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"h8C8iFO7V4"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"fZk3RWwChC"},{"type":"text","value":" such that with arbitrarily high probability, the regret is sublinear.","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"wYRoBsKr2j"}],"key":"FMRxBYzReE"},{"type":"paragraph","position":{"start":{"line":347,"column":1},"end":{"line":348,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"key":"PvN9KNrsuX"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"html":"k^\\hat kk^","key":"dJrEn2C4iU"},{"type":"text","value":" denote the arm chosen after the exploration phase. 
We know the regret from the\nexploitation phase is","position":{"start":{"line":347,"column":1},"end":{"line":347,"column":1}},"key":"tyw6UtVOVr"}],"key":"ntIM5HToZi"},{"type":"math","value":"T_{\\text{exploit}} (\\mu^\\star - \\mu^{\\hat k}) \\qquad \\text{where} \\qquad T_{\\text{exploit}} := T - N_{\\text{explore}}K.","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"html":"Texploit(μμk^)whereTexploit:=TNexploreK.T_{\\text{exploit}} (\\mu^\\star - \\mu^{\\hat k}) \\qquad \\text{where} \\qquad T_{\\text{exploit}} := T - N_{\\text{explore}}K.Texploit(μμk^)whereTexploit:=TNexploreK.","enumerator":"3.6","key":"amhPAdDqm8"},{"type":"paragraph","position":{"start":{"line":352,"column":1},"end":{"line":353,"column":1}},"children":[{"type":"text","value":"So we’d like to bound ","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"cgp6fIzjys"},{"type":"inlineMath","value":"\\mu^\\star - \\mu^{\\hat k} = o(1)","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"μμk^=o(1)\\mu^\\star - \\mu^{\\hat k} = o(1)μμk^=o(1)","key":"DDr4knOoEj"},{"type":"text","value":" (as a function\nof ","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"IJcUu44F24"},{"type":"inlineMath","value":"T","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"TTT","key":"xnP4E0jNNn"},{"type":"text","value":") in order to achieve sublinear regret. How can we do this?","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"iqkma6LWWI"}],"key":"G3MbUQsloP"},{"type":"paragraph","position":{"start":{"line":355,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"text","value":"Let’s define ","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"E1feLee41s"},{"type":"inlineMath","value":"\\Delta^k = \\hat \\mu^k - \\mu^k","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"html":"Δk=μ^kμk\\Delta^k = \\hat \\mu^k - \\mu^kΔk=μ^kμk","key":"oM40LRqjvp"},{"type":"text","value":" to denote how far the mean\nestimate for arm ","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"JrwMilAh0g"},{"type":"inlineMath","value":"k","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"html":"kkk","key":"o1zzD9wcAO"},{"type":"text","value":" is from the true mean. How can we bound this\nquantity? We’ll use the following useful inequality for i.i.d. bounded\nrandom variables:","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"kbb1Me0DJO"}],"key":"qazLKi4J5N"},{"type":"proof","kind":"theorem","label":"hoeffding","identifier":"hoeffding","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Hoeffding’s inequality","position":{"start":{"line":360,"column":1},"end":{"line":360,"column":1}},"key":"yit43DTlfI"}],"key":"ZzKE00PoyQ"},{"type":"paragraph","position":{"start":{"line":363,"column":1},"end":{"line":365,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"vfOYa7T0dC"},{"type":"inlineMath","value":"X_0, \\dots, X_{n-1}","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"html":"X0,,Xn1X_0, \\dots, X_{n-1}X0,,Xn1","key":"MYcK2ET01Q"},{"type":"text","value":" be i.i.d. 
random variables with\n","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"DMdu1YxarX"},{"type":"inlineMath","value":"X_i \\in [0, 1]","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"html":"Xi[0,1]X_i \\in [0, 1]Xi[0,1]","key":"yN6fk1jkzi"},{"type":"text","value":" almost surely for each ","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"JLYpYDTW5A"},{"type":"inlineMath","value":"i \\in [n]","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"html":"i[n]i \\in [n]i[n]","key":"ujVweC19Ak"},{"type":"text","value":". Then for any\n","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"lvryaBbxIX"},{"type":"inlineMath","value":"\\delta > 0","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"html":"δ>0\\delta > 0δ>0","key":"u2ydmyyAVq"},{"type":"text","value":",","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"fLzL2TdkXc"}],"key":"smgWuuDSTX"},{"type":"math","value":"\\pr\\left( \\left| \\frac{1}{n} \\sum_{i=1}^n (X_i - \\E[X_i]) \\right| > \\sqrt{\\frac{\\ln(2/\\delta)}{2n}} \\right) \\le \\delta.","position":{"start":{"line":367,"column":1},"end":{"line":367,"column":1}},"html":"P(1ni=1n(XiE[Xi])>ln(2/δ)2n)δ.\\pr\\left( \\left| \\frac{1}{n} \\sum_{i=1}^n (X_i - \\E[X_i]) \\right| > \\sqrt{\\frac{\\ln(2/\\delta)}{2n}} \\right) \\le \\delta.P(n1i=1n(XiE[Xi])>2nln(2/δ))δ.","enumerator":"3.7","key":"FCqRE9ZbYY"}],"enumerator":"3.1","html_id":"hoeffding","key":"p3IjXeRwV4"},{"type":"paragraph","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"children":[{"type":"text","value":"The proof of this inequality is beyond the scope of this book. 
See ","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"QuTvl3BswK"},{"type":"cite","kind":"narrative","label":"vershynin_high-dimensional_2018","identifier":"vershynin_high-dimensional_2018","children":[{"type":"text","value":"Vershynin (2018)","key":"vdj7U5hhI1"}],"enumerator":"1","key":"dkgxW0bFzQ"},{"type":"text","value":" Chapter 2.2.","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"eJATaLXBlq"}],"key":"z4AdNZrHgP"},{"type":"paragraph","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"children":[{"type":"text","value":"We can apply this directly to the rewards for a given arm ","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"key":"joYQ8HIuDs"},{"type":"inlineMath","value":"k","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"html":"kkk","key":"wWhtwR1cCD"},{"type":"text","value":", since the rewards from that arm are i.i.d.:","position":{"start":{"line":372,"column":1},"end":{"line":372,"column":1}},"key":"h60JzMBuPU"}],"key":"sLqkQ2dFIk"},{"type":"math","value":"\\pr\\left(|\\Delta^k | > \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) \\le \\delta.","label":"hoeffding-etc","identifier":"hoeffding-etc","html":"P(Δk>ln(2/δ)2Nexplore)δ.\\pr\\left(|\\Delta^k | > \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) \\le \\delta.P(Δk>2Nexploreln(2/δ))δ.","enumerator":"3.8","html_id":"hoeffding-etc","key":"dMNHo17gyM"},{"type":"paragraph","position":{"start":{"line":380,"column":1},"end":{"line":384,"column":1}},"children":[{"type":"text","value":"But note that we can’t apply this to arm ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"SKgjrhu0a8"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"k^\\hat kk^","key":"wtShpMfwlw"},{"type":"text","value":" directly since\n","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"zIIvpBV3ok"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"k^\\hat kk^","key":"e3NuWmXdqi"},{"type":"text","value":" is itself a random variable. Instead, we need to “uniform-ize”\nthis bound across ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"sp4ngnSVCQ"},{"type":"emphasis","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"text","value":"all","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"FC8TBdlPZ7"}],"key":"Vj5V3wTfnJ"},{"type":"text","value":" the arms, i.e. 
bound the error across all the\narms simultaneously, so that the resulting bound will apply ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"dgG87TNVRG"},{"type":"emphasis","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"text","value":"no matter\nwhat","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"vjqWFfilXd"}],"key":"hZ6ELbJY71"},{"type":"text","value":" ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"sPjngZatvb"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"html":"k^\\hat kk^","key":"N1fHHCvMv1"},{"type":"text","value":" “crystallizes” to.","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"SkDgQ5lLKn"}],"key":"sYH8Upukf1"},{"type":"paragraph","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"aJhYWspVWS"},{"type":"strong","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"children":[{"type":"text","value":"union bound","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"chuDHXj3FL"}],"key":"zqaCooqJgQ"},{"type":"text","value":" provides a simple way to do this:","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"jzl0rp1iT7"}],"key":"ePKdZvnyUx"},{"type":"proof","kind":"theorem","label":"union_bound","identifier":"union_bound","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Union bound","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"SwUl6kZfyU"}],"key":"ag4UedDhLo"},{"type":"paragraph","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"children":[{"type":"text","value":"Consider a set of events ","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"hQ9q98LFp8"},{"type":"inlineMath","value":"A_0, \\dots, A_{n-1}","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"html":"A0,,An1A_0, \\dots, A_{n-1}A0,,An1","key":"ErEOMHYoha"},{"type":"text","value":". Then","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"a4L5lNyhyy"}],"key":"k0WgHSjn4l"},{"type":"math","value":"\\pr(\\exists i \\in [n]. A_i) \\le \\sum_{i=0}^{n-1} \\pr(A_i).","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"html":"P(i[n].Ai)i=0n1P(Ai).\\pr(\\exists i \\in [n]. 
A_i) \\le \\sum_{i=0}^{n-1} \\pr(A_i).P(i[n].Ai)i=0n1P(Ai).","enumerator":"3.9","key":"F0UJlNlgxM"},{"type":"paragraph","position":{"start":{"line":395,"column":1},"end":{"line":396,"column":1}},"children":[{"type":"text","value":"In\nparticular, if ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"MNd1vcf3gP"},{"type":"inlineMath","value":"\\pr(A_i) \\ge 1 - \\delta","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"P(Ai)1δ\\pr(A_i) \\ge 1 - \\deltaP(Ai)1δ","key":"qiaBNeSl0h"},{"type":"text","value":" for each ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"I7oJ1809IQ"},{"type":"inlineMath","value":"i \\in [n]","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"i[n]i \\in [n]i[n]","key":"IexE3NIO1J"},{"type":"text","value":", we have","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"cerYAxAcrP"}],"key":"JuBHdF8FFX"},{"type":"math","value":"\\pr(\\forall i \\in [n]. A_i) \\ge 1 - n \\delta.","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"P(i[n].Ai)1nδ.\\pr(\\forall i \\in [n]. A_i) \\ge 1 - n \\delta.P(i[n].Ai)1nδ.","enumerator":"3.10","key":"qErxEZinsH"}],"enumerator":"3.2","html_id":"union-bound","key":"DUWt6QQl6q"},{"type":"paragraph","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"children":[{"type":"strong","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"key":"nIGYZndMn7"}],"key":"nCxynkgOQV"},{"type":"text","value":" Prove the second statement above.","position":{"start":{"line":401,"column":1},"end":{"line":401,"column":1}},"key":"XOtIG6FTk9"}],"key":"BQBJatw9vo"},{"type":"paragraph","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"Applying the union bound across the arms for the l.h.s. 
event of ","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"bfBUHbx3G8"},{"type":"crossReference","kind":"equation","identifier":"hoeffding-etc","label":"hoeffding-etc","children":[{"type":"text","value":"(","key":"pP1fQUBSks"},{"type":"text","value":"3.8","key":"ygWUjwp4wB"},{"type":"text","value":")","key":"bIOfeEphbr"}],"template":"(%s)","enumerator":"3.8","resolved":true,"html_id":"hoeffding-etc","key":"yUUtvIGoPZ"},{"type":"text","value":", we have","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"adta7OTWGe"}],"key":"M1oNMVXYRz"},{"type":"math","value":"\\begin{aligned}\n \\pr\\left( \\forall k \\in [K], |\\Delta^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) &\\ge 1-K\\delta\n\\end{aligned}","position":{"start":{"line":405,"column":1},"end":{"line":409,"column":1}},"html":"P(k[K],Δkln(2/δ)2Nexplore)1Kδ\\begin{aligned}\n \\pr\\left( \\forall k \\in [K], |\\Delta^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) &\\ge 1-K\\delta\n\\end{aligned}P(k[K],Δk2Nexploreln(2/δ))1","enumerator":"3.11","key":"Lq6FKehwcj"},{"type":"paragraph","position":{"start":{"line":411,"column":1},"end":{"line":412,"column":1}},"children":[{"type":"text","value":"Then to apply this bound to ","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"ZT4tyuyjbD"},{"type":"inlineMath","value":"\\hat k","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"html":"k^\\hat kk^","key":"sViXGjSZtc"},{"type":"text","value":" in particular, we\ncan apply the useful trick of “adding zero”:","position":{"start":{"line":411,"column":1},"end":{"line":411,"column":1}},"key":"pRfY8moVZn"}],"key":"g2JOXbvxhW"},{"type":"math","value":"\\begin{aligned}\n \\mu^{k^\\star} - \\mu^{\\hat k} &= \\mu^{k^\\star} - \\mu^{\\hat k} + (\\hat \\mu^{k^\\star} - \\hat \\mu^{k^\\star}) + (\\hat \\mu^{\\hat k} - \\hat \\mu^{\\hat k}) \\\\\n &= \\Delta^{\\hat k} - \\Delta^{k^*} + \\underbrace{(\\hat \\mu^{k^\\star} - \\hat \\mu^{\\hat k})}_{\\le 0 \\text{ by definition of } \\hat k} \\\\\n &\\le 2 \\sqrt{\\frac{\\ln(2K/\\delta')}{2N_{\\text{explore}}}} \\text{ with probability at least } 1-\\delta'\n\\end{aligned}","position":{"start":{"line":414,"column":1},"end":{"line":420,"column":1}},"html":"μkμk^=μkμk^+(μ^kμ^k)+(μ^k^μ^k^)=Δk^Δk+(μ^kμ^k^)0 by definition of k^2ln(2K/δ)2Nexplore with probability at least 1δ\\begin{aligned}\n \\mu^{k^\\star} - \\mu^{\\hat k} &= \\mu^{k^\\star} - \\mu^{\\hat k} + (\\hat \\mu^{k^\\star} - \\hat \\mu^{k^\\star}) + (\\hat \\mu^{\\hat k} - \\hat \\mu^{\\hat k}) \\\\\n &= \\Delta^{\\hat k} - \\Delta^{k^*} + \\underbrace{(\\hat \\mu^{k^\\star} - \\hat \\mu^{\\hat k})}_{\\le 0 \\text{ by definition of } \\hat k} \\\\\n &\\le 2 \\sqrt{\\frac{\\ln(2K/\\delta')}{2N_{\\text{explore}}}} \\text{ with probability at least } 1-\\delta'\n\\end{aligned}μkμk^=μkμk^+(μ^kμ^k)+(μ^k^μ^k^)=Δk^Δk+0 by definition of k^(μ^kμ^k^)22Nexploreln(2K/δ) with probability at least 1δ","enumerator":"3.12","key":"iyylJcQTf4"},{"type":"paragraph","position":{"start":{"line":422,"column":1},"end":{"line":423,"column":1}},"children":[{"type":"text","value":"where we’ve set ","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"CvD5gyQatj"},{"type":"inlineMath","value":"\\delta' = K\\delta","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"html":"δ=Kδ\\delta' = K\\deltaδ=","key":"UWaS0r443W"},{"type":"text","value":". 
Putting this all\ntogether, we’ve shown that, with probability ","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"CUjjRk6O0U"},{"type":"inlineMath","value":"1 - \\delta'","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"html":"1δ1 - \\delta'1δ","key":"Z1UzziH3wh"},{"type":"text","value":",","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"v8GIDKtUqX"}],"key":"Stpc7cH4PD"},{"type":"math","value":"\\text{Regret}_T \\le N_{\\text{explore}}K + T_{\\text{exploit}} \\cdot \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}}}.","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"html":"RegretTNexploreK+Texploit2ln(2K/δ)Nexplore.\\text{Regret}_T \\le N_{\\text{explore}}K + T_{\\text{exploit}} \\cdot \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}}}.RegretTNexploreK+TexploitNexplore2ln(2K/δ).","enumerator":"3.13","key":"n6LlsmOOM9"},{"type":"paragraph","position":{"start":{"line":427,"column":1},"end":{"line":430,"column":1}},"children":[{"type":"text","value":"Note that it suffices for ","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"FsqJTML7Ey"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"q2Cyt1JYWK"},{"type":"text","value":" to be on the order of\n","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"EwI0AZRx14"},{"type":"inlineMath","value":"\\sqrt{T}","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"html":"T\\sqrt{T}T","key":"ZKmzO5ZqNX"},{"type":"text","value":" to achieve sublinear regret. In particular, we can find the\noptimal ","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"zqGIblxBDq"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"uvWls541kl"},{"type":"text","value":" by setting the derivative of the r.h.s. 
to\nzero:","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"fwzmvTf0rv"}],"key":"dia1lVjuvI"},{"type":"math","value":"\\begin{aligned}\n 0 &= K - T_{\\text{exploit}} \\cdot \\frac{1}{2} \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}^3}} \\\\\n N_{\\text{explore}}&= \\left( T_{\\text{exploit}} \\cdot \\frac{\\sqrt{\\ln(2K/\\delta')/2}}{K} \\right)^{2/3}\n\\end{aligned}","position":{"start":{"line":432,"column":1},"end":{"line":437,"column":1}},"html":"0=KTexploit122ln(2K/δ)Nexplore3Nexplore=(Texploitln(2K/δ)/2K)2/3\\begin{aligned}\n 0 &= K - T_{\\text{exploit}} \\cdot \\frac{1}{2} \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}^3}} \\\\\n N_{\\text{explore}}&= \\left( T_{\\text{exploit}} \\cdot \\frac{\\sqrt{\\ln(2K/\\delta')/2}}{K} \\right)^{2/3}\n\\end{aligned}0Nexplore=KTexploit21Nexplore32ln(2K/δ)=(TexploitKln(2K/δ)/2)2/3","enumerator":"3.14","key":"Y0RMUgwqd3"},{"type":"paragraph","position":{"start":{"line":439,"column":1},"end":{"line":440,"column":1}},"children":[{"type":"text","value":"Plugging this into the expression for the regret, we\nhave (still with probability ","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"key":"dw1fw8aqGw"},{"type":"inlineMath","value":"1-\\delta'","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"html":"1δ1-\\delta'1δ","key":"sreEfEVCGs"},{"type":"text","value":")","position":{"start":{"line":439,"column":1},"end":{"line":439,"column":1}},"key":"sP1P9xQ3Fv"}],"key":"E0kJVDtMLh"},{"type":"math","value":"\\begin{aligned}\n \\text{Regret}_T &\\le 3 T^{2/3} \\sqrt[3]{K \\ln(2K/\\delta') / 2} \\\\\n &= \\tilde{O}(T^{2/3} K^{1/3}).\n\\end{aligned}","position":{"start":{"line":442,"column":1},"end":{"line":447,"column":1}},"html":"RegretT3T2/3Kln(2K/δ)/23=O~(T2/3K1/3).\\begin{aligned}\n \\text{Regret}_T &\\le 3 T^{2/3} \\sqrt[3]{K \\ln(2K/\\delta') / 2} \\\\\n &= \\tilde{O}(T^{2/3} K^{1/3}).\n\\end{aligned}RegretT3T2/33Kln(2K/δ)/2=O~(T2/3K1/3).","enumerator":"3.15","key":"hbAFOSaF6H"},{"type":"paragraph","position":{"start":{"line":449,"column":1},"end":{"line":452,"column":1}},"children":[{"type":"text","value":"The ETC algorithm is rather “abrupt” in that it switches from\nexploration to exploitation after a fixed number of timesteps. 
In\npractice, it’s often better to use a more gradual transition, which\nbrings us to the ","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"key":"St02qtICwC"},{"type":"emphasis","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"children":[{"type":"text","value":"epsilon-greedy","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"key":"RsZZddAa3p"}],"key":"SxloZdBqcr"},{"type":"text","value":" algorithm.","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"key":"nT4iChfD6T"}],"key":"ssMr9hDEsS"}],"key":"eSPQ5wMKcz"},{"type":"block","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":456,"column":1},"end":{"line":456,"column":1}},"children":[{"type":"text","value":"Epsilon-greedy","position":{"start":{"line":456,"column":1},"end":{"line":456,"column":1}},"key":"XnZXQNQUyn"}],"identifier":"epsilon-greedy","label":"Epsilon-greedy","html_id":"epsilon-greedy","implicit":true,"enumerator":"3.5","key":"OlSbQiKQqW"},{"type":"paragraph","position":{"start":{"line":458,"column":1},"end":{"line":462,"column":1}},"children":[{"type":"text","value":"Instead of doing all of the exploration and then all of the exploitation\nseparately – which additionally requires knowing the time horizon\nbeforehand – we can instead interleave exploration and exploitation by,\nat each timestep, choosing a random action with some probability. We\ncall this the ","position":{"start":{"line":458,"column":1},"end":{"line":458,"column":1}},"key":"FMiO1mPCDJ"},{"type":"strong","position":{"start":{"line":458,"column":1},"end":{"line":458,"column":1}},"children":[{"type":"text","value":"epsilon-greedy","position":{"start":{"line":458,"column":1},"end":{"line":458,"column":1}},"key":"YVETlzKw7w"}],"key":"bqn40oU8Ji"},{"type":"text","value":" algorithm.","position":{"start":{"line":458,"column":1},"end":{"line":458,"column":1}},"key":"EyWJPZsqZF"}],"key":"gUNjKzpTxn"}],"key":"easw0BsViB"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class EpsilonGreedy(Agent):\n def __init__(\n self,\n K: int,\n T: int,\n ε_array: Float[Array, \" T\"],\n ):\n super().__init__(K, T)\n self.ε_array = ε_array\n\n def choose_arm(self):\n return solutions.epsilon_greedy_choose_arm(self)","key":"KWqcDU9aee"},{"type":"output","id":"NJ5HeygIS4OMF_MHlYIHr","data":[],"key":"JPoydqoZ92"}],"data":{},"key":"ajbgNlaqGh"},{"type":"block","children":[],"key":"zhERbOM8Qd"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = EpsilonGreedy(mab.K, mab.T, np.full(mab.T, 0.1))\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"Xzkq1LlFCJ"},{"type":"output","id":"KoEPi-cpIsC0ABBkxxayu","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"6ad1018e4c18668300eb6bbe80bdc84f","path":"/build/6ad1018e4c18668300eb6bbe80bdc84f.png"}}}],"key":"krQ3fFTNQz"}],"data":{},"key":"e8g0zE0UkF"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"Note that we let ","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"OPXQ3IFK08"},{"type":"text","value":"ε","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"TbqXOyVlBV"},{"type":"text","value":" vary over time. In particular, we might want to gradually ","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"z19bG0mSjI"},{"type":"emphasis","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"decrease","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"WhWImvVHmF"}],"key":"NiuFFMaE08"},{"type":"text","value":" ","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"xLPhKhPnZ1"},{"type":"text","value":"ε","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"TH3oz44m4a"},{"type":"text","value":" as we learn more about the reward distributions and no longer need to spend time exploring.","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"key":"zbqSDKKaDq"}],"key":"yGgrbBZAlv"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"RT5IIIW7DS"}],"key":"jXhxU3KSxO"},{"type":"paragraph","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"children":[{"type":"text","value":"What is the expected regret of the algorithm if we set ","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"wkFIlirqPG"},{"type":"text","value":"ε","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"Gh82DBHNB3"},{"type":"text","value":" to be a constant?","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"m7f2HuEz0V"}],"key":"qWzsMBemFN"}],"key":"HmZKWnwoim"},{"type":"paragraph","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"children":[{"type":"text","value":"It turns out that setting ","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"key":"dgY9a27Ic5"},{"type":"inlineMath","value":"\\epsilon_t = \\sqrt[3]{K \\ln(t)/t}","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"html":"ϵt=Kln(t)/t3\\epsilon_t = \\sqrt[3]{K \\ln(t)/t}ϵt=3Kln(t)/t","key":"dkTegtYmOy"},{"type":"text","value":" also achieves a regret of ","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"key":"tzpwOt9arA"},{"type":"inlineMath","value":"\\tilde O(t^{2/3} K^{1/3})","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"html":"O~(t2/3K1/3)\\tilde O(t^{2/3} K^{1/3})O~(t2/3K1/3)","key":"Jo5qVnDFM2"},{"type":"text","value":" (ignoring the logarithmic factors). (We will not prove this here.) 
TODO ADD PROOF CITATION","position":{"start":{"line":491,"column":1},"end":{"line":491,"column":1}},"key":"zi93tbFHka"}],"key":"V7vEsKL00g"},{"type":"paragraph","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"In ETC, we had to set ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"WoUGGpGfW4"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"CWKEncEKfk"},{"type":"text","value":" based on the total number of timesteps ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"MSu1k7Vgog"},{"type":"inlineMath","value":"T","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"TTT","key":"QjlBLGWzQh"},{"type":"text","value":". But the epsilon-greedy algorithm actually handles the exploration ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"rMKmZZkh7z"},{"type":"emphasis","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"automatically","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"I2BN9uErGS"}],"key":"M6djl0uJiF"},{"type":"text","value":": the regret rate holds for ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"m3u3tcA0Ws"},{"type":"emphasis","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"wlKoFM9IG6"}],"key":"TiSh7aSmMN"},{"type":"text","value":" ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"BBmKAMypbA"},{"type":"inlineMath","value":"t","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"ttt","key":"FVpXFdFTml"},{"type":"text","value":", and doesn’t depend on the final horizon ","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"kIlXfhH7Ll"},{"type":"inlineMath","value":"T","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"html":"TTT","key":"j3j2bvYCKA"},{"type":"text","value":".","position":{"start":{"line":493,"column":1},"end":{"line":493,"column":1}},"key":"lwwPqDupCM"}],"key":"PVurjCwIuu"},{"type":"paragraph","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"children":[{"type":"text","value":"But the way these algorithms explore is rather naive: we’ve been exploring ","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"qhf05VOsYI"},{"type":"emphasis","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"children":[{"type":"text","value":"uniformly","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"zksUQsmNe6"}],"key":"zbpn9h3THl"},{"type":"text","value":" across all the arms. 
But what if we could be smarter about it, and explore ","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"KDeJX6j5pa"},{"type":"emphasis","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"children":[{"type":"text","value":"more","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"c02Xg4HM4S"}],"key":"hGIMjWobzy"},{"type":"text","value":" for arms that we’re less certain about?","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"wTiyzqOZYN"}],"key":"a5DHFOlaGc"}],"key":"fm4b4VIcRl"},{"type":"block","position":{"start":{"line":497,"column":1},"end":{"line":497,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"children":[{"type":"text","value":"Upper Confidence Bound (UCB)","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"ipPpojY2oM"}],"label":"ucb","identifier":"ucb","html_id":"ucb","enumerator":"3.6","key":"a5fq6f4jKa"},{"type":"paragraph","position":{"start":{"line":502,"column":1},"end":{"line":506,"column":1}},"children":[{"type":"text","value":"To quantify how ","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"vRgowWO2fS"},{"type":"emphasis","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"certain","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"xaZ46agOkn"}],"key":"ZUopOqbS9l"},{"type":"text","value":" we are about the mean of each arm, we’ll\ncompute ","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"lsFh6Vw3si"},{"type":"emphasis","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"confidence intervals","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"nDq7700A2T"}],"key":"oFpgxwIAA7"},{"type":"text","value":" for our estimators, and then choose the\narm with the highest ","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"HfSc4HEdDl"},{"type":"emphasis","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"upper confidence bound","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"Kqy2j6pmnY"}],"key":"c8gaHAk4uO"},{"type":"text","value":". This operates on the\nprinciple of ","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"tHxoxf9lzd"},{"type":"strong","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"the benefit of the doubt (i.e. 
optimism in the face of\nuncertainty)","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"Nzc4sVCgpP"}],"key":"IDf8GD8yrF"},{"type":"text","value":": we’ll choose the arm that we’re most optimistic about.","position":{"start":{"line":502,"column":1},"end":{"line":502,"column":1}},"key":"IEnlk2IotA"}],"key":"Hnxo0BzuWx"},{"type":"paragraph","position":{"start":{"line":508,"column":1},"end":{"line":511,"column":1}},"children":[{"type":"text","value":"In particular, for each arm ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"y0I3xiRdUY"},{"type":"inlineMath","value":"k","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"kkk","key":"JP7uLbYnNN"},{"type":"text","value":" at time ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"QOPQgi9gk1"},{"type":"inlineMath","value":"t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"ttt","key":"C1Zk7UiypF"},{"type":"text","value":", we’d like to compute some\nupper confidence bound ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"AWmi52ZTtE"},{"type":"inlineMath","value":"M^k_t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"MtkM^k_tMtk","key":"UM180FQEMM"},{"type":"text","value":" such that ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"Q9BBVsX1dH"},{"type":"inlineMath","value":"\\hat \\mu^k_t \\le M^k_t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"μ^tkMtk\\hat \\mu^k_t \\le M^k_tμ^tkMtk","key":"E6hbmNRaVA"},{"type":"text","value":" with\nhigh probability, and then choose ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"fFLziyd1fM"},{"type":"inlineMath","value":"a_t := \\arg \\max_{k \\in [K]} M^k_t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"at:=argmaxk[K]Mtka_t := \\arg \\max_{k \\in [K]} M^k_tat:=argmaxk[K]Mtk","key":"vfny0r3djh"},{"type":"text","value":".\nBut how should we compute ","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"zm4AX63kgB"},{"type":"inlineMath","value":"M^k_t","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"html":"MtkM^k_tMtk","key":"hjOjZcLMcT"},{"type":"text","value":"?","position":{"start":{"line":508,"column":1},"end":{"line":508,"column":1}},"key":"MDYRSTWyWL"}],"key":"ZkMPmuNULz"},{"type":"paragraph","position":{"start":{"line":513,"column":1},"end":{"line":519,"column":1}},"children":[{"type":"text","value":"In ","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"mZD1vkCZno"},{"type":"crossReference","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"children":[{"type":"text","value":"Section ","key":"bMZq44czEm"},{"type":"text","value":"3.4.1","key":"Z7w8kQyMNa"}],"identifier":"etc-regret-analysis","label":"etc-regret-analysis","kind":"heading","template":"Section %s","enumerator":"3.4.1","resolved":true,"html_id":"etc-regret-analysis","key":"DNnmiiGS6L"},{"type":"text","value":", we were able to compute this bound\nusing Hoeffding’s inequality, which assumes that the number of samples\nis 
","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"Pk7cSIZcqk"},{"type":"emphasis","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"children":[{"type":"text","value":"fixed","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"s2d0Mlp9UA"}],"key":"NgMErvdb8t"},{"type":"text","value":". This was the case in ETC (where we pull each arm\n","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"t0a0ewfdgM"},{"type":"inlineMath","value":"N_{\\text{explore}}","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"html":"NexploreN_{\\text{explore}}Nexplore","key":"DDE9Au3aaB"},{"type":"text","value":" times), but in UCB, the number of times we pull\neach arm depends on the agent’s actions, which in turn depend on the\nrandom rewards and are therefore stochastic. So we ","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"TlsjNefzwo"},{"type":"emphasis","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"children":[{"type":"text","value":"can’t","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"yDbQ79byOf"}],"key":"zFQwQ6Gh3X"},{"type":"text","value":" use\nHoeffding’s inequality directly.","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"RowLM8uaH3"}],"key":"ZXeQCfb8YG"},{"type":"paragraph","position":{"start":{"line":521,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"text","value":"Instead, we’ll apply the same trick we used in the ETC analysis: we’ll\nuse the ","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"N4XdZZS9Ex"},{"type":"strong","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"union bound","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"zVl8YOvdT7"}],"key":"tpKw7lv9W9"},{"type":"text","value":" to compute a ","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"khQHpqrYEp"},{"type":"emphasis","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"looser","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"pnBC1y9zJM"}],"key":"QYYSQZWqT8"},{"type":"text","value":" bound that holds\n","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"eowSgYg0du"},{"type":"emphasis","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"text","value":"uniformly","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"Nr2sGEGixL"}],"key":"MsWnHwdHJh"},{"type":"text","value":" across all timesteps and arms. 
Let’s introduce some notation\nto discuss this.","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"jHJtTTKmvp"}],"key":"OwijPWIwBy"},{"type":"paragraph","position":{"start":{"line":526,"column":1},"end":{"line":528,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"yfQJ0bkFmQ"},{"type":"inlineMath","value":"N^k_t","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"NtkN^k_tNtk","key":"Q1VnssuY0V"},{"type":"text","value":" denote the (random) number of times arm ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"JK0yguUann"},{"type":"inlineMath","value":"k","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"kkk","key":"BrEdKjIwgw"},{"type":"text","value":" has been pulled\nwithin the first ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"fV78AhRDj7"},{"type":"inlineMath","value":"t","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"ttt","key":"nhpGsrFCwh"},{"type":"text","value":" timesteps, and ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"TeTnY3CRC8"},{"type":"inlineMath","value":"\\hat \\mu^k_t","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"μ^tk\\hat \\mu^k_tμ^tk","key":"Nvl3Z0k0oY"},{"type":"text","value":" denote the sample\naverage of those pulls. That is,","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"y5SsMnMtT4"}],"key":"Lbrpvvhoto"},{"type":"math","value":"\\begin{aligned}\n N^k_t &:= \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} \\\\\n \\hat \\mu^k_t &:= \\frac{1}{N^k_t} \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} r_\\tau.\n\\end{aligned}","position":{"start":{"line":530,"column":1},"end":{"line":535,"column":1}},"html":"Ntk:=τ=0t11{aτ=k}μ^tk:=1Ntkτ=0t11{aτ=k}rτ.\\begin{aligned}\n N^k_t &:= \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} \\\\\n \\hat \\mu^k_t &:= \\frac{1}{N^k_t} \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} r_\\tau.\n\\end{aligned}Ntkμ^tk:=τ=0t11{aτ=k}:=Ntk1τ=0t11{aτ=k}rτ.","enumerator":"3.16","key":"gCIn7R8Amn"},{"type":"paragraph","position":{"start":{"line":537,"column":1},"end":{"line":543,"column":1}},"children":[{"type":"text","value":"To achieve the “fixed sample size” assumption, we’ll\nneed to shift our index from ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"bfSjczCwsJ"},{"type":"emphasis","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"children":[{"type":"text","value":"time","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"oxHRXW0A4Y"}],"key":"s5VC0cAmQO"},{"type":"text","value":" to ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"DOdrYcbHXF"},{"type":"emphasis","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"children":[{"type":"text","value":"number of samples from each\narm","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"Sxt4KPBgIL"}],"key":"JrNQNsIcvn"},{"type":"text","value":". 
In particular, we’ll define ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"joFbTCZhgt"},{"type":"inlineMath","value":"\\tilde r^k_n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"r~nk\\tilde r^k_nr~nk","key":"fWFHDLvw9C"},{"type":"text","value":" to be the ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"TpsYUo0UBQ"},{"type":"inlineMath","value":"n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"nnn","key":"MjinYTZkvO"},{"type":"text","value":"th sample\nfrom arm ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"rhioP7eZ4i"},{"type":"inlineMath","value":"k","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"kkk","key":"gYB6WOZYPn"},{"type":"text","value":", and ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"Ba7X3v1X3J"},{"type":"inlineMath","value":"\\tilde \\mu^k_n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"μ~nk\\tilde \\mu^k_nμ~nk","key":"Kxsm6chU3i"},{"type":"text","value":" to be the sample average of the first\n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"YdUqIy8Xuz"},{"type":"inlineMath","value":"n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"nnn","key":"H9kzWSqqSg"},{"type":"text","value":" samples from arm ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"hz0ADoESuo"},{"type":"inlineMath","value":"k","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"kkk","key":"U6qrbC3Ysh"},{"type":"text","value":". Then, for a fixed ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"o7p8N0FAwQ"},{"type":"inlineMath","value":"n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"nnn","key":"lt8tbof0go"},{"type":"text","value":", this satisfies the\n“fixed sample size” assumption, and we can apply Hoeffding’s inequality\nto get a bound on ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"fJJ7sOGrV2"},{"type":"inlineMath","value":"\\tilde \\mu^k_n","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"μ~nk\\tilde \\mu^k_nμ~nk","key":"TjyDmdftsS"},{"type":"text","value":".","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"NIpIHfjeI9"}],"key":"m9nvtLbBQE"},{"type":"paragraph","position":{"start":{"line":545,"column":1},"end":{"line":549,"column":1}},"children":[{"type":"text","value":"So how can we extend our bound on ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"UGJPX3kPSu"},{"type":"inlineMath","value":"\\tilde\\mu^k_n","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"μ~nk\\tilde\\mu^k_nμ~nk","key":"BlvMgAlVTt"},{"type":"text","value":" to ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"Iosriuj7WT"},{"type":"inlineMath","value":"\\hat \\mu^k_t","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"μ^tk\\hat \\mu^k_tμ^tk","key":"wfyH9dGPgw"},{"type":"text","value":"?\nWell, we know ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"EOCy50MbX7"},{"type":"inlineMath","value":"N^k_t \\le 
t","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"NtktN^k_t \\le tNtkt","key":"sZGK5bQs54"},{"type":"text","value":" (where equality would be the case if and\nonly if we had pulled arm ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"jaJJXj6jBo"},{"type":"inlineMath","value":"k","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"kkk","key":"vyzUjTD9SA"},{"type":"text","value":" every time). So we can apply the same\ntrick as last time, where we uniform-ize across all possible values of\n","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"C8D7pG5BUD"},{"type":"inlineMath","value":"N^k_t","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"html":"NtkN^k_tNtk","key":"V6hP4Gle2H"},{"type":"text","value":":","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"C8Dx8yuNa6"}],"key":"kwRRELRXYB"},{"type":"math","value":"\\begin{aligned}\n \\pr\\left( \\forall n \\le t, |\\tilde \\mu^k_n - \\mu^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2n}} \\right) &\\ge 1-t\\delta.\n\\end{aligned}","position":{"start":{"line":551,"column":1},"end":{"line":555,"column":1}},"html":"P(nt,μ~nkμkln(2/δ)2n)1tδ.\\begin{aligned}\n \\pr\\left( \\forall n \\le t, |\\tilde \\mu^k_n - \\mu^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2n}} \\right) &\\ge 1-t\\delta.\n\\end{aligned}P(nt,μ~nkμk2nln(2/δ))1tδ.","enumerator":"3.17","key":"u1Y80qP4sP"},{"type":"paragraph","position":{"start":{"line":557,"column":1},"end":{"line":557,"column":1}},"children":[{"type":"text","value":"In particular, since ","position":{"start":{"line":557,"column":1},"end":{"line":557,"column":1}},"key":"wpM8s8kgtH"},{"type":"inlineMath","value":"N^k_t \\le t","position":{"start":{"line":557,"column":1},"end":{"line":557,"column":1}},"html":"NtktN^k_t \\le tNtkt","key":"Sdb8uDcPER"},{"type":"text","value":", and ","position":{"start":{"line":557,"column":1},"end":{"line":557,"column":1}},"key":"n78ywO19p9"},{"type":"inlineMath","value":"\\tilde \\mu^k_{N^k_t} = \\hat \\mu^k_t","position":{"start":{"line":557,"column":1},"end":{"line":557,"column":1}},"html":"μ~Ntkk=μ^tk\\tilde \\mu^k_{N^k_t} = \\hat \\mu^k_tμ~Ntkk=μ^tk","key":"pAeVIzOrki"},{"type":"text","value":" by definition, we have","position":{"start":{"line":557,"column":1},"end":{"line":557,"column":1}},"key":"OaeK4BHbUV"}],"key":"C1Wg38VqEA"},{"type":"math","value":"\\begin{aligned}\n \\pr\\left( |\\hat \\mu^k_t - \\mu^k | \\le \\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}} \\right) &\\ge 1-\\delta' \\text{ where } \\delta' := t \\delta.\n\\end{aligned}","position":{"start":{"line":559,"column":1},"end":{"line":563,"column":1}},"html":"P(μ^tkμkln(2t/δ)2Ntk)1δ where δ:=tδ.\\begin{aligned}\n \\pr\\left( |\\hat \\mu^k_t - \\mu^k | \\le \\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}} \\right) &\\ge 1-\\delta' \\text{ where } \\delta' := t \\delta.\n\\end{aligned}P(μ^tkμk2Ntkln(2t/δ))1δ where δ:=tδ.","enumerator":"3.18","key":"GC6Gx5OyZf"},{"type":"paragraph","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"children":[{"type":"text","value":"This bound would then suffice for applying the UCB algorithm! 
That is, the upper confidence bound for arm ","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"key":"H6HX16LjkX"},{"type":"inlineMath","value":"k","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"html":"kkk","key":"Ld62I9x51t"},{"type":"text","value":" would be","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"key":"fAQtWiYaiu"}],"key":"d7t1si1IK3"},{"type":"math","value":"M^k_t := \\hat \\mu^k_t + \\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}},","position":{"start":{"line":567,"column":1},"end":{"line":567,"column":1}},"html":"Mtk:=μ^tk+ln(2t/δ)2Ntk,M^k_t := \\hat \\mu^k_t + \\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}},Mtk:=μ^tk+2Ntkln(2t/δ),","enumerator":"3.19","key":"n32n0Hr7ME"},{"type":"paragraph","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"children":[{"type":"text","value":"where we can choose ","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"key":"l2i8AIkFmn"},{"type":"inlineMath","value":"\\delta'","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"html":"δ\\delta'δ","key":"kNT9mu01HE"},{"type":"text","value":" depending on how tight we want the interval to be.","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"key":"NW5yz5HoEm"}],"key":"bqHUWmYzAA"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":571,"column":1},"end":{"line":573,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":571,"column":1},"end":{"line":571,"column":1}},"children":[{"type":"text","value":"A smaller ","position":{"start":{"line":571,"column":1},"end":{"line":571,"column":1}},"key":"lo2c23HDWu"},{"type":"inlineMath","value":"\\delta'","position":{"start":{"line":571,"column":1},"end":{"line":571,"column":1}},"html":"δ\\delta'δ","key":"RKX5Yk6x1f"},{"type":"text","value":" would give us a larger and higher-confidence interval, emphasizing the exploration term.","position":{"start":{"line":571,"column":1},"end":{"line":571,"column":1}},"key":"X39RG0RDwj"}],"key":"PMBVhplv7v"},{"type":"listItem","spread":true,"position":{"start":{"line":572,"column":1},"end":{"line":573,"column":1}},"children":[{"type":"text","value":"A larger ","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"key":"CQnLKUKPo8"},{"type":"inlineMath","value":"\\delta'","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"html":"δ\\delta'δ","key":"ltRkMxe7go"},{"type":"text","value":" would give a tighter and lower-confidence interval, prioritizing the current sample averages.","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"key":"PLUFz35CK5"}],"key":"cRqTGhIKfS"}],"key":"HuX4SX9ExY"},{"type":"paragraph","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"children":[{"type":"text","value":"We can now use this to define the UCB algorithm.","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"Qi3aga5Lou"}],"key":"Kx1eJcN4xh"}],"key":"uSAErPaOjN"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class UCB(Agent):\n def __init__(self, K: int, T: int, delta: float):\n super().__init__(K, T)\n self.delta = delta\n\n def choose_arm(self):\n return 
solutions.ucb_choose_arm(self)","key":"pc2aL95sa5"},{"type":"output","id":"YXSACmXGhYHjGEWAMpoxo","data":[],"key":"l9iSLNOCfz"}],"data":{},"key":"hQ4wkr2ttA"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":586,"column":1},"end":{"line":586,"column":1}},"children":[{"type":"text","value":"Intuitively, UCB prioritizes arms where:","position":{"start":{"line":586,"column":1},"end":{"line":586,"column":1}},"key":"GW6XmylzMz"}],"key":"Li1CcgnIKu"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":588,"column":1},"end":{"line":593,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":588,"column":1},"end":{"line":590,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":588,"column":1},"end":{"line":589,"column":1}},"children":[{"type":"inlineMath","value":"\\hat \\mu^k_t","position":{"start":{"line":588,"column":1},"end":{"line":588,"column":1}},"html":"μ^tk\\hat \\mu^k_tμ^tk","key":"D4iyEaj9Vz"},{"type":"text","value":" is large, i.e. the arm has a high sample average, and\nwe’d choose it for ","position":{"start":{"line":588,"column":1},"end":{"line":588,"column":1}},"key":"vdC4gw6Bf5"},{"type":"emphasis","position":{"start":{"line":588,"column":1},"end":{"line":588,"column":1}},"children":[{"type":"text","value":"exploitation","position":{"start":{"line":588,"column":1},"end":{"line":588,"column":1}},"key":"oaoxZlWY4H"}],"key":"LB3mBFZjH2"},{"type":"text","value":", and","position":{"start":{"line":588,"column":1},"end":{"line":588,"column":1}},"key":"rie1r9mbOi"}],"key":"khI84o0VAR"}],"key":"ydhiFJn0LF"},{"type":"listItem","spread":true,"position":{"start":{"line":591,"column":1},"end":{"line":593,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":591,"column":1},"end":{"line":592,"column":1}},"children":[{"type":"inlineMath","value":"\\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}}","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"html":"ln(2t/δ)2Ntk\\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}}2Ntkln(2t/δ)","key":"IIiWJkGv22"},{"type":"text","value":" is large, i.e. we’re still\nuncertain about the arm, and we’d choose it for ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"SaXEzkdv5t"},{"type":"emphasis","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"exploration","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"As9ExkJLzp"}],"key":"AwHVETAnMs"},{"type":"text","value":".","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"pq9RsFyVUt"}],"key":"X6wL69mVkR"}],"key":"gq9HWHE1qo"}],"key":"vuHJoCx6yC"},{"type":"paragraph","position":{"start":{"line":594,"column":1},"end":{"line":595,"column":1}},"children":[{"type":"text","value":"As desired, this explores in a smarter, ","position":{"start":{"line":594,"column":1},"end":{"line":594,"column":1}},"key":"pMiAxqAEor"},{"type":"emphasis","position":{"start":{"line":594,"column":1},"end":{"line":594,"column":1}},"children":[{"type":"text","value":"adaptive","position":{"start":{"line":594,"column":1},"end":{"line":594,"column":1}},"key":"utk9rOLmn0"}],"key":"QnWdKBc7fd"},{"type":"text","value":" way compared to the\nprevious algorithms. 
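The actual selection logic lives in `solutions.ucb_choose_arm`. As a rough sketch of what that rule computes — written against explicit per-arm counts and sample averages, which is an illustrative simplification rather than the Agent's real interface — one could write:

```python
import numpy as np

def ucb_rule(counts, means, t, delta_prime):
    """Pick argmax_k of M^k_t = mu_hat^k_t + sqrt(ln(2t/delta') / (2 N^k_t)).

    Assumes every arm has been pulled at least once (all counts > 0);
    `counts` and `means` are hypothetical explicit inputs rather than the
    agent's internal state.
    """
    bonus = np.sqrt(np.log(2 * t / delta_prime) / (2 * counts))
    return int(np.argmax(means + bonus))

# Arm 1 has the highest sample mean, but arm 2 is barely explored, so its
# exploration bonus makes its upper confidence bound the largest.
counts = np.array([40, 55, 5])
means = np.array([0.4, 0.6, 0.5])
print(ucb_rule(counts, means, t=100, delta_prime=0.05))  # prints 2
```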
Does it achieve lower regret?","position":{"start":{"line":594,"column":1},"end":{"line":594,"column":1}},"key":"h2wh0L78Qq"}],"key":"d9vy2DGgDI"}],"key":"CBpiJXDcuR"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"agent = UCB(mab.K, mab.T, 0.9)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"CuPt5CsIBX"},{"type":"output","id":"DdOBVHc_2tY8JrpfKGFNz","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"f3eb002ad30c5ba869f3a828d502f4d2","path":"/build/f3eb002ad30c5ba869f3a828d502f4d2.png"}}}],"key":"t4bjAwlPxY"}],"data":{},"key":"YGdVfOpIEI"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"children":[{"type":"text","value":"UCB regret analysis","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"vmCBdYlS95"}],"identifier":"ucb-regret-analysis","label":"UCB regret analysis","html_id":"ucb-regret-analysis","implicit":true,"enumerator":"3.6.1","key":"yFqglZV73I"},{"type":"paragraph","position":{"start":{"line":605,"column":1},"end":{"line":606,"column":1}},"children":[{"type":"text","value":"First we’ll bound the regret incurred at each timestep. Then we’ll bound\nthe ","position":{"start":{"line":605,"column":1},"end":{"line":605,"column":1}},"key":"XWPdgjtJxC"},{"type":"emphasis","position":{"start":{"line":605,"column":1},"end":{"line":605,"column":1}},"children":[{"type":"text","value":"total","position":{"start":{"line":605,"column":1},"end":{"line":605,"column":1}},"key":"VXRFkEGQ8T"}],"key":"RkE2F2jJYF"},{"type":"text","value":" regret across timesteps.","position":{"start":{"line":605,"column":1},"end":{"line":605,"column":1}},"key":"sABNdbrRKr"}],"key":"U0reCMCnmr"},{"type":"paragraph","position":{"start":{"line":608,"column":1},"end":{"line":611,"column":1}},"children":[{"type":"text","value":"For the sake of analysis, we’ll use a slightly looser bound that applies\nacross the whole time horizon and across all arms. We’ll omit the\nderivation since it’s very similar to the above (walk through it\nyourself for practice).","position":{"start":{"line":608,"column":1},"end":{"line":608,"column":1}},"key":"SYWPVsFwCU"}],"key":"IeWuM7yV9S"},{"type":"math","value":"\\begin{aligned}\n \\pr\\left(\\forall k \\le K, t < T. |\\hat \\mu^k_t - \\mu^k | \\le B^k_t \\right) &\\ge 1-\\delta'' \\\\\n \\text{where} \\quad B^k_t &:= \\sqrt{\\frac{\\ln(2TK/\\delta'')}{2N^k_t}}.\n\\end{aligned}","position":{"start":{"line":613,"column":1},"end":{"line":618,"column":1}},"html":"P(kK,t<T.μ^tkμkBtk)1δwhereBtk:=ln(2TK/δ)2Ntk.\\begin{aligned}\n \\pr\\left(\\forall k \\le K, t < T. 
|\\hat \\mu^k_t - \\mu^k | \\le B^k_t \\right) &\\ge 1-\\delta'' \\\\\n \\text{where} \\quad B^k_t &:= \\sqrt{\\frac{\\ln(2TK/\\delta'')}{2N^k_t}}.\n\\end{aligned}P(kK,t<T.∣μ^tkμkBtk)whereBtk1δ′′:=2Ntkln(2TK/δ′′).","enumerator":"3.20","key":"pSfImwiLlx"},{"type":"paragraph","position":{"start":{"line":620,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"Intuitively, ","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"gl1IF8azHH"},{"type":"inlineMath","value":"B^k_t","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"html":"BtkB^k_tBtk","key":"MxI7OLhX8R"},{"type":"text","value":" denotes the ","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"G26XQglBB0"},{"type":"emphasis","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"children":[{"type":"text","value":"width","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"GMyiRAnX5n"}],"key":"cAIIpR1wvZ"},{"type":"text","value":" of the CI for arm ","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"KklGX5rn5f"},{"type":"inlineMath","value":"k","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"html":"kkk","key":"jFclrhTOyt"},{"type":"text","value":" at time\n","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"WpIGmMxfLD"},{"type":"inlineMath","value":"t","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"html":"ttt","key":"fy0q1jQHWr"},{"type":"text","value":". Then, assuming the above uniform bound holds (which occurs with\nprobability ","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"zT1qHvnBdC"},{"type":"inlineMath","value":"1-\\delta''","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"html":"1δ1-\\delta''1δ′′","key":"IJHSBTvvY6"},{"type":"text","value":"), we can bound the regret at each timestep as\nfollows:","position":{"start":{"line":620,"column":1},"end":{"line":620,"column":1}},"key":"aqOgPrPDnx"}],"key":"NNUWzcNs7E"},{"type":"math","value":"\\begin{aligned}\n \\mu^\\star - \\mu^{a_t} &\\le \\hat \\mu^{k^*}_t + B_t^{k^*} - \\mu^{a_t} && \\text{applying UCB to arm } k^\\star \\\\\n &\\le \\hat \\mu^{a_t}_t + B^{a_t}_t - \\mu^{a_t} && \\text{since UCB chooses } a_t = \\arg \\max_{k \\in [K]} \\hat \\mu^k_t + B_t^{k} \\\\\n &\\le 2 B^{a_t}_t && \\text{since } \\hat \\mu^{a_t}_t - \\mu^{a_t} \\le B^{a_t}_t \\text{ by definition of } B^{a_t}_t \\\\\n\\end{aligned}","position":{"start":{"line":625,"column":1},"end":{"line":631,"column":1}},"html":"μμatμ^tk+Btkμatapplying UCB to arm kμ^tat+Btatμatsince UCB chooses at=argmaxk[K]μ^tk+Btk2Btatsince μ^tatμatBtat by definition of Btat\\begin{aligned}\n \\mu^\\star - \\mu^{a_t} &\\le \\hat \\mu^{k^*}_t + B_t^{k^*} - \\mu^{a_t} && \\text{applying UCB to arm } k^\\star \\\\\n &\\le \\hat \\mu^{a_t}_t + B^{a_t}_t - \\mu^{a_t} && \\text{since UCB chooses } a_t = \\arg \\max_{k \\in [K]} \\hat \\mu^k_t + B_t^{k} \\\\\n &\\le 2 B^{a_t}_t && \\text{since } \\hat \\mu^{a_t}_t - \\mu^{a_t} \\le B^{a_t}_t \\text{ by definition of } B^{a_t}_t \\\\\n\\end{aligned}μμatμ^tk+Btkμatμ^tat+Btatμat2Btatapplying UCB to arm ksince UCB chooses at=argk[K]maxμ^tk+Btksince μ^tatμatBtat by definition of 
Btat","enumerator":"3.21","key":"IXNb88BgdF"},{"type":"paragraph","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"children":[{"type":"text","value":"Summing this across timesteps gives","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"BZo1Oh9zgg"}],"key":"Zg3Tt8vIyN"},{"type":"math","value":"\\begin{aligned}\n \\text{Regret}_T &\\le \\sum_{t=0}^{T-1} 2 B^{a_t}_t \\\\\n &= \\sqrt{2\\ln(2TK/\\delta'')} \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} \\\\\n \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} &= \\sum_{t=0}^{T-1} \\sum_{k=1}^K \\mathbf{1}\\{ a_t = k \\} (N^k_t)^{-1/2} \\\\\n &= \\sum_{k=1}^K \\sum_{n=1}^{N_T^k} n^{-1/2} \\\\\n &\\le K \\sum_{n=1}^T n^{-1/2} \\\\\n \\sum_{n=1}^T n^{-1/2} &\\le 1 + \\int_1^T x^{-1/2} \\ \\mathrm{d}x \\\\\n &= 1 + (2 \\sqrt{x})_1^T \\\\\n &= 2 \\sqrt{T} - 1 \\\\\n &\\le 2 \\sqrt{T} \\\\\n\\end{aligned}","position":{"start":{"line":635,"column":1},"end":{"line":647,"column":1}},"html":"RegretTt=0T12Btat=2ln(2TK/δ)t=0T1(Ntat)1/2t=0T1(Ntat)1/2=t=0T1k=1K1{at=k}(Ntk)1/2=k=1Kn=1NTkn1/2Kn=1Tn1/2n=1Tn1/21+1Tx1/2 dx=1+(2x)1T=2T12T\\begin{aligned}\n \\text{Regret}_T &\\le \\sum_{t=0}^{T-1} 2 B^{a_t}_t \\\\\n &= \\sqrt{2\\ln(2TK/\\delta'')} \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} \\\\\n \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} &= \\sum_{t=0}^{T-1} \\sum_{k=1}^K \\mathbf{1}\\{ a_t = k \\} (N^k_t)^{-1/2} \\\\\n &= \\sum_{k=1}^K \\sum_{n=1}^{N_T^k} n^{-1/2} \\\\\n &\\le K \\sum_{n=1}^T n^{-1/2} \\\\\n \\sum_{n=1}^T n^{-1/2} &\\le 1 + \\int_1^T x^{-1/2} \\ \\mathrm{d}x \\\\\n &= 1 + (2 \\sqrt{x})_1^T \\\\\n &= 2 \\sqrt{T} - 1 \\\\\n &\\le 2 \\sqrt{T} \\\\\n\\end{aligned}RegretTt=0T1(Ntat)1/2n=1Tn1/2t=0T12Btat=2ln(2TK/δ′′)t=0T1(Ntat)1/2=t=0T1k=1K1{at=k}(Ntk)1/2=k=1Kn=1NTkn1/2Kn=1Tn1/21+1Tx1/2 dx=1+(2x)1T=2T12T","enumerator":"3.22","key":"VNBb4WXqGN"},{"type":"paragraph","position":{"start":{"line":649,"column":1},"end":{"line":649,"column":1}},"children":[{"type":"text","value":"Putting everything together gives","position":{"start":{"line":649,"column":1},"end":{"line":649,"column":1}},"key":"YzXEOuRKsf"}],"key":"BKmYjND5Si"},{"type":"math","value":"\\begin{aligned}\n \\text{Regret}_T &\\le 2 K \\sqrt{2T \\ln(2TK/\\delta'')} && \\text{with probability } 1-\\delta'' \\\\\n &= \\tilde O(K\\sqrt{T})\n\\end{aligned}","position":{"start":{"line":651,"column":1},"end":{"line":656,"column":1}},"html":"RegretT2K2Tln(2TK/δ)with probability 1δ=O~(KT)\\begin{aligned}\n \\text{Regret}_T &\\le 2 K \\sqrt{2T \\ln(2TK/\\delta'')} && \\text{with probability } 1-\\delta'' \\\\\n &= \\tilde O(K\\sqrt{T})\n\\end{aligned}RegretT2K2Tln(2TK/δ′′)=O~(KT)with probability 1δ′′","enumerator":"3.23","key":"VGCf7YvS8H"},{"type":"paragraph","position":{"start":{"line":658,"column":1},"end":{"line":659,"column":1}},"children":[{"type":"text","value":"In fact, we can do a more sophisticated analysis to trim off a factor of ","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"key":"pTUpwAjfZM"},{"type":"inlineMath","value":"\\sqrt{K}","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"html":"K\\sqrt{K}K","key":"d7SjGFucEj"},{"type":"text","value":"\nand show ","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"key":"pBZaMluKwQ"},{"type":"inlineMath","value":"\\text{Regret}_T = \\tilde O(\\sqrt{TK})","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"html":"RegretT=O~(TK)\\text{Regret}_T = \\tilde 
O(\\sqrt{TK})RegretT=O~(TK)","key":"sx9LObyMhZ"},{"type":"text","value":".","position":{"start":{"line":658,"column":1},"end":{"line":658,"column":1}},"key":"aA5hDFYexL"}],"key":"MtA8qvoXBb"}],"key":"gBralSs8R4"},{"type":"block","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":663,"column":1},"end":{"line":663,"column":1}},"children":[{"type":"text","value":"Lower bound on regret (intuition)","position":{"start":{"line":663,"column":1},"end":{"line":663,"column":1}},"key":"AK0udciBUd"}],"identifier":"lower-bound-on-regret-intuition","label":"Lower bound on regret (intuition)","html_id":"lower-bound-on-regret-intuition","implicit":true,"enumerator":"3.6.2","key":"x5Ai3dGYhN"},{"type":"paragraph","position":{"start":{"line":665,"column":1},"end":{"line":668,"column":1}},"children":[{"type":"text","value":"Is it possible to do better than ","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"key":"spb7ONCYW3"},{"type":"inlineMath","value":"\\Omega(\\sqrt{T})","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"html":"Ω(T)\\Omega(\\sqrt{T})Ω(T)","key":"QMVhkzdWR6"},{"type":"text","value":" in general? In fact,\nno! We can show that any algorithm must incur ","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"key":"tZHOxCxrJC"},{"type":"inlineMath","value":"\\Omega(\\sqrt{T})","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"html":"Ω(T)\\Omega(\\sqrt{T})Ω(T)","key":"o1NmMey6la"},{"type":"text","value":" regret\nin the worst case. We won’t rigorously prove this here, but the\nintuition is as follows.","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"key":"emoXWbL177"}],"key":"COvJ5pFGP1"},{"type":"paragraph","position":{"start":{"line":670,"column":1},"end":{"line":674,"column":1}},"children":[{"type":"text","value":"The Central Limit Theorem tells us that with ","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"key":"yHuOpAgbEu"},{"type":"inlineMath","value":"T","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"html":"TTT","key":"T7bJkHxo1o"},{"type":"text","value":" i.i.d. samples from\nsome distribution, we can only learn the mean of the distribution to\nwithin ","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"key":"HP6vbLUPRv"},{"type":"inlineMath","value":"\\Omega(1/\\sqrt{T})","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"html":"Ω(1/T)\\Omega(1/\\sqrt{T})Ω(1/T)","key":"AdxVSJDCwh"},{"type":"text","value":" (the standard deviation). 
Then, since we get\n","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"key":"dCFTn6t9IO"},{"type":"inlineMath","value":"T","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"html":"TTT","key":"oqbj0oK0Un"},{"type":"text","value":" samples spread out across the arms, we can only learn each arm’s\nmean to an even looser degree.","position":{"start":{"line":670,"column":1},"end":{"line":670,"column":1}},"key":"HGRae3Nv8c"}],"key":"ZpAYVJXWqA"},{"type":"paragraph","position":{"start":{"line":676,"column":1},"end":{"line":679,"column":1}},"children":[{"type":"text","value":"That is, if two arms have means that are within about ","position":{"start":{"line":676,"column":1},"end":{"line":676,"column":1}},"key":"hQHF0kxNSZ"},{"type":"inlineMath","value":"1/\\sqrt{T}","position":{"start":{"line":676,"column":1},"end":{"line":676,"column":1}},"html":"1/T1/\\sqrt{T}1/T","key":"JjpWi1nmPl"},{"type":"text","value":", we\nwon’t be able to confidently tell them apart, and will sample them about\nequally. But then we’ll incur regret","position":{"start":{"line":676,"column":1},"end":{"line":676,"column":1}},"key":"xvm6YARZvK"}],"key":"g9uwQOIxqw"},{"type":"math","value":"\\Omega((T/2) \\cdot (1/\\sqrt{T})) = \\Omega(\\sqrt{T}).","position":{"start":{"line":676,"column":1},"end":{"line":676,"column":1}},"tight":"before","html":"Ω((T/2)(1/T))=Ω(T).\\Omega((T/2) \\cdot (1/\\sqrt{T})) = \\Omega(\\sqrt{T}).Ω((T/2)(1/T))=Ω(T).","enumerator":"3.24","key":"ZHij3Mrfg4"}],"key":"nerHty4eSd"},{"type":"block","position":{"start":{"line":681,"column":1},"end":{"line":681,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":684,"column":1},"end":{"line":684,"column":1}},"children":[{"type":"text","value":"Thompson sampling and Bayesian bandits","position":{"start":{"line":684,"column":1},"end":{"line":684,"column":1}},"key":"XQBAZix6ko"}],"label":"thompson_sampling","identifier":"thompson_sampling","html_id":"thompson-sampling","enumerator":"3.7","key":"vlzolZwV3l"},{"type":"paragraph","position":{"start":{"line":686,"column":1},"end":{"line":692,"column":1}},"children":[{"type":"text","value":"So far, we’ve treated the parameters ","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"r64oLnYtxs"},{"type":"inlineMath","value":"\\mu^0, \\dots, \\mu^{K-1}","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"html":"μ0,,μK1\\mu^0, \\dots, \\mu^{K-1}μ0,,μK1","key":"DdGd9DY25n"},{"type":"text","value":" of the\nreward distributions as ","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"LpzseLqh8R"},{"type":"emphasis","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"children":[{"type":"text","value":"fixed","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"CfOgqWQUbB"}],"key":"NJFV72YdQs"},{"type":"text","value":". 
Instead, we can take a ","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"HHuDMeX9J3"},{"type":"strong","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"children":[{"type":"text","value":"Bayesian","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"lAO0kzgfAj"}],"key":"T9XHDxYNzx"},{"type":"text","value":"\napproach where we treat them as random variables from some ","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"HgatgRo6hm"},{"type":"strong","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"children":[{"type":"text","value":"prior\ndistribution","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"cTG9eXq6Vr"}],"key":"mb1BdfeSDv"},{"type":"text","value":". Then, upon pulling an arm and observing a reward, we can\nsimply ","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"WMNjfl2G47"},{"type":"emphasis","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"children":[{"type":"text","value":"condition","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"r6MTZNljed"}],"key":"MMPtTLFkn8"},{"type":"text","value":" on this observation to exactly describe the\n","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"Re1pfjpPof"},{"type":"strong","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"children":[{"type":"text","value":"posterior distribution","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"C3aWw7Bugc"}],"key":"lj4yrfCuZa"},{"type":"text","value":" over the parameters. This fully describes the\ninformation we gain about the parameters from observing the reward.","position":{"start":{"line":686,"column":1},"end":{"line":686,"column":1}},"key":"iEz22CGBfk"}],"key":"yiW8VezJrR"},{"type":"paragraph","position":{"start":{"line":694,"column":1},"end":{"line":696,"column":1}},"children":[{"type":"text","value":"From this Bayesian perspective, the ","position":{"start":{"line":694,"column":1},"end":{"line":694,"column":1}},"key":"SsBn5XmBbW"},{"type":"strong","position":{"start":{"line":694,"column":1},"end":{"line":694,"column":1}},"children":[{"type":"text","value":"Thompson sampling","position":{"start":{"line":694,"column":1},"end":{"line":694,"column":1}},"key":"Klf1IpXe8t"}],"key":"OUnGkNwKam"},{"type":"text","value":" algorithm\nfollows naturally: just sample from the distribution of the optimal arm,\ngiven the observations!","position":{"start":{"line":694,"column":1},"end":{"line":694,"column":1}},"key":"kwHfn3Btan"}],"key":"asJclNhNoi"}],"key":"PTZJEmb2Zh"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class Distribution:\n def sample(self) -> Float[Array, \" K\"]:\n \"\"\"Sample a vector of means for the K arms.\"\"\"\n ...\n\n def update(self, arm: int, reward: float):\n \"\"\"Condition on obtaining `reward` from the given arm.\"\"\"\n ...","key":"PJ2A9IGLtr"},{"type":"output","id":"hhkhanOTJXbZl6nXqPRtt","data":[],"key":"d29H4gWCkt"}],"data":{},"key":"eOVmhlItPA"},{"type":"block","children":[],"key":"OEx20osbt1"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class ThompsonSampling(Agent):\n def __init__(self, K: int, T: int, prior: Distribution):\n super().__init__(K, T)\n self.distribution = prior\n\n def 
choose_arm(self):\n means = self.distribution.sample()\n return random_argmax(means)\n\n def update_history(self, arm: int, reward: int):\n super().update_history(arm, reward)\n self.distribution.update(arm, reward)","key":"PwqZlFESTn"},{"type":"output","id":"yXJmMxc98EUf3WOYXXZ89","data":[],"key":"YY3nV7n4wU"}],"data":{},"key":"fZLo9pxdVn"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":724,"column":1},"end":{"line":729,"column":1}},"children":[{"type":"text","value":"In other words, we sample each arm proportionally to how likely we think\nit is to be optimal, given the observations so far. This strikes a good\nexploration-exploitation tradeoff: we explore more for arms that we’re\nless certain about, and exploit more for arms that we’re more certain\nabout. Thompson sampling is a simple yet powerful algorithm that\nachieves state-of-the-art performance in many settings.","position":{"start":{"line":724,"column":1},"end":{"line":724,"column":1}},"key":"J8Nhtf1DVI"}],"key":"uc1Z4x6uHa"},{"type":"proof","kind":"example","label":"bayesian_bernoulli","identifier":"bayesian_bernoulli","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bayesian Bernoulli bandit","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"q1kUPJITQD"}],"key":"fYkk4m7yPZ"},{"type":"paragraph","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"children":[{"type":"text","value":"We’ve been working in the Bernoulli bandit setting, where arm ","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"ynStS3Ub7Z"},{"type":"inlineMath","value":"k","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"html":"kkk","key":"l4tOMhEcKa"},{"type":"text","value":" yields a reward of ","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"ntLJjJwN2N"},{"type":"text","value":"1","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"DXCbHxT3I4"},{"type":"text","value":" with probability ","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"FxBsVp0YpA"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"html":"μk\\mu^kμk","key":"HBzyOgS5oY"},{"type":"text","value":" and no reward otherwise. 
The vector of success probabilities ","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"cvrtxuU8sl"},{"type":"inlineMath","value":"\\boldsymbol{\\mu} = (\\mu^1, \\dots, \\mu^K)","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"html":"μ=(μ1,,μK)\\boldsymbol{\\mu} = (\\mu^1, \\dots, \\mu^K)μ=(μ1,,μK)","key":"zOdG9WxEcy"},{"type":"text","value":" thus describes the entire MAB.","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"AgctimHk8C"}],"key":"ipvXuP47hK"},{"type":"paragraph","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"children":[{"type":"text","value":"Under the Bayesian perspective, we think of ","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"OFFGVKEike"},{"type":"inlineMath","value":"\\boldsymbol{\\mu}","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"html":"μ\\boldsymbol{\\mu}μ","key":"RL2qygGmfj"},{"type":"text","value":" as a ","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"f3jm3m38YK"},{"type":"emphasis","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"children":[{"type":"text","value":"random","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"amOKWOMkWA"}],"key":"C53VedLOCU"},{"type":"text","value":" vector drawn from some prior distribution ","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"byqG1lFikL"},{"type":"inlineMath","value":"\\pi(\\boldsymbol{\\mu})","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"html":"π(μ)\\pi(\\boldsymbol{\\mu})π(μ)","key":"OfydnGmxJr"},{"type":"text","value":". For example, we might have ","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"Wxh71fSfPQ"},{"type":"text","value":"π","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"n6cE56BRTc"},{"type":"text","value":" be the Uniform distribution over the unit hypercube ","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"YvelzghnXU"},{"type":"inlineMath","value":"[0, 1]^K","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"html":"[0,1]K[0, 1]^K[0,1]K","key":"psUjBzQubK"},{"type":"text","value":", that is,","position":{"start":{"line":736,"column":1},"end":{"line":736,"column":1}},"key":"pGwN5ukXBW"}],"key":"Bj6jxxhtIE"},{"type":"math","value":"\\pi(\\boldsymbol{\\mu}) = \\begin{cases}\n 1 & \\text{if } \\boldsymbol{\\mu}\\in [0, 1]^K \\\\\n 0 & \\text{otherwise}\n\\end{cases}","position":{"start":{"line":738,"column":1},"end":{"line":741,"column":1}},"html":"π(μ)={1if μ[0,1]K0otherwise\\pi(\\boldsymbol{\\mu}) = \\begin{cases}\n 1 & \\text{if } \\boldsymbol{\\mu}\\in [0, 1]^K \\\\\n 0 & \\text{otherwise}\n\\end{cases}π(μ)={10if μ[0,1]Kotherwise","enumerator":"3.25","key":"dqav7nB110"},{"type":"paragraph","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"children":[{"type":"text","value":"In this case, upon viewing some reward, we can exactly calculate the ","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"p7sQ6DU53u"},{"type":"strong","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"children":[{"type":"text","value":"posterior","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"oIRGxsLpj4"}],"key":"wnoxpyJMaQ"},{"type":"text","value":" 
distribution of ","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"ODlpfKWdmF"},{"type":"inlineMath","value":"\\boldsymbol{\\mu}","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"html":"μ\\boldsymbol{\\mu}μ","key":"qwqjtIJqAt"},{"type":"text","value":" using Bayes’s rule (i.e. the definition of conditional probability):","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"fbTkhxt6lJ"}],"key":"Q1FEjQpOWS"},{"type":"math","value":"\\begin{aligned}\n \\pr(\\boldsymbol{\\mu} \\mid a_0, r_0) &\\propto \\pr(r_0 \\mid a_0, \\boldsymbol{\\mu}) \\pr(a_0 \\mid \\boldsymbol{\\mu}) \\pr(\\boldsymbol{\\mu}) \\\\\n &\\propto (\\mu^{a_0})^{r_0} (1 - \\mu^{a_0})^{1-r_0}.\n\\end{aligned}","position":{"start":{"line":745,"column":1},"end":{"line":750,"column":1}},"html":"P(μa0,r0)P(r0a0,μ)P(a0μ)P(μ)(μa0)r0(1μa0)1r0.\\begin{aligned}\n \\pr(\\boldsymbol{\\mu} \\mid a_0, r_0) &\\propto \\pr(r_0 \\mid a_0, \\boldsymbol{\\mu}) \\pr(a_0 \\mid \\boldsymbol{\\mu}) \\pr(\\boldsymbol{\\mu}) \\\\\n &\\propto (\\mu^{a_0})^{r_0} (1 - \\mu^{a_0})^{1-r_0}.\n\\end{aligned}P(μa0,r0)P(r0a0,μ)P(a0μ)P(μ)(μa0)r0(1μa0)1r0.","enumerator":"3.26","key":"ySJNNenxt1"},{"type":"paragraph","position":{"start":{"line":752,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"This is the PDF of the\n","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"tWR1QccV8u"},{"type":"inlineMath","value":"\\text{Beta}(1 + r_0, 1 + (1 - r_0))","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"Beta(1+r0,1+(1r0))\\text{Beta}(1 + r_0, 1 + (1 - r_0))Beta(1+r0,1+(1r0))","key":"wfW2NFuQ28"},{"type":"text","value":" distribution, which is a conjugate\nprior for the Bernoulli distribution. That is, if we start with a Beta\nprior on ","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"bsRGwu17hx"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"μk\\mu^kμk","key":"lHc0btSlbT"},{"type":"text","value":" (note that ","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"ngNTUJY0zD"},{"type":"inlineMath","value":"\\text{Unif}([0, 1]) = \\text{Beta}(1, 1)","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"Unif([0,1])=Beta(1,1)\\text{Unif}([0, 1]) = \\text{Beta}(1, 1)Unif([0,1])=Beta(1,1)","key":"OCzbZq4z6e"},{"type":"text","value":"),\nthen the posterior, after conditioning on samples from\n","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"CltmocS2ao"},{"type":"inlineMath","value":"\\text{Bern}(\\mu^k)","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"Bern(μk)\\text{Bern}(\\mu^k)Bern(μk)","key":"EBezjEB04X"},{"type":"text","value":", will also be Beta. 
This is a very convenient\nproperty, since it means we can simply update the parameters of the Beta\ndistribution upon observing a reward, rather than having to recompute\nthe entire posterior distribution from scratch.","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"Os2JZA3KjC"}],"key":"kL4ruj7nwW"}],"enumerator":"3.3","html_id":"bayesian-bernoulli","key":"ixAo82IFvW"}],"key":"IqeqAOSZfb"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class Beta(Distribution):\n def __init__(self, K: int, alpha: int = 1, beta: int = 1):\n self.alphas = np.full(K, alpha)\n self.betas = np.full(K, beta)\n\n def sample(self):\n return np.random.beta(self.alphas, self.betas)\n\n def update(self, arm: int, reward: int):\n self.alphas[arm] += reward\n self.betas[arm] += 1 - reward","key":"tYYp1lSBre"},{"type":"output","id":"tEk8vCuG-9SbYC9-PXeco","data":[],"key":"kNbeaE8nO8"}],"data":{},"key":"JtLwgHAlRz"},{"type":"block","children":[],"key":"PPRIQ3gCWs"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"beta_distribution = Beta(mab.K)\nagent = ThompsonSampling(mab.K, mab.T, beta_distribution)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)","key":"M1YMk5G4oZ"},{"type":"output","id":"kHu145heoMcccBuHUe0FG","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"991419959ab213822fb1c34db8883adb","path":"/build/991419959ab213822fb1c34db8883adb.png"}}}],"key":"NhLcXT3bzG"}],"data":{},"key":"ZoIXDwSAZz"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":784,"column":1},"end":{"line":786,"column":1}},"children":[{"type":"text","value":"It turns out that asymptotically, Thompson sampling is optimal in the\nfollowing sense. ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"GiG0y5xwn9"},{"type":"cite","kind":"narrative","label":"lai_asymptotically_1985","identifier":"lai_asymptotically_1985","children":[{"type":"text","value":"Lai & Robbins (1985)","key":"MG0Iaebj15"}],"enumerator":"2","key":"vAJe0ZLcLE"},{"type":"text","value":" prove an\n","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"mF5eiaqVDp"},{"type":"emphasis","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"instance-dependent","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"gTJ8rKcMkd"}],"key":"sgnBf0v7hg"},{"type":"text","value":" lower bound that says for ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"PhRIsJtLHY"},{"type":"emphasis","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"K2iirW2Uc4"}],"key":"FCKfMBe5aL"},{"type":"text","value":" bandit algorithm,","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"qjpffwnr6M"}],"key":"lREymGWJni"},{"type":"math","value":"\\liminf_{T \\to \\infty} \\frac{\\E[N_T^k]}{\\ln(T)} \\ge \\frac{1}{\\text{KL}(\\mu^k \\parallel \\mu^\\star)}","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"html":"lim infTE[NTk]ln(T)1KL(μkμ)\\liminf_{T \\to \\infty} \\frac{\\E[N_T^k]}{\\ln(T)} \\ge \\frac{1}{\\text{KL}(\\mu^k \\parallel \\mu^\\star)}Tliminfln(T)E[NTk]KL(μkμ)1","enumerator":"3.27","key":"bYlj3Co1Rw"},{"type":"paragraph","position":{"start":{"line":790,"column":1},"end":{"line":790,"column":1}},"children":[{"type":"text","value":"where","position":{"start":{"line":790,"column":1},"end":{"line":790,"column":1}},"key":"nE1Am0CUk6"}],"key":"UbUafxbdoL"},{"type":"math","value":"\\text{KL}(\\mu^k \\parallel \\mu^\\star) = \\mu^k \\ln \\frac{\\mu^k}{\\mu^\\star} + (1 - \\mu^k) \\ln \\frac{1 - \\mu^k}{1 - \\mu^\\star}","position":{"start":{"line":792,"column":1},"end":{"line":792,"column":1}},"html":"KL(μkμ)=μklnμkμ+(1μk)ln1μk1μ\\text{KL}(\\mu^k \\parallel \\mu^\\star) = \\mu^k \\ln \\frac{\\mu^k}{\\mu^\\star} + (1 - \\mu^k) \\ln \\frac{1 - \\mu^k}{1 - \\mu^\\star}KL(μkμ)=μklnμμk+(1μk)ln1μ1μk","enumerator":"3.28","key":"OczWR3vFp2"},{"type":"paragraph","position":{"start":{"line":794,"column":1},"end":{"line":798,"column":1}},"children":[{"type":"text","value":"measures the ","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"rbxb8D31wW"},{"type":"strong","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"Kullback-Leibler divergence","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"Fscsqe8347"}],"key":"eHA6Wv95IA"},{"type":"text","value":" from the Bernoulli\ndistribution with mean 
","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"g5sUpZrYsc"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"html":"μk\\mu^kμk","key":"T5yJjKNLkf"},{"type":"text","value":" to the Bernoulli distribution with mean\n","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"W0gcvsalB5"},{"type":"inlineMath","value":"\\mu^\\star","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"html":"μ\\mu^\\starμ","key":"gMcp8o8YHX"},{"type":"text","value":". It turns out that Thompson sampling achieves this lower\nbound with equality! That is, not only is the error ","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"S5Kbh21ZF2"},{"type":"emphasis","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"rate","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"jxDLBt3Sji"}],"key":"X2gSJ9vzkY"},{"type":"text","value":" optimal, but\nthe ","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"XoK4uotMyH"},{"type":"emphasis","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"constant factor","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"hyHAYfXbDY"}],"key":"XBxM7IoTbJ"},{"type":"text","value":" is optimal as well.","position":{"start":{"line":794,"column":1},"end":{"line":794,"column":1}},"key":"cJ9a4D1z1s"}],"key":"LTFuGPtenb"}],"key":"wvgAvpsQfl"},{"type":"block","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"children":[{"type":"text","value":"Contextual bandits","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"M8rsg8hhzY"}],"identifier":"contextual-bandits","label":"Contextual bandits","html_id":"contextual-bandits","implicit":true,"enumerator":"3.8","key":"ujs2HWRsey"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"LEEt6sE0ME"}],"key":"aBaIFhX5X1"},{"type":"paragraph","position":{"start":{"line":805,"column":1},"end":{"line":805,"column":1}},"children":[{"type":"text","value":"This content is advanced material taught at the end of the course.","position":{"start":{"line":805,"column":1},"end":{"line":805,"column":1}},"key":"JnYJoAARmh"}],"key":"BBBDqNsC2S"}],"key":"Z2Esg0zhoQ"},{"type":"paragraph","position":{"start":{"line":808,"column":1},"end":{"line":814,"column":1}},"children":[{"type":"text","value":"In the above MAB environment, the reward distributions of the arms\nremain constant. However, in many real-world settings, we might receive\nadditional information that affects these distributions. For example, in\nthe online advertising case where each arm corresponds to an ad we could\nshow the user, we might receive information about the user’s preferences\nthat changes how likely they are to click on a given ad. 
We can model\nsuch environments using ","position":{"start":{"line":808,"column":1},"end":{"line":808,"column":1}},"key":"l0tbLlDw9Z"},{"type":"strong","position":{"start":{"line":808,"column":1},"end":{"line":808,"column":1}},"children":[{"type":"text","value":"contextual bandits","position":{"start":{"line":808,"column":1},"end":{"line":808,"column":1}},"key":"NxVr6P878q"}],"key":"CLtTWVeC4t"},{"type":"text","value":".","position":{"start":{"line":808,"column":1},"end":{"line":808,"column":1}},"key":"smOwwpLwAT"}],"key":"ouBybegxEG"},{"type":"proof","kind":"definition","label":"contextual_bandit","identifier":"contextual_bandit","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Contextual bandit","position":{"start":{"line":816,"column":1},"end":{"line":816,"column":1}},"key":"rynzJoxpHB"}],"key":"nOzhpV5KwD"},{"type":"paragraph","position":{"start":{"line":819,"column":1},"end":{"line":824,"column":1}},"children":[{"type":"text","value":"At each timestep ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"zT4h2Uv9Nl"},{"type":"inlineMath","value":"t","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"ttt","key":"SzVbwXExTQ"},{"type":"text","value":", a new ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"KD99enokPo"},{"type":"emphasis","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"children":[{"type":"text","value":"context","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"grFBzBkbRl"}],"key":"IJFqljulDJ"},{"type":"text","value":"\n","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"sT5Yl3inds"},{"type":"inlineMath","value":"x_t","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"xtx_txt","key":"jmfUFXarEA"},{"type":"text","value":" is drawn from some distribution ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"ibg5NxEa5K"},{"type":"inlineMath","value":"\\nu_{\\text{x}}","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"νx\\nu_{\\text{x}}νx","key":"C3c949Bi5H"},{"type":"text","value":". The learner gets\nto observe the context, and choose an action ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"Y5dNZdtVTZ"},{"type":"inlineMath","value":"a_t","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"ata_tat","key":"Ijp04ZVQs5"},{"type":"text","value":" according to some\ncontext-dependent policy ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"epegYurREB"},{"type":"inlineMath","value":"\\pi_t(x_t)","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"πt(xt)\\pi_t(x_t)πt(xt)","key":"vazLLNbkXA"},{"type":"text","value":". Then, the learner observes the\nreward from the chosen arm ","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"qfQYId707Q"},{"type":"inlineMath","value":"r_t \\sim \\nu^{a_t}(x_t)","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"html":"rtνat(xt)r_t \\sim \\nu^{a_t}(x_t)rtνat(xt)","key":"V21BNT9bN5"},{"type":"text","value":". 
The reward\ndistribution also depends on the context.","position":{"start":{"line":819,"column":1},"end":{"line":819,"column":1}},"key":"Ohp3eVNxqU"}],"key":"Ea1xyYeuVG"}],"enumerator":"3.2","html_id":"contextual-bandit","key":"ERrC61NgYS"}],"key":"D4d4VQVls3"},{"type":"block","position":{"start":{"line":827,"column":1},"end":{"line":827,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":829,"column":1},"end":{"line":831,"column":1}},"children":[{"type":"text","value":"Assuming our context is ","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"key":"K4DIhMjA0I"},{"type":"emphasis","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"children":[{"type":"text","value":"discrete","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"key":"EPC7NlaQWj"}],"key":"CSQgGXq1Tk"},{"type":"text","value":", we can just perform the same\nalgorithms, treating each context-arm pair as its own arm. This gives us\nan enlarged MAB of ","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"key":"HiDzcDDBqK"},{"type":"inlineMath","value":"K |\\mathcal{X}|","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"html":"KXK |\\mathcal{X}|KX","key":"wHxooBZ3Lp"},{"type":"text","value":" arms.","position":{"start":{"line":829,"column":1},"end":{"line":829,"column":1}},"key":"ijz3FpyWE6"}],"key":"n2uGVXsnuC"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"LGIVNPRV4P"}],"key":"JNUqjth39W"},{"type":"paragraph","position":{"start":{"line":834,"column":1},"end":{"line":835,"column":1}},"children":[{"type":"text","value":"Write down the UCB algorithm for this enlarged MAB. That is, write an\nexpression for ","position":{"start":{"line":834,"column":1},"end":{"line":834,"column":1}},"key":"qTethfehJ1"},{"type":"inlineMath","value":"\\pi_t(x_t) = \\arg\\max_a \\dots","position":{"start":{"line":834,"column":1},"end":{"line":834,"column":1}},"html":"πt(xt)=argmaxa\\pi_t(x_t) = \\arg\\max_a \\dotsπt(xt)=argmaxa","key":"bRvqRYxVrP"},{"type":"text","value":".","position":{"start":{"line":834,"column":1},"end":{"line":834,"column":1}},"key":"KbdIlmHznE"}],"key":"xys8ZfqzIR"}],"key":"gBaRV0OUvz"},{"type":"paragraph","position":{"start":{"line":838,"column":1},"end":{"line":844,"column":1}},"children":[{"type":"text","value":"Recall that running UCB for ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"VbZ4xY8gEI"},{"type":"inlineMath","value":"T","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"TTT","key":"X92Os3aY0h"},{"type":"text","value":" timesteps on an MAB with ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"CEWBQRw7UN"},{"type":"inlineMath","value":"K","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"KKK","key":"QZfvEgq6dt"},{"type":"text","value":" arms\nachieves a regret bound of ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"FM4q0Thfsr"},{"type":"inlineMath","value":"\\tilde{O}(\\sqrt{TK})","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"O~(TK)\\tilde{O}(\\sqrt{TK})O~(TK)","key":"RibXrIXpEP"},{"type":"text","value":". 
So in this problem,\nwe would achieve regret ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"VOs2iFdpyq"},{"type":"inlineMath","value":"\\tilde{O}(\\sqrt{TK|\\mathcal{X}|})","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"O~(TKX)\\tilde{O}(\\sqrt{TK|\\mathcal{X}|})O~(TKX)","key":"xKAYfYVRjc"},{"type":"text","value":" in the\ncontextual MAB, which has a polynomial dependence on ","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"rdVkFySilC"},{"type":"inlineMath","value":"|\\mathcal{X}|","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"html":"X|\\mathcal{X}|X","key":"Pq2w1RQrKS"},{"type":"text","value":".\nBut in a situation where we have large, or even infinitely many\ncontexts, e.g. in the case where our context is a continuous value, this\nbecomes intractable.","position":{"start":{"line":838,"column":1},"end":{"line":838,"column":1}},"key":"bbwixhEtoe"}],"key":"TQc3xHXXbm"},{"type":"paragraph","position":{"start":{"line":846,"column":1},"end":{"line":850,"column":1}},"children":[{"type":"text","value":"Note that this “enlarged MAB” treats the different contexts as entirely\nunrelated to each other, while in practice, often contexts are ","position":{"start":{"line":846,"column":1},"end":{"line":846,"column":1}},"key":"zdnJkvWPCs"},{"type":"emphasis","position":{"start":{"line":846,"column":1},"end":{"line":846,"column":1}},"children":[{"type":"text","value":"related","position":{"start":{"line":846,"column":1},"end":{"line":846,"column":1}},"key":"IzAgtegKBm"}],"key":"PwLvXSLF8r"},{"type":"text","value":"\nto each other in some way: for example, we might want to advertise\nsimilar products to users with similar preferences. How can we\nincorporate this structure into our solution?","position":{"start":{"line":846,"column":1},"end":{"line":846,"column":1}},"key":"tAS0PI53Yu"}],"key":"FcFQ30Lb7E"}],"key":"CuIVzts63U"},{"type":"block","position":{"start":{"line":852,"column":1},"end":{"line":852,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"children":[{"type":"text","value":"Linear contextual bandits","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"ZMPutFYccI"}],"label":"lin_ucb","identifier":"lin_ucb","html_id":"lin-ucb","enumerator":"3.8.1","key":"pOhjLtlRaS"},{"type":"paragraph","position":{"start":{"line":857,"column":1},"end":{"line":866,"column":1}},"children":[{"type":"text","value":"We want to model the ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"ZZtrqAEdiq"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"mean reward","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"wqbLEjAn2F"}],"key":"YBmm9aRUeN"},{"type":"text","value":" of arm ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"k67Pamv0OY"},{"type":"inlineMath","value":"k","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"kkk","key":"ew1KjSPJFl"},{"type":"text","value":" as a function of the\ncontext, i.e. 
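Before moving on, here is a rough sketch of the "enlarged MAB" reduction just described: run the usual UCB rule with statistics indexed by (context, arm) pairs. The dictionary-based bookkeeping below is purely illustrative.

```python
import numpy as np

def enlarged_mab_ucb(counts, means, x, K, t, delta_prime):
    """UCB over the enlarged MAB that treats each (context, arm) pair as its own arm.

    `counts[(x, a)]` and `means[(x, a)]` are hypothetical dictionaries of pull
    counts and sample averages per (context, arm) pair; pairs we have never
    seen get an infinite bonus so they are tried first.
    """
    ucb_values = []
    for a in range(K):
        n = counts.get((x, a), 0)
        if n == 0:
            ucb_values.append(np.inf)
        else:
            ucb_values.append(
                means[(x, a)] + np.sqrt(np.log(2 * t / delta_prime) / (2 * n))
            )
    return int(np.argmax(ucb_values))

# Two contexts, two arms: the statistics gathered in context 0 have no effect
# on the choice made in context 1.
counts = {(0, 0): 10, (0, 1): 12, (1, 0): 3}
means = {(0, 0): 0.7, (0, 1): 0.2, (1, 0): 0.5}
print(enlarged_mab_ucb(counts, means, x=1, K=2, t=30, delta_prime=0.05))  # prints 1
```

Nothing learned in one context transfers to any other here, which is exactly the structure the linear model below is meant to exploit.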
","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"Hk8ib1qhUc"},{"type":"inlineMath","value":"\\mu^k(x)","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"μk(x)\\mu^k(x)μk(x)","key":"RtwFbEItYu"},{"type":"text","value":". One simple model is the ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"C4TocQZcX8"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"gS0C4ArjEx"}],"key":"Vh9gIYLlDe"},{"type":"text","value":" one:\n","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"m67uFXum0D"},{"type":"inlineMath","value":"\\mu^k(x) = x^\\top \\theta^k","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"μk(x)=xθk\\mu^k(x) = x^\\top \\theta^kμk(x)=xθk","key":"w7qAlton1K"},{"type":"text","value":", where ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"UQHgBylp7N"},{"type":"inlineMath","value":"x \\in \\mathcal{X} = \\mathbb{R}^d","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"xX=Rdx \\in \\mathcal{X} = \\mathbb{R}^dxX=Rd","key":"GFMFZkYI9I"},{"type":"text","value":" and\n","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"kUTo4qJ312"},{"type":"inlineMath","value":"\\theta^k \\in \\mathbb{R}^d","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"θkRd\\theta^k \\in \\mathbb{R}^dθkRd","key":"gLHy26FQPp"},{"type":"text","value":" describes a ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"rxw50Xs5yY"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"feature direction","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"OkkfDgjXp4"}],"key":"JsaGIKlt4A"},{"type":"text","value":" for arm ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"mzS3EJFovE"},{"type":"inlineMath","value":"k","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"kkk","key":"gphlX8zvRN"},{"type":"text","value":". 
Recall\nthat ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"v09BaUwoBw"},{"type":"strong","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"UBovCIOfw6"}],"key":"M65Y5fN9hP"},{"type":"text","value":" gives us a way to estimate a conditional\nexpectation from samples: We learn a ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"GOW0BlWDRd"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"least squares","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"q1B4ZCPCHK"}],"key":"DzAWflO9Nn"},{"type":"text","value":" estimator from the\ntimesteps where arm ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"reocXhN8zF"},{"type":"inlineMath","value":"k","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"kkk","key":"VnlZu7gHCK"},{"type":"text","value":" was selected:","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"QUKyDPaT54"}],"key":"Zm7dwTt9A7"},{"type":"math","value":"\\hat \\theta_t^k = \\arg\\min_{\\theta \\in \\mathbb{R}^d} \\sum_{\\{ i \\in [t] : a_i = k \\}} (r_i - x_i^\\top \\theta)^2.","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"tight":true,"html":"θ^tk=argminθRd{i[t]:ai=k}(rixiθ)2.\\hat \\theta_t^k = \\arg\\min_{\\theta \\in \\mathbb{R}^d} \\sum_{\\{ i \\in [t] : a_i = k \\}} (r_i - x_i^\\top \\theta)^2.θ^tk=argθRdmin{i[t]:ai=k}(rixiθ)2.","enumerator":"3.29","key":"UWExc4c4wn"},{"type":"paragraph","position":{"start":{"line":857,"column":1},"end":{"line":866,"column":1}},"children":[{"type":"text","value":"This has the closed-form solution known as the ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"nEur58OlpS"},{"type":"emphasis","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"children":[{"type":"text","value":"ordinary least squares","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"Y16LIsKF87"}],"key":"v6ERfOGmU8"},{"type":"text","value":"\n(OLS) estimator:","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"CTYAeOHc9l"}],"key":"XjdtNHH23a"},{"type":"math","value":"\\begin{aligned}\n \\hat \\theta_t^k & = (A_t^k)^{-1} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i r_i \\\\\n \\text{where} \\quad A_t^k & = \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\top.\n\\end{aligned}","label":"ols_bandit","identifier":"ols_bandit","html":"θ^tk=(Atk)1{i[t]:ai=k}xiriwhereAtk={i[t]:ai=k}xixi.\\begin{aligned}\n \\hat \\theta_t^k & = (A_t^k)^{-1} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i r_i \\\\\n \\text{where} \\quad A_t^k & = \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\top.\n\\end{aligned}θ^tkwhereAtk=(Atk)1{i[t]:ai=k}xiri={i[t]:ai=k}xixi.","enumerator":"3.30","html_id":"ols-bandit","key":"c8r1n2x2LP"},{"type":"paragraph","position":{"start":{"line":877,"column":1},"end":{"line":884,"column":1}},"children":[{"type":"text","value":"We can now apply the UCB algorithm in this environment in order to\nbalance 
","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"H0OitEydDD"},{"type":"emphasis","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"exploration","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"zNRYeydI1S"}],"key":"uwnENOhhJ9"},{"type":"text","value":" of new arms and ","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"EsVarmgKEo"},{"type":"emphasis","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"exploitation","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"hj0uvwnNyo"}],"key":"rKHGK6NrJ1"},{"type":"text","value":" of arms that we\nbelieve to have high reward. But how should we construct the upper\nconfidence bound? Previously, we treated the pulls of an arm as i.i.d.\nsamples and used Hoeffding’s inequality to bound the distance of the\nsample mean, our estimator, from the true mean. However, now our\nestimator is not a sample mean, but rather the OLS estimator above ","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"MEOaoFKBjN"},{"type":"crossReference","kind":"equation","identifier":"ols_bandit","label":"ols_bandit","children":[{"type":"text","value":"(","key":"oWHbUen5wS"},{"type":"text","value":"3.30","key":"E7uplVh5Ho"},{"type":"text","value":")","key":"HuEbDza34p"}],"template":"(%s)","enumerator":"3.30","resolved":true,"html_id":"ols-bandit","key":"KPRoy9xpzM"},{"type":"text","value":". Instead, we’ll use ","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"s7eyxhB5r4"},{"type":"strong","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"Chebyshev’s\ninequality","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"vIf27o0die"}],"key":"dKWqDbtFqL"},{"type":"text","value":" to construct an upper confidence bound.","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"oPAKqVL7IY"}],"key":"nN3xfNi43P"},{"type":"proof","kind":"theorem","label":"chebyshev","identifier":"chebyshev","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Chebyshev’s inequality","position":{"start":{"line":886,"column":1},"end":{"line":886,"column":1}},"key":"di1GfbzOeU"}],"key":"PVbubqN6Qe"},{"type":"paragraph","position":{"start":{"line":889,"column":1},"end":{"line":891,"column":1}},"children":[{"type":"text","value":"For a random variable ","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"vNRjSgM2HZ"},{"type":"inlineMath","value":"Y","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"html":"YYY","key":"IRt1b6Kd3j"},{"type":"text","value":" such that\n","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"juxscJzpEz"},{"type":"inlineMath","value":"\\E Y = 0","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"html":"EY=0\\E Y = 0EY=0","key":"Xl89QeY3FD"},{"type":"text","value":" and ","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"Hpc6K7FjQ0"},{"type":"inlineMath","value":"\\E Y^2 = \\sigma^2","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"html":"EY2=σ2\\E Y^2 = 
\\sigma^2EY2=σ2","key":"RFdxhtzpHw"},{"type":"text","value":",","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"key":"yXPl2z767x"}],"key":"k0hRFJtA7T"},{"type":"math","value":"|Y| \\le \\beta \\sigma \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}","position":{"start":{"line":889,"column":1},"end":{"line":889,"column":1}},"tight":"before","html":"Yβσwith probability11β2|Y| \\le \\beta \\sigma \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}Yβσwith probability1β21","enumerator":"3.31","key":"Qj1i2iOd0E"}],"enumerator":"3.3","html_id":"chebyshev","key":"hrRUHhfC5L"},{"type":"paragraph","position":{"start":{"line":894,"column":1},"end":{"line":896,"column":1}},"children":[{"type":"text","value":"Since the OLS estimator is known to be unbiased (try proving this\nyourself), we can apply Chebyshev’s inequality to\n","position":{"start":{"line":894,"column":1},"end":{"line":894,"column":1}},"key":"crHBLUKDEF"},{"type":"inlineMath","value":"x_t^\\top (\\hat \\theta_t^k - \\theta^k)","position":{"start":{"line":894,"column":1},"end":{"line":894,"column":1}},"html":"xt(θ^tkθk)x_t^\\top (\\hat \\theta_t^k - \\theta^k)xt(θ^tkθk)","key":"Lac2ASqmzZ"},{"type":"text","value":":","position":{"start":{"line":894,"column":1},"end":{"line":894,"column":1}},"key":"gTxXi2Gd6l"}],"key":"RSlB0pgstX"},{"type":"math","value":"\\begin{aligned}\n x_t^\\top \\theta^k \\le x_t^\\top \\hat \\theta_t^k + \\beta \\sqrt{x_t^\\top (A_t^k)^{-1} x_t} \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}\n\\end{aligned}","position":{"start":{"line":898,"column":1},"end":{"line":900,"column":1}},"html":"xtθkxtθ^tk+βxt(Atk)1xtwith probability11β2\\begin{aligned}\n x_t^\\top \\theta^k \\le x_t^\\top \\hat \\theta_t^k + \\beta \\sqrt{x_t^\\top (A_t^k)^{-1} x_t} \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}\n\\end{aligned}xtθkxtθ^tk+βxt(Atk)1xtwith probability1β21","enumerator":"3.32","key":"H5kG8aOkDP"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"pWcE24KTX7"}],"key":"JTQo9gXPWO"},{"type":"paragraph","position":{"start":{"line":903,"column":1},"end":{"line":905,"column":1}},"children":[{"type":"text","value":"We haven’t explained why ","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"UB0qNdywg4"},{"type":"inlineMath","value":"x_t^\\top (A_t^k)^{-1} x_t","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"html":"xt(Atk)1xtx_t^\\top (A_t^k)^{-1} x_txt(Atk)1xt","key":"cf9N8zh5M5"},{"type":"text","value":" is the correct\nexpression for the variance of ","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"dJwjuwBTKo"},{"type":"inlineMath","value":"x_t^\\top \\hat \\theta_t^k","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"html":"xtθ^tkx_t^\\top \\hat \\theta_t^kxtθ^tk","key":"YiKgq9MSr7"},{"type":"text","value":". 
This result\nfollows from some algebra on the definition of the OLS estimator ","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"AAGxBC4bgy"},{"type":"crossReference","kind":"equation","identifier":"ols_bandit","label":"ols_bandit","children":[{"type":"text","value":"(","key":"qBEvnETaXm"},{"type":"text","value":"3.30","key":"JkvJk7k6X5"},{"type":"text","value":")","key":"dskJNkgLWy"}],"template":"(%s)","enumerator":"3.30","resolved":true,"html_id":"ols-bandit","key":"lTxEmj4LfW"},{"type":"text","value":".","position":{"start":{"line":903,"column":1},"end":{"line":903,"column":1}},"key":"Sp9Rs6aBl3"}],"key":"Vkxxdocick"}],"key":"QNRwvQ50Al"},{"type":"paragraph","position":{"start":{"line":908,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"The first term is exactly our predicted reward ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"CwIzNmlgT3"},{"type":"inlineMath","value":"\\hat \\mu^k_t(x_t)","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"μ^tk(xt)\\hat \\mu^k_t(x_t)μ^tk(xt)","key":"FImZ3J5fwA"},{"type":"text","value":". To\ninterpret the second term, note that","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"yMhI4Od2A0"}],"key":"fEhP96So9b"},{"type":"math","value":"x_t^\\top (A_t^k)^{-1} x_t = \\frac{1}{N_t^k} x_t^\\top (\\Sigma_t^k)^{-1} x_t,","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"tight":true,"html":"xt(Atk)1xt=1Ntkxt(Σtk)1xt,x_t^\\top (A_t^k)^{-1} x_t = \\frac{1}{N_t^k} x_t^\\top (\\Sigma_t^k)^{-1} x_t,xt(Atk)1xt=Ntk1xt(Σtk)1xt,","enumerator":"3.33","key":"Mw4hF6aPhJ"},{"type":"paragraph","position":{"start":{"line":908,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"where","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"Mc0WDSZqcu"}],"key":"bMahNl5Mes"},{"type":"math","value":"\\Sigma_t^k = \\frac{1}{N_t^k} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\top","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"tight":true,"html":"Σtk=1Ntk{i[t]:ai=k}xixi\\Sigma_t^k = \\frac{1}{N_t^k} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\topΣtk=Ntk1{i[t]:ai=k}xixi","enumerator":"3.34","key":"pX2Oly8IsJ"},{"type":"paragraph","position":{"start":{"line":908,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"is the empirical covariance matrix of the contexts (assuming that the\ncontext has mean zero). 
That is, the learner is encouraged to choose\narms when ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"VqJkLnXLn4"},{"type":"inlineMath","value":"x_t","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"xtx_txt","key":"CEeUJGR35m"},{"type":"text","value":" is ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"gzhdphRLcj"},{"type":"emphasis","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"children":[{"type":"text","value":"not aligned","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"HZcdM3uovg"}],"key":"TBNUqoW86c"},{"type":"text","value":" with the data seen so far, or if arm\n","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"oblyqFhB94"},{"type":"inlineMath","value":"k","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"kkk","key":"zgnGUwazLl"},{"type":"text","value":" has not been explored much and so ","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"Bxp5lesDzs"},{"type":"inlineMath","value":"N_t^k","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"html":"NtkN_t^kNtk","key":"YtIvtIaJ9C"},{"type":"text","value":" is small.","position":{"start":{"line":908,"column":1},"end":{"line":908,"column":1}},"key":"Cykud3T2LI"}],"key":"yBnCzldiax"},{"type":"paragraph","position":{"start":{"line":918,"column":1},"end":{"line":919,"column":1}},"children":[{"type":"text","value":"We can now substitute these quantities into UCB to get the ","position":{"start":{"line":918,"column":1},"end":{"line":918,"column":1}},"key":"p5BMURMJ11"},{"type":"strong","position":{"start":{"line":918,"column":1},"end":{"line":918,"column":1}},"children":[{"type":"text","value":"LinUCB","position":{"start":{"line":918,"column":1},"end":{"line":918,"column":1}},"key":"dCbs0dfKK7"}],"key":"xjiedLOSlX"},{"type":"text","value":"\nalgorithm:","position":{"start":{"line":918,"column":1},"end":{"line":918,"column":1}},"key":"tlpWphluK4"}],"key":"AoOQUV49xB"}],"key":"WaQdBngiej"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class LinUCBPseudocode(Agent):\n def __init__(\n self, K: int, T: int, D: int, lam: float, get_c: Callable[[int], float]\n ):\n super().__init__(K, T)\n self.lam = lam\n self.get_c = get_c\n self.contexts = [None for _ in range(K)]\n self.A = np.repeat(lam * np.eye(D)[...], K)\n self.targets = np.zeros(K, D)\n self.w = np.zeros(K, D)\n\n def choose_arm(self, context: Float[Array, \" D\"]):\n c = self.get_c(self.count)\n scores = self.w @ context + c * np.sqrt(\n context.T @ np.linalg.solve(self.A, context)\n )\n return random_argmax(scores)\n\n def update_history(self, context: Float[Array, \" D\"], arm: int, reward: int):\n self.A[arm] += np.outer(context, context)\n self.targets[arm] += context * reward\n self.w[arm] = np.linalg.solve(self.A[arm], self.targets[arm])","key":"YXndCQ0BWi"},{"type":"output","id":"7_RUD-i0TDhCBh7067nKH","data":[],"key":"UiypAKgDb5"}],"data":{},"key":"ll7DV7cYD4"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"vu3tpGRxBX"}],"key":"SpQBtFJWoK"},{"type":"paragraph","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"Note that the matrix 
","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"dnWnyvDPYh"},{"type":"inlineMath","value":"A_t^k","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"html":"AtkA_t^kAtk","key":"Vdxf3FPQfD"},{"type":"text","value":" above might not be invertible. When does this occur? One way to address this is to include a ","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"uE8t91gBKL"},{"type":"inlineMath","value":"\\lambda I","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"html":"λI\\lambda IλI","key":"foZxSpWWDc"},{"type":"text","value":" regularization term to ensure that ","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"isZOQ4xYdQ"},{"type":"inlineMath","value":"A_t^k","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"html":"AtkA_t^kAtk","key":"J9lWlvwSGz"},{"type":"text","value":" is invertible. This is equivalent to solving a ","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"h9vNR2Fb7l"},{"type":"emphasis","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"ridge regression","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"OvGRhfiW18"}],"key":"Np1qFHcFAH"},{"type":"text","value":" problem instead of the unregularized least squares problem. Implement this solution. TODO SOLUTION CURRENTLY SHOWN","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"NYsuy0fEax"}],"key":"GFWadhyXcN"}],"key":"yRjP0D7Ixv"}],"key":"MtOHVvkrFm"},{"type":"block","position":{"start":{"line":951,"column":1},"end":{"line":951,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":953,"column":1},"end":{"line":956,"column":1}},"children":[{"type":"inlineMath","value":"c_t","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"html":"ctc_tct","key":"l8kBlXrCWy"},{"type":"text","value":" is similar to the ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"Wk2p65SakO"},{"type":"inlineMath","value":"\\log (2t/\\delta')","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"html":"log(2t/δ)\\log (2t/\\delta')log(2t/δ)","key":"mJmqezDs4z"},{"type":"text","value":" term of UCB: It controls the\nwidth of the confidence interval. 
Here, we treat it as a tunable\nparameter, though in a theoretical analysis, it would depend on ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"nG6EI8kfA6"},{"type":"inlineMath","value":"A_t^k","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"html":"AtkA_t^kAtk","key":"Uan1q6HuI8"},{"type":"text","value":"\nand the probability ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"yDCvLb4CE5"},{"type":"text","value":"δ","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"Q8CtNqi5mM"},{"type":"text","value":" with which the bound holds.","position":{"start":{"line":953,"column":1},"end":{"line":953,"column":1}},"key":"VODEd5T1Ag"}],"key":"cTolzNFOD6"},{"type":"paragraph","position":{"start":{"line":958,"column":1},"end":{"line":959,"column":1}},"children":[{"type":"text","value":"Using similar tools for UCB, we can also prove an ","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"B1JhXopuA8"},{"type":"inlineMath","value":"\\tilde{O}(\\sqrt{T})","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"html":"O~(T)\\tilde{O}(\\sqrt{T})O~(T)","key":"PpK5UNOKqi"},{"type":"text","value":"\nregret bound. The full details of the analysis can be found in Section 3 of ","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"IkRiCFx7Ei"},{"type":"cite","kind":"narrative","label":"agarwal_reinforcement_2022","identifier":"agarwal_reinforcement_2022","children":[{"type":"text","value":"Agarwal ","key":"q5KBKPbryK"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"zdeoXkna9H"}],"key":"DoAePbZJgC"},{"type":"text","value":" (2022)","key":"KIAsHOKRa3"}],"enumerator":"3","key":"hzJ3RSWsSR"},{"type":"text","value":".","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"rw82wwy8Ty"}],"key":"phtF3iKItI"},{"type":"heading","depth":2,"position":{"start":{"line":961,"column":1},"end":{"line":961,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":961,"column":1},"end":{"line":961,"column":1}},"key":"MMEBXh6Fv2"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"3.9","key":"z75ya7xZ3y"},{"type":"paragraph","position":{"start":{"line":963,"column":1},"end":{"line":964,"column":1}},"children":[{"type":"text","value":"In this chapter,\nwe explored the ","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"key":"M1sF0uMqoU"},{"type":"strong","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"children":[{"type":"text","value":"multi-armed bandit","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"key":"Q60pQnXV9u"}],"key":"JCIT1DDazI"},{"type":"text","value":" setting for analyzing sequential decision-making in an unknown environment.","position":{"start":{"line":963,"column":1},"end":{"line":963,"column":1}},"key":"fZ8tzKbN2u"}],"key":"Nn0WAai7cJ"}],"key":"cIKUU6MYif"}],"key":"mzKmZwvXcp"},"references":{"cite":{"order":["vershynin_high-dimensional_2018","lai_asymptotically_1985","agarwal_reinforcement_2022"],"data":{"vershynin_high-dimensional_2018":{"label":"vershynin_high-dimensional_2018","enumerator":"1","html":"Vershynin, R. (2018). High-Dimensional Probability: An Introduction with Applications in Data Science. 
Cambridge University Press."},"lai_asymptotically_1985":{"label":"lai_asymptotically_1985","enumerator":"2","doi":"10.1016/0196-8858(85)90002-8","html":"Lai, T. L., & Robbins, H. (1985). Asymptotically Efficient Adaptive Allocation Rules. Advances in Applied Mathematics, 6(1), 4–22. 10.1016/0196-8858(85)90002-8","url":"https://doi.org/10.1016/0196-8858(85)90002-8"},"agarwal_reinforcement_2022":{"label":"agarwal_reinforcement_2022","enumerator":"3","html":"Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). Reinforcement Learning: Theory and Algorithms."}}}},"footer":{"navigation":{"prev":{"title":"2 Linear Quadratic Regulators","url":"/control","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"4 Supervised learning","url":"/supervised-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file
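To make the LinUCB construction above concrete, here is a minimal NumPy sketch (an editorial illustration, not part of the course notes) of the per-arm least-squares estimate (3.30) and the optimistic score from the Chebyshev-style bound (3.32). The function names, the small `lam` ridge term added for numerical stability, and the fixed `beta` are assumptions made for this sketch.

```python
import numpy as np


def arm_estimate(contexts: np.ndarray, rewards: np.ndarray, lam: float = 1e-6):
    """Least-squares estimate for one arm from the rounds where it was pulled.

    contexts: (n, d) array of contexts x_i; rewards: (n,) array of rewards r_i.
    Returns (theta_hat, A) with A = lam * I + sum_i x_i x_i^T.
    """
    d = contexts.shape[1]
    A = lam * np.eye(d) + contexts.T @ contexts  # regularized A_t^k
    b = contexts.T @ rewards                     # sum_i x_i r_i
    theta_hat = np.linalg.solve(A, b)            # (ridge-stabilized) OLS solution
    return theta_hat, A


def ucb_score(x: np.ndarray, theta_hat: np.ndarray, A: np.ndarray, beta: float) -> float:
    """Optimistic estimate x^T theta_hat + beta * sqrt(x^T A^{-1} x)."""
    width = np.sqrt(x @ np.linalg.solve(A, x))
    return float(x @ theta_hat + beta * width)
```

At each round, one would compute `ucb_score` for every arm at the current context and pull the arm with the largest score.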
255;--tw-prose-invert-code:#fff;--tw-prose-invert-pre-code:#d1d5db;--tw-prose-invert-pre-bg:rgba(0,0,0,.5);--tw-prose-invert-th-borders:#4b5563;--tw-prose-invert-td-borders:#374151;font-size:1rem;line-height:1.75}.prose :where(picture>img):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:0;margin-bottom:0}.prose :where(li):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:.25rem;margin-bottom:.25rem}.prose :where(ol>li):not(:where([class~=not-prose],[class~=not-prose] *)){padding-inline-start:.375em}.prose :where(ul>li):not(:where([class~=not-prose],[class~=not-prose] *)){padding-inline-start:.375em}.prose :where(.prose>ul>li p):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:.75em;margin-bottom:.75em}.prose :where(.prose>ul>li>p:first-child):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:1.25em}.prose :where(.prose>ul>li>p:last-child):not(:where([class~=not-prose],[class~=not-prose] *)){margin-bottom:1.25em}.prose :where(.prose>ol>li>p:first-child):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:1.25em}.prose :where(.prose>ol>li>p:last-child):not(:where([class~=not-prose],[class~=not-prose] *)){margin-bottom:1.25em}.prose :where(ul ul,ul ol,ol ul,ol ol):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:.75em;margin-bottom:.75em}.prose :where(dl):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:1.25em;margin-bottom:1.25em}.prose :where(dd):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:.5em;padding-inline-start:1.625em}.prose :where(hr+*):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:0}.prose :where(h2+*):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:0}.prose :where(h3+*):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:0}.prose :where(h4+*):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:0}.prose :where(thead th:first-child):not(:where([class~=not-prose],[class~=not-prose] *)){padding-inline-start:0}.prose :where(thead th:last-child):not(:where([class~=not-prose],[class~=not-prose] *)){padding-inline-end:0}.prose :where(tbody td,tfoot td):not(:where([class~=not-prose],[class~=not-prose] *)){padding-top:.5714286em;padding-inline-end:.5714286em;padding-bottom:.5714286em;padding-inline-start:.5714286em}.prose :where(tbody td:first-child,tfoot td:first-child):not(:where([class~=not-prose],[class~=not-prose] *)){padding-inline-start:0}.prose :where(tbody td:last-child,tfoot td:last-child):not(:where([class~=not-prose],[class~=not-prose] *)){padding-inline-end:0}.prose :where(figure):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:2em;margin-bottom:2em}.prose :where(.prose>:first-child):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:0}.prose :where(.prose>:last-child):not(:where([class~=not-prose],[class~=not-prose] *)){margin-bottom:0}.prose :where(blockquote p:first-of-type):not(:where([class~=not-prose],[class~=not-prose] *)):after{content:none}.prose :where(li>p,dd>p,header>p,footer>p):not(:where([class~=not-prose],[class~=not-prose] *)){margin-top:.25rem;margin-bottom:.25rem}.prose :where(h5,h6):not(:where([class~=not-prose],[class~=not-prose] 
*)){color:var(--tw-prose-headings);font-weight:500}.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0,0,0,0);white-space:nowrap;border-width:0}.pointer-events-none{pointer-events:none}.pointer-events-auto{pointer-events:auto}.visible{visibility:visible}.invisible{visibility:hidden}.collapse{visibility:collapse}.static{position:static}.fixed{position:fixed}.absolute{position:absolute}.relative{position:relative}.sticky{position:sticky}.inset-0{inset:0}.inset-y-0{top:0;bottom:0}.-left-2{left:-.5rem}.-right-1{right:-.25rem}.-right-\[28px\]{right:-28px}.-top-3{top:-.75rem}.bottom-0{bottom:0}.bottom-2{bottom:.5rem}.left-0{left:0}.left-1{left:.25rem}.left-4{left:1rem}.right-0{right:0}.right-1{right:.25rem}.start-0{inset-inline-start:0}.top-0{top:0}.top-1{top:.25rem}.top-\[32px\]{top:32px}.top-\[60px\]{top:60px}.top-\[80px\]{top:80px}.-z-10{z-index:-10}.z-0{z-index:0}.z-10{z-index:10}.z-20{z-index:20}.z-30{z-index:30}.z-\[1000\]{z-index:1000}.z-\[1001\]{z-index:1001}.z-\[11\]{z-index:11}.z-\[2\]{z-index:2}.col-body{grid-column:body}.col-body-inset{grid-column:body-inset}.col-body-left{grid-column:body/gutter-right-start}.col-body-outset{grid-column:body-outset}.col-body-outset-left{grid-column:body-outset/body}.col-body-outset-right{grid-column:body/body-outset}.col-body-right{grid-column:gutter-left-end/body}.col-gutter-left{grid-column:gutter-left}.col-gutter-outset-left{grid-column:body-outset/gutter-left}.col-gutter-outset-right{grid-column:gutter-right/body-outset}.col-gutter-right{grid-column:gutter-right}.col-page{grid-column:page}.col-page-inset{grid-column:page-inset}.col-page-inset-left{grid-column:page-inset/body}.col-page-inset-right{grid-column:body/page-inset}.col-page-left{grid-column:page/body}.col-page-right{grid-column:body/page}.col-screen{grid-column:screen}.col-screen-inset{grid-column:screen-inset}.col-screen-inset-left{grid-column:screen-inset/body}.col-screen-inset-right{grid-column:body/screen-inset}.col-screen-left{grid-column:screen/body}.col-screen-right{grid-column:body/screen}.col-span-1{grid-column:span 1/span 1}.col-span-2{grid-column:span 2/span 2}.col-span-3{grid-column:span 3/span 3}.col-span-4{grid-column:span 4/span 4}.col-span-5{grid-column:span 5/span 5}.col-span-6{grid-column:span 6/span 6}.row-span-1{grid-row:span 1/span 1}.row-span-2{grid-row:span 2/span 2}.row-span-3{grid-row:span 3/span 3}.row-span-4{grid-row:span 4/span 4}.row-span-5{grid-row:span 5/span 5}.row-span-6{grid-row:span 6/span 
6}.float-right{float:right}.m-0{margin:0}.m-1{margin:.25rem}.mx-1{margin-left:.25rem;margin-right:.25rem}.mx-2{margin-left:.5rem;margin-right:.5rem}.mx-3{margin-left:.75rem;margin-right:.75rem}.mx-auto{margin-left:auto;margin-right:auto}.my-1{margin-top:.25rem;margin-bottom:.25rem}.my-10{margin-top:2.5rem;margin-bottom:2.5rem}.my-2{margin-top:.5rem;margin-bottom:.5rem}.my-3{margin-top:.75rem;margin-bottom:.75rem}.my-4{margin-top:1rem;margin-bottom:1rem}.my-5{margin-top:1.25rem;margin-bottom:1.25rem}.my-8,.my-\[2rem\]{margin-top:2rem;margin-bottom:2rem}.-mr-1{margin-right:-.25rem}.mb-0{margin-bottom:0}.mb-1{margin-bottom:.25rem}.mb-10{margin-bottom:2.5rem}.mb-2{margin-bottom:.5rem}.mb-2\.5{margin-bottom:.625rem}.mb-3{margin-bottom:.75rem}.mb-4{margin-bottom:1rem}.mb-5{margin-bottom:1.25rem}.mb-8{margin-bottom:2rem}.mb-\[1rem\]{margin-bottom:1rem}.ml-1{margin-left:.25rem}.ml-2{margin-left:.5rem}.ml-3{margin-left:.75rem}.ml-4{margin-left:1rem}.mr-1{margin-right:.25rem}.mr-2{margin-right:.5rem}.mr-3{margin-right:.75rem}.mt-0{margin-top:0}.mt-0\.5{margin-top:.125rem}.mt-1{margin-top:.25rem}.mt-2{margin-top:.5rem}.mt-3{margin-top:.75rem}.mt-4{margin-top:1rem}.mt-5{margin-top:1.25rem}.mt-9{margin-top:2.25rem}.\!block{display:block!important}.block{display:block}.inline-block{display:inline-block}.inline{display:inline}.flex{display:flex}.inline-flex{display:inline-flex}.table{display:table}.grid{display:grid}.contents{display:contents}.\!hidden{display:none!important}.hidden{display:none}.aspect-square{aspect-ratio:1/1}.h-0{height:0}.h-10{height:2.5rem}.h-11{height:2.75rem}.h-4{height:1rem}.h-5{height:1.25rem}.h-6{height:1.5rem}.h-8{height:2rem}.h-9{height:2.25rem}.h-\[0px\]{height:0}.h-\[10px\]{height:10px}.h-\[150px\]{height:150px}.h-\[22px\]{height:22px}.h-\[2px\]{height:2px}.h-\[60px\]{height:60px}.h-full{height:100%}.h-screen{height:100vh}.max-h-\[15rem\]{max-height:15rem}.max-h-\[300px\]{max-height:300px}.max-h-\[4rem\]{max-height:4rem}.max-h-\[5rem\]{max-height:5rem}.min-h-1{min-height:.25rem}.min-h-\[2em\]{min-height:2em}.w-10{width:2.5rem}.w-4{width:1rem}.w-48{width:12rem}.w-5{width:1.25rem}.w-6{width:1.5rem}.w-8{width:2rem}.w-\[0px\]{width:0}.w-\[10px\]{width:10px}.w-\[22px\]{width:22px}.w-\[300px\]{width:300px}.w-\[400px\]{width:400px}.w-\[500px\]{width:500px}.w-auto{width:auto}.w-fit{width:-moz-fit-content;width:fit-content}.w-full{width:100%}.w-max{width:-moz-max-content;width:max-content}.w-screen{width:100vw}.min-w-0{min-width:0}.min-w-\[400px\]{min-width:400px}.max-w-\[1440px\]{max-width:1440px}.max-w-\[200px\]{max-width:200px}.max-w-\[350px\]{max-width:350px}.max-w-\[80vw\]{max-width:80vw}.max-w-\[90\%\]{max-width:90%}.max-w-full{max-width:100%}.flex-1{flex:1 1 0%}.flex-none{flex:none}.shrink-0{flex-shrink:0}.flex-grow,.grow{flex-grow:1}.grow-0{flex-grow:0}.border-collapse{border-collapse:collapse}.origin-top-left{transform-origin:top left}.origin-top-right{transform-origin:top right}.-translate-y-\[0\.1em\]{--tw-translate-y:-.1em;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.-translate-y-\[1px\],.-translate-y-px{--tw-translate-y:-1px}.-translate-y-\[1px\],.-translate-y-px,.translate-y-2{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) 
scaleY(var(--tw-scale-y))}.translate-y-2{--tw-translate-y:.5rem}.translate-y-6{--tw-translate-y:1.5rem}.scale-100,.translate-y-6{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.scale-100{--tw-scale-x:1;--tw-scale-y:1}.scale-95{--tw-scale-x:.95;--tw-scale-y:.95}.scale-95,.scale-x-100{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.scale-x-100{--tw-scale-x:1}.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}@keyframes load{0%{width:0}to{width:50%}}.animate-load{animation:load 2.5s ease-out}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}@keyframes spin{to{transform:rotate(1turn)}}.animate-spin{animation:spin 1s linear infinite}.cursor-help{cursor:help}.cursor-not-allowed{cursor:not-allowed}.cursor-pointer{cursor:pointer}.select-none{-webkit-user-select:none;-moz-user-select:none;user-select:none}.resize-none{resize:none}.resize{resize:both}.list-none{list-style-type:none}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.grid-cols-10{grid-template-columns:repeat(10,minmax(0,1fr))}.grid-cols-11{grid-template-columns:repeat(11,minmax(0,1fr))}.grid-cols-12{grid-template-columns:repeat(12,minmax(0,1fr))}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}.grid-cols-5{grid-template-columns:repeat(5,minmax(0,1fr))}.grid-cols-6{grid-template-columns:repeat(6,minmax(0,1fr))}.grid-cols-7{grid-template-columns:repeat(7,minmax(0,1fr))}.grid-cols-8{grid-template-columns:repeat(8,minmax(0,1fr))}.grid-cols-9{grid-template-columns:repeat(9,minmax(0,1fr))}.grid-rows-\[3rem_1fr\]{grid-template-rows:3rem 1fr}.flex-row{flex-direction:row}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.flex-nowrap{flex-wrap:nowrap}.items-center{align-items:center}.items-stretch{align-items:stretch}.justify-start{justify-content:flex-start}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-0{gap:0}.gap-1{gap:.25rem}.gap-2{gap:.5rem}.gap-2\.5{gap:.625rem}.gap-4{gap:1rem}.gap-x-1{-moz-column-gap:.25rem;column-gap:.25rem}.gap-y-1{row-gap:.25rem}.gap-y-2{row-gap:.5rem}.space-x-1>:not([hidden])~:not([hidden]){--tw-space-x-reverse:0;margin-right:calc(.25rem*var(--tw-space-x-reverse));margin-left:calc(.25rem*(1 - var(--tw-space-x-reverse)))}.space-x-4>:not([hidden])~:not([hidden]){--tw-space-x-reverse:0;margin-right:calc(1rem*var(--tw-space-x-reverse));margin-left:calc(1rem*(1 - var(--tw-space-x-reverse)))}.divide-y>:not([hidden])~:not([hidden]){--tw-divide-y-reverse:0;border-top-width:calc(1px*(1 - var(--tw-divide-y-reverse)));border-bottom-width:calc(1px*var(--tw-divide-y-reverse))}.divide-gray-100>:not([hidden])~:not([hidden]){--tw-divide-opacity:1;border-color:rgb(243 244 
246/var(--tw-divide-opacity))}.self-start{align-self:flex-start}.self-center{align-self:center}.overflow-auto{overflow:auto}.overflow-hidden{overflow:hidden}.overflow-x-auto{overflow-x:auto}.overflow-y-auto{overflow-y:auto}.overflow-y-hidden{overflow-y:hidden}.overflow-y-visible{overflow-y:visible}.overflow-y-scroll{overflow-y:scroll}.text-ellipsis{text-overflow:ellipsis}.whitespace-pre-wrap{white-space:pre-wrap}.break-words{overflow-wrap:break-word}.rounded{border-radius:.25rem}.rounded-full{border-radius:9999px}.rounded-lg{border-radius:.5rem}.rounded-md{border-radius:.375rem}.rounded-sm{border-radius:.125rem}.border{border-width:1px}.border-y{border-top-width:1px}.border-b,.border-y{border-bottom-width:1px}.border-b-2{border-bottom-width:2px}.border-l{border-left-width:1px}.border-l-2{border-left-width:2px}.border-l-4{border-left-width:4px}.border-r{border-right-width:1px}.border-t{border-top-width:1px}.border-solid{border-style:solid}.border-dotted{border-style:dotted}.border-amber-500\/70{border-color:#f59e0bb3}.border-amber-600{--tw-border-opacity:1;border-color:rgb(217 119 6/var(--tw-border-opacity))}.border-blue-500{--tw-border-opacity:1;border-color:rgb(59 130 246/var(--tw-border-opacity))}.border-blue-500\/60{border-color:#3b82f699}.border-gray-100{--tw-border-opacity:1;border-color:rgb(243 244 246/var(--tw-border-opacity))}.border-gray-200{--tw-border-opacity:1;border-color:rgb(229 231 235/var(--tw-border-opacity))}.border-gray-300{--tw-border-opacity:1;border-color:rgb(209 213 219/var(--tw-border-opacity))}.border-gray-500\/60{border-color:#6b728099}.border-gray-800{--tw-border-opacity:1;border-color:rgb(31 41 55/var(--tw-border-opacity))}.border-green-500\/60{border-color:#22c55e99}.border-green-600{--tw-border-opacity:1;border-color:rgb(22 163 74/var(--tw-border-opacity))}.border-orange-500\/60{border-color:#f9731699}.border-purple-500\/60{border-color:#a855f799}.border-red-400{--tw-border-opacity:1;border-color:rgb(248 113 113/var(--tw-border-opacity))}.border-red-500{--tw-border-opacity:1;border-color:rgb(239 68 68/var(--tw-border-opacity))}.border-red-500\/60{border-color:#ef444499}.border-red-600{--tw-border-opacity:1;border-color:rgb(220 38 38/var(--tw-border-opacity))}.border-slate-400{--tw-border-opacity:1;border-color:rgb(148 163 184/var(--tw-border-opacity))}.border-slate-600{--tw-border-opacity:1;border-color:rgb(71 85 105/var(--tw-border-opacity))}.border-stone-200{--tw-border-opacity:1;border-color:rgb(231 229 228/var(--tw-border-opacity))}.border-stone-300{--tw-border-opacity:1;border-color:rgb(214 211 209/var(--tw-border-opacity))}.border-stone-400{--tw-border-opacity:1;border-color:rgb(168 162 158/var(--tw-border-opacity))}.border-stone-700{--tw-border-opacity:1;border-color:rgb(68 64 60/var(--tw-border-opacity))}.border-b-blue-600{--tw-border-opacity:1;border-bottom-color:rgb(37 99 235/var(--tw-border-opacity))}.border-b-gray-100{--tw-border-opacity:1;border-bottom-color:rgb(243 244 246/var(--tw-border-opacity))}.border-l-blue-400{--tw-border-opacity:1;border-left-color:rgb(96 165 250/var(--tw-border-opacity))}.border-l-blue-500{--tw-border-opacity:1;border-left-color:rgb(59 130 246/var(--tw-border-opacity))}.border-l-gray-300{--tw-border-opacity:1;border-left-color:rgb(209 213 219/var(--tw-border-opacity))}.border-l-gray-50{--tw-border-opacity:1;border-left-color:rgb(249 250 251/var(--tw-border-opacity))}.bg-\[\#656c85cc\]{background-color:#656c85cc}.bg-amber-50{--tw-bg-opacity:1;background-color:rgb(255 251 
235/var(--tw-bg-opacity))}.bg-amber-50\/80{background-color:#fffbebcc}.bg-black{--tw-bg-opacity:1;background-color:rgb(0 0 0/var(--tw-bg-opacity))}.bg-black\/80{background-color:#000c}.bg-blue-300\/30{background-color:#93c5fd4d}.bg-blue-50{--tw-bg-opacity:1;background-color:rgb(239 246 255/var(--tw-bg-opacity))}.bg-blue-50\/80{background-color:#eff6ffcc}.bg-blue-500{--tw-bg-opacity:1;background-color:rgb(59 130 246/var(--tw-bg-opacity))}.bg-blue-800{--tw-bg-opacity:1;background-color:rgb(30 64 175/var(--tw-bg-opacity))}.bg-blue-900{--tw-bg-opacity:1;background-color:rgb(30 58 138/var(--tw-bg-opacity))}.bg-gray-100{--tw-bg-opacity:1;background-color:rgb(243 244 246/var(--tw-bg-opacity))}.bg-gray-100\/80{background-color:#f3f4f6cc}.bg-gray-50{--tw-bg-opacity:1;background-color:rgb(249 250 251/var(--tw-bg-opacity))}.bg-gray-50\/10{background-color:#f9fafb1a}.bg-gray-50\/80{background-color:#f9fafbcc}.bg-green-50{--tw-bg-opacity:1;background-color:rgb(240 253 244/var(--tw-bg-opacity))}.bg-green-50\/80{background-color:#f0fdf4cc}.bg-inherit{background-color:inherit}.bg-orange-50\/80{background-color:#fff7edcc}.bg-orange-500{--tw-bg-opacity:1;background-color:rgb(249 115 22/var(--tw-bg-opacity))}.bg-orange-700{--tw-bg-opacity:1;background-color:rgb(194 65 12/var(--tw-bg-opacity))}.bg-purple-50\/80{background-color:#faf5ffcc}.bg-red-50{--tw-bg-opacity:1;background-color:rgb(254 242 242/var(--tw-bg-opacity))}.bg-red-50\/80{background-color:#fef1f1cc}.bg-red-500{--tw-bg-opacity:1;background-color:rgb(239 68 68/var(--tw-bg-opacity))}.bg-red-800{--tw-bg-opacity:1;background-color:rgb(153 27 27/var(--tw-bg-opacity))}.bg-slate-100{--tw-bg-opacity:1;background-color:rgb(241 245 249/var(--tw-bg-opacity))}.bg-slate-200{--tw-bg-opacity:1;background-color:rgb(226 232 240/var(--tw-bg-opacity))}.bg-slate-300\/30{background-color:#cbd5e14d}.bg-slate-50{--tw-bg-opacity:1;background-color:rgb(248 250 252/var(--tw-bg-opacity))}.bg-slate-50\/50{background-color:#f8fafc80}.bg-slate-500{--tw-bg-opacity:1;background-color:rgb(100 116 139/var(--tw-bg-opacity))}.bg-slate-700{--tw-bg-opacity:1;background-color:rgb(51 65 85/var(--tw-bg-opacity))}.bg-slate-800{--tw-bg-opacity:1;background-color:rgb(30 41 59/var(--tw-bg-opacity))}.bg-slate-900{--tw-bg-opacity:1;background-color:rgb(15 23 42/var(--tw-bg-opacity))}.bg-stone-200\/10{background-color:#e7e5e41a}.bg-stone-700{--tw-bg-opacity:1;background-color:rgb(68 64 60/var(--tw-bg-opacity))}.bg-stone-900{--tw-bg-opacity:1;background-color:rgb(28 25 23/var(--tw-bg-opacity))}.bg-transparent{background-color:transparent}.bg-white{--tw-bg-opacity:1;background-color:rgb(255 255 
255/var(--tw-bg-opacity))}.bg-white\/80{background-color:#fffc}.bg-cover{background-size:cover}.bg-top{background-position:top}.bg-no-repeat{background-repeat:no-repeat}.fill-blue-900{fill:#1e3a8a}.fill-green-600{fill:#16a34a}.fill-white{fill:#fff}.object-cover{-o-object-fit:cover;object-fit:cover}.object-left{-o-object-position:left;object-position:left}.object-top{-o-object-position:top;object-position:top}.p-0\.5{padding:.125rem}.p-1{padding:.25rem}.p-2{padding:.5rem}.p-2\.5{padding:.625rem}.p-3{padding:.75rem}.p-4{padding:1rem}.p-5{padding:1.25rem}.p-6{padding:1.5rem}.px-1{padding-left:.25rem;padding-right:.25rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-4{padding-left:1rem;padding-right:1rem}.px-6{padding-left:1.5rem;padding-right:1.5rem}.py-0\.5{padding-top:.125rem;padding-bottom:.125rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.py-3{padding-top:.75rem;padding-bottom:.75rem}.py-6{padding-top:1.5rem;padding-bottom:1.5rem}.pb-2{padding-bottom:.5rem}.pb-6{padding-bottom:1.5rem}.pb-\[1rem\]{padding-bottom:1rem}.pl-0{padding-left:0}.pl-10{padding-left:2.5rem}.pl-12{padding-left:3rem}.pl-2{padding-left:.5rem}.pl-3{padding-left:.75rem}.pl-4{padding-left:1rem}.pl-8{padding-left:2rem}.pr-2{padding-right:.5rem}.pr-\[2px\]{padding-right:2px}.ps-10{padding-inline-start:2.5rem}.pt-10{padding-top:2.5rem}.pt-3{padding-top:.75rem}.pt-4{padding-top:1rem}.pt-6{padding-top:1.5rem}.pt-9{padding-top:2.25rem}.pt-\[40px\]{padding-top:40px}.pt-\[80px\]{padding-top:80px}.text-left{text-align:left}.text-center{text-align:center}.text-right{text-align:right}.align-top{vertical-align:top}.align-middle{vertical-align:middle}.font-mono{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace}.text-\[15px\]{font-size:15px}.text-base{font-size:1rem;line-height:1.5rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xl{font-size:1.25rem;line-height:1.75rem}.text-xs{font-size:.75rem;line-height:1rem}.font-bold{font-weight:700}.font-extralight{font-weight:200}.font-light{font-weight:300}.font-medium{font-weight:500}.font-normal{font-weight:400}.font-semibold{font-weight:600}.font-thin{font-weight:100}.uppercase{text-transform:uppercase}.capitalize{text-transform:capitalize}.italic{font-style:italic}.leading-3{line-height:.75rem}.leading-6{line-height:1.5rem}.leading-\[0\]{line-height:0}.leading-\[19px\]{line-height:19px}.leading-none{line-height:1}.tracking-tight{letter-spacing:-.025em}.text-amber-600{--tw-text-opacity:1;color:rgb(217 119 6/var(--tw-text-opacity))}.text-black{--tw-text-opacity:1;color:rgb(0 0 0/var(--tw-text-opacity))}.text-blue-200{--tw-text-opacity:1;color:rgb(191 219 254/var(--tw-text-opacity))}.text-blue-400{--tw-text-opacity:1;color:rgb(96 165 250/var(--tw-text-opacity))}.text-blue-500{--tw-text-opacity:1;color:rgb(59 130 246/var(--tw-text-opacity))}.text-blue-600{--tw-text-opacity:1;color:rgb(37 99 235/var(--tw-text-opacity))}.text-blue-800{--tw-text-opacity:1;color:rgb(30 64 175/var(--tw-text-opacity))}.text-gray-100{--tw-text-opacity:1;color:rgb(243 244 246/var(--tw-text-opacity))}.text-gray-200{--tw-text-opacity:1;color:rgb(229 231 235/var(--tw-text-opacity))}.text-gray-400{--tw-text-opacity:1;color:rgb(156 163 175/var(--tw-text-opacity))}.text-gray-500{--tw-text-opacity:1;color:rgb(107 114 128/var(--tw-text-opacity))}.text-gray-600{--tw-text-opacity:1;color:rgb(75 85 
99/var(--tw-text-opacity))}.text-gray-700{--tw-text-opacity:1;color:rgb(55 65 81/var(--tw-text-opacity))}.text-gray-900{--tw-text-opacity:1;color:rgb(17 24 39/var(--tw-text-opacity))}.text-green-500{--tw-text-opacity:1;color:rgb(34 197 94/var(--tw-text-opacity))}.text-green-600{--tw-text-opacity:1;color:rgb(22 163 74/var(--tw-text-opacity))}.text-green-700{--tw-text-opacity:1;color:rgb(21 128 61/var(--tw-text-opacity))}.text-inherit{color:inherit}.text-neutral-700{--tw-text-opacity:1;color:rgb(64 64 64/var(--tw-text-opacity))}.text-neutral-900{--tw-text-opacity:1;color:rgb(23 23 23/var(--tw-text-opacity))}.text-orange-600{--tw-text-opacity:1;color:rgb(234 88 12/var(--tw-text-opacity))}.text-purple-600{--tw-text-opacity:1;color:rgb(147 51 234/var(--tw-text-opacity))}.text-purple-700{--tw-text-opacity:1;color:rgb(126 34 206/var(--tw-text-opacity))}.text-red-500{--tw-text-opacity:1;color:rgb(239 68 68/var(--tw-text-opacity))}.text-red-600{--tw-text-opacity:1;color:rgb(220 38 38/var(--tw-text-opacity))}.text-slate-300{--tw-text-opacity:1;color:rgb(203 213 225/var(--tw-text-opacity))}.text-slate-400{--tw-text-opacity:1;color:rgb(148 163 184/var(--tw-text-opacity))}.text-slate-50{--tw-text-opacity:1;color:rgb(248 250 252/var(--tw-text-opacity))}.text-slate-500{--tw-text-opacity:1;color:rgb(100 116 139/var(--tw-text-opacity))}.text-slate-600{--tw-text-opacity:1;color:rgb(71 85 105/var(--tw-text-opacity))}.text-slate-700{--tw-text-opacity:1;color:rgb(51 65 85/var(--tw-text-opacity))}.text-slate-900{--tw-text-opacity:1;color:rgb(15 23 42/var(--tw-text-opacity))}.text-stone-100{--tw-text-opacity:1;color:rgb(245 245 244/var(--tw-text-opacity))}.text-stone-200{--tw-text-opacity:1;color:rgb(231 229 228/var(--tw-text-opacity))}.text-stone-500{--tw-text-opacity:1;color:rgb(120 113 108/var(--tw-text-opacity))}.text-stone-700{--tw-text-opacity:1;color:rgb(68 64 60/var(--tw-text-opacity))}.text-stone-800{--tw-text-opacity:1;color:rgb(41 37 36/var(--tw-text-opacity))}.text-stone-900{--tw-text-opacity:1;color:rgb(28 25 23/var(--tw-text-opacity))}.text-violet-200{--tw-text-opacity:1;color:rgb(221 214 254/var(--tw-text-opacity))}.text-white{--tw-text-opacity:1;color:rgb(255 255 255/var(--tw-text-opacity))}.text-yellow-600{--tw-text-opacity:1;color:rgb(202 138 4/var(--tw-text-opacity))}.text-zinc-600{--tw-text-opacity:1;color:rgb(82 82 91/var(--tw-text-opacity))}.underline{text-decoration-line:underline}.no-underline{text-decoration-line:none}.placeholder-gray-400::-moz-placeholder{--tw-placeholder-opacity:1;color:rgb(156 163 175/var(--tw-placeholder-opacity))}.placeholder-gray-400::placeholder{--tw-placeholder-opacity:1;color:rgb(156 163 175/var(--tw-placeholder-opacity))}.opacity-0{opacity:0}.opacity-10{opacity:.1}.opacity-100{opacity:1}.opacity-50{opacity:.5}.opacity-60{opacity:.6}.opacity-70{opacity:.7}.opacity-80{opacity:.8}.opacity-90{opacity:.9}.shadow{--tw-shadow:0 1px 3px 0 rgba(0,0,0,.1),0 1px 2px -1px rgba(0,0,0,.1);--tw-shadow-colored:0 1px 3px 0 var(--tw-shadow-color),0 1px 2px -1px var(--tw-shadow-color)}.shadow,.shadow-2xl{box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.shadow-2xl{--tw-shadow:0 25px 50px -12px rgba(0,0,0,.25);--tw-shadow-colored:0 25px 50px -12px var(--tw-shadow-color)}.shadow-\[0px_2px_0px_0px_rgba\(0\,0\,0\,0\.08\)\]{--tw-shadow:0px 2px 0px 0px rgba(0,0,0,.08);--tw-shadow-colored:0px 2px 0px 0px var(--tw-shadow-color)}.shadow-\[0px_2px_0px_0px_rgba\(0\,0\,0\,0\.08\)\],.shadow-inner{box-shadow:var(--tw-ring-offset-shadow,0 0 
#0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.shadow-inner{--tw-shadow:inset 0 2px 4px 0 rgba(0,0,0,.05);--tw-shadow-colored:inset 0 2px 4px 0 var(--tw-shadow-color)}.shadow-lg{--tw-shadow:0 10px 15px -3px rgba(0,0,0,.1),0 4px 6px -4px rgba(0,0,0,.1);--tw-shadow-colored:0 10px 15px -3px var(--tw-shadow-color),0 4px 6px -4px var(--tw-shadow-color)}.shadow-lg,.shadow-md{box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.shadow-md{--tw-shadow:0 4px 6px -1px rgba(0,0,0,.1),0 2px 4px -2px rgba(0,0,0,.1);--tw-shadow-colored:0 4px 6px -1px var(--tw-shadow-color),0 2px 4px -2px var(--tw-shadow-color)}.shadow-sm{--tw-shadow:0 1px 2px 0 rgba(0,0,0,.05);--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.outline-none{outline:2px solid transparent;outline-offset:2px}.\!outline{outline-style:solid!important}.outline{outline-style:solid}.outline-2{outline-width:2px}.outline-blue-200{outline-color:#bfdbfe}.ring-1{--tw-ring-offset-shadow:var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow:var(--tw-ring-inset) 0 0 0 calc(1px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow,0 0 #0000)}.ring-black{--tw-ring-opacity:1;--tw-ring-color:rgb(0 0 0/var(--tw-ring-opacity))}.ring-blue-500{--tw-ring-opacity:1;--tw-ring-color:rgb(59 130 246/var(--tw-ring-opacity))}.ring-opacity-5{--tw-ring-opacity:.05}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.backdrop-blur{--tw-backdrop-blur:blur(8px);-webkit-backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia);backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) 
var(--tw-backdrop-sepia)}.transition{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-all{transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-colors{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-opacity{transition-property:opacity;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-transform{transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.duration-100{transition-duration:.1s}.duration-200{transition-duration:.2s}.duration-300{transition-duration:.3s}.duration-500{transition-duration:.5s}.duration-700{transition-duration:.7s}.duration-75{transition-duration:75ms}.ease-in{transition-timing-function:cubic-bezier(.4,0,1,1)}.ease-in-out{transition-timing-function:cubic-bezier(.4,0,.2,1)}.ease-out{transition-timing-function:cubic-bezier(0,0,.2,1)}.smallcaps{font-variant:small-caps}details>summary{list-style:none;transition:margin .15s ease-out}details>summary::-webkit-details-marker,details>summary::marker{display:none}details[open]>summary .details-toggle{transform:rotate(90deg) translate(-5px) translateY(-5px)}details[open]>summary{margin-bottom:10px}details .details-body{overflow:auto}details[open] .details-body{margin-top:-10px}cite{font-style:normal}.cite-group.parenthetical>:after{content:";\a0"}.cite-group.narrative>:after,.xref-group>:after{content:",\a0"}.cite-group>:last-of-type:after,.xref-group>:last-of-type:after{content:""}.cite-group.parenthetical:before{content:var(--cite-group-open,"(")}.cite-group.parenthetical:after{content:var(--cite-group-close,")")}.xref-group.parenthetical:before{content:var(--xref-group-open,"(")}.xref-group.parenthetical:after{content:var(--xref-group-close,")")}figure.fig-quote figcaption{text-align:right}figure.fig-quote figcaption>p:before{content:"\2014";padding-right:.5em}figure.fig-quote.pull-quote>blockquote{font-size:1.4rem}figure.fig-code>div{margin:0}figure.subcontainer{margin-bottom:0}figure.subcontainer figcaption{margin-top:.25em;text-align:center}figure figcaption>p,figure img{margin-top:0!important;margin-bottom:0!important}.text-spacer:after{content:"\a0\2219\a0"}.text-comma:after{content:",\a0\a0"}pre>code>span[data-line-number]{display:block;position:relative;line-height:1.4rem;padding-right:1rem}pre>code>span[data-highlight=true]:after{content:" ";position:absolute;right:-.8rem;top:0;width:calc(100% + 1.6rem);opacity:.1;pointer-events:none;background:#5ca5ee}pre>code>span>.linenumber{color:gray}pre>code>span[data-highlight=true]>.linenumber{color:#5ca5ee}.dark .hljs{background:#1e1e1e!important;color:#dcdcdc}.dark .hljs-keyword,.dark .hljs-literal,.dark .hljs-name,.dark .hljs-symbol{color:#569cd6}.dark .hljs-link{color:#569cd6;text-decoration:underline}.dark .hljs-built_in,.dark .hljs-type{color:#4ec9b0}.dark .hljs-class,.dark .hljs-number{color:#b8d7a3}.dark 
.hljs-meta .hljs-string,.dark .hljs-string{color:#d69d85}.dark .hljs-regexp,.dark .hljs-template-tag{color:#9a5334}.dark .hljs-formula,.dark .hljs-function,.dark .hljs-params,.dark .hljs-subst,.dark .hljs-title{color:#dcdcdc}.dark .hljs-comment,.dark .hljs-quote{color:#57a64a;font-style:italic}.dark .hljs-doctag{color:#608b4e}.dark .hljs-meta,.dark .hljs-meta .hljs-keyword,.dark .hljs-tag{color:#9b9b9b}.dark .hljs-template-variable,.dark .hljs-variable{color:#bd63c5}.dark .hljs-attr,.dark .hljs-attribute{color:#9cdcfe}.dark .hljs-section{color:gold}.dark .hljs-emphasis{font-style:italic}.dark .hljs-strong{font-weight:700}.dark .hljs-bullet,.dark .hljs-selector-attr,.dark .hljs-selector-class,.dark .hljs-selector-id,.dark .hljs-selector-pseudo,.dark .hljs-selector-tag{color:#d7ba7d}.dark .hljs-addition{background-color:#144212;display:inline-block;width:100%}.dark .hljs-deletion{background-color:#600;display:inline-block;width:100%}.dark .hljs-code{color:unset}.xml .hljs-meta{color:silver;background:transparent}.hljs-comment,.hljs-quote{color:#007400}.hljs-attribute,.hljs-keyword,.hljs-literal,.hljs-name,.hljs-selector-tag,.hljs-tag{color:#aa0d91}.hljs-template-variable,.hljs-variable{color:#3f6e74}.hljs-code,.hljs-meta .hljs-string,.hljs-string{color:#c41a16}.hljs-link,.hljs-regexp{color:#0e0eff}.hljs-bullet,.hljs-number,.hljs-symbol,.hljs-title{color:#1c00cf}.hljs-meta,.hljs-section{color:#643820}.hljs-built_in,.hljs-class .hljs-title,.hljs-params,.hljs-title.class_,.hljs-type{color:#5c2699}.hljs-attr{color:#836c28}.hljs-subst{color:#000}.hljs-formula{background-color:#eee;font-style:italic}.hljs-addition{background-color:#baeeba}.hljs-deletion{background-color:#ffc8bd}.hljs-selector-class,.hljs-selector-id{color:#9b703f}.hljs-doctag,.hljs-strong{font-weight:700}.hljs-emphasis{font-style:italic}.katex-display{margin:0!important}.katex .eqn-num{opacity:0;-webkit-user-select:none;-moz-user-select:none;user-select:none;pointer-events:none}.font-system{font-family:Menlo,Consolas,DejaVu Sans Mono,monospace}.jupyter-error{background-color:#fdd}.jp-OutputPrompt{display:none}table.dataframe{border:none;border-collapse:collapse;border-spacing:0;color:#000;font-size:1em;table-layout:fixed;margin:0!important}.dataframe thead{border-bottom:1px solid #000;vertical-align:bottom}.dataframe td,.dataframe th,.dataframe tr{text-align:right;vertical-align:middle;padding:.5em;line-height:normal;white-space:normal;max-width:none;border:none}.dataframe th{font-weight:700}.dataframe tbody tr:nth-child(odd){background:#f5f5f5}.dataframe tbody 
tr:hover{background:rgba(66,165,245,.2)}html.dark{--jp-ui-font-color0:#fff;--jp-ui-font-color1:hsla(0,0%,100%,.87);--jp-ui-font-color2:hsla(0,0%,100%,.54);--jp-ui-font-color3:hsla(0,0%,100%,.38);--jp-ui-inverse-font-color0:#000;--jp-ui-inverse-font-color1:rgba(0,0,0,.8);--jp-ui-inverse-font-color2:rgba(0,0,0,.5);--jp-ui-inverse-font-color3:rgba(0,0,0,.3);--jp-content-font-color0:#fff;--jp-content-font-color1:#fff;--jp-content-font-color2:hsla(0,0%,100%,.7);--jp-content-font-color3:hsla(0,0%,100%,.5);--jp-layout-color0:#111;--jp-layout-color1:var(--md-grey-900);--jp-layout-color2:var(--md-grey-800);--jp-layout-color3:var(--md-grey-700);--jp-layout-color4:var(--md-grey-600)}.sphinx-desc-addname,.sphinx-desc-inline,.sphinx-desc-name,.sphinx-desc-optional,.sphinx-desc-parameterlist,.sphinx-desc-returns,.sphinx-desc-sig-element,.sphinx-desc-sig-keyword,.sphinx-desc-sig-keyword-type,.sphinx-desc-sig-literal-char,.sphinx-desc-sig-literal-number,.sphinx-desc-sig-literal-string,.sphinx-desc-sig-name,.sphinx-desc-sig-operator,.sphinx-desc-sig-punctuation,.sphinx-desc-sig-space,.sphinx-desc-signature-line,.sphinx-desc-type,.sphinx-desc-type-parameter{white-space:pre}.sphinx-desc-name{font-size:1.1em;font-weight:700}.sphinx-desc-signature{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;font-weight:300}.sphinx-desc-returns:before{--tw-content:" \2192 ";content:var(--tw-content)}dl>dt:has([class^=sphinx-desc-]){font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;font-weight:300}dl>dt:has([class^=sphinx-desc-])>em{white-space:pre}dl:has(>dt>[class^=sphinx-desc-])>dd>dl:not(:has(>dt>[class^=sphinx-desc-])){display:grid;grid-template-columns:fit-content(30%) auto}dl:has(>dt>[class^=sphinx-desc-])>dd>dl:not(:has(>dt>[class^=sphinx-desc-]))>dd>p{margin:unset!important}dl:has(>dt>[class^=sphinx-desc-])>dd>dl:not(:has(>dt>[class^=sphinx-desc-]))>:is(dt,dd){margin:unset!important}.myst-grid>*{margin:0!important}.hover-card-content{animation-duration:.6s;animation-timing-function:cubic-bezier(.16,1,.3,1);z-index:10}.hover-card-content[data-side=top]{animation-name:slideUp}.hover-card-content[data-side=bottom]{animation-name:slideDown}@keyframes slideUp{0%{opacity:0;transform:translateY(10px)}to{opacity:1;transform:translateY(0)}}@keyframes slideDown{0%{opacity:0;transform:translateY(-10px)}to{opacity:1;transform:translateY(0)}}.hover-document{border-radius:.25rem;border-width:1px;--tw-border-opacity:1;border-color:rgb(249 250 251/var(--tw-border-opacity));--tw-bg-opacity:1;background-color:rgb(255 255 255/var(--tw-bg-opacity));font-size:.875rem;line-height:1.25rem;--tw-shadow:0 20px 25px -5px rgba(0,0,0,.1),0 8px 10px -6px rgba(0,0,0,.1);--tw-shadow-colored:0 20px 25px -5px var(--tw-shadow-color),0 8px 10px -6px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.hover-document:is(.dark *){--tw-bg-opacity:1;background-color:rgb(30 41 59/var(--tw-bg-opacity))}.hover-link{font-weight:400;--tw-text-opacity:1;color:rgb(29 78 216/var(--tw-text-opacity));text-decoration-color:#e2e8f0;text-underline-offset:2px}.hover-link:hover{--tw-text-opacity:1;color:rgb(59 130 246/var(--tw-text-opacity))}.hover-link:is(.dark *){--tw-text-opacity:1;color:rgb(219 234 
254/var(--tw-text-opacity));text-decoration-color:#475569}p[data-line-number].line:before{content:attr(data-line-number);position:absolute;left:0;font-family:monospace;width:1.25em;text-align:right;-webkit-user-select:none;-moz-user-select:none;user-select:none;color:gray;overflow:hidden}p.line{position:relative;margin:0}.collapsible-content{overflow:hidden}.collapsible-content[data-state=open]{animation:open-content .3s ease-out}.collapsible-content[data-state=closed]{animation:close-content .3s ease-out}@keyframes open-content{0%{height:0}to{height:var(--radix-collapsible-content-height)}}@keyframes close-content{0%{height:var(--radix-collapsible-content-height)}to{height:0}}input[type=search]::-webkit-search-cancel-button,input[type=search]::-webkit-search-decoration,input[type=search]::-webkit-search-results-button,input[type=search]::-webkit-search-results-decoration{display:none}@media (min-width:1024px){.lg\:col-margin-right{grid-column:body}@media (min-width:1024px){.lg\:col-margin-right{grid-column:body-end/page-end}}}@media (min-width:1280px){.xl\:article-grid{display:grid;grid-template-columns:[screen-start screen-inset-start] .5rem [page-start page-inset-start body-outset-start body-start gutter-left-start body-inset-start middle-start] 1fr 1fr [gutter-left-end] 1fr 1fr [gutter-right-start] 1fr 1fr [middle-end body-inset-end body-end gutter-right-end body-outset-end page-inset-end page-end] .5rem [screen-inset-end screen-end];align-content:flex-start}@media (min-width:768px){.xl\:article-grid{grid-template-columns:[screen-start] .25rem [screen-inset-start page-start page-inset-start body-outset-start] 1fr [body-start gutter-left-start] 1rem [body-inset-start] minmax(2ch,10ch) [middle-start] minmax(2ch,10ch) [gutter-left-end] minmax(2ch,10ch) minmax(2ch,10ch) [gutter-right-start] minmax(2ch,10ch) [middle-end] minmax(2ch,10ch) [body-inset-end] 1rem [body-end gutter-right-end] 1fr [body-outset-end page-inset-end page-end screen-inset-end] .25rem [screen-end]}}@media (min-width:1024px){.xl\:article-grid{grid-template-columns:[screen-start] .25rem [screen-inset-start page-start] 1rem [page-inset-start body-outset-start] 1fr [body-start gutter-left-start] 1rem [body-inset-start] minmax(8ch,10ch) [middle-start] minmax(8ch,10ch) [gutter-left-end] minmax(8ch,10ch) minmax(8ch,10ch) [gutter-right-start] minmax(8ch,10ch) [middle-end] minmax(8ch,10ch) [body-inset-end] 1rem [body-end gutter-right-end] 3rem [body-outset-end] minmax(5rem,13rem) [page-inset-end] 3rem [page-end] 1fr [screen-inset-end] .25rem [screen-end]}}@media (min-width:1280px){.xl\:article-grid{grid-template-columns:[screen-start] .25rem [screen-inset-start] 1fr [page-start] 3rem [page-inset-start] minmax(4rem,9rem) [body-outset-start] 3rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(8ch,10ch) [middle-start] minmax(8ch,10ch) [gutter-left-end] minmax(8ch,10ch) minmax(8ch,10ch) [gutter-right-start] minmax(8ch,10ch) [middle-end] minmax(8ch,10ch) [body-inset-end] 1rem [body-end gutter-right-end] 3rem [body-outset-end] minmax(5rem,13rem) [page-inset-end] 3rem [page-end] 1fr [screen-inset-end] .25rem [screen-end]}}@media (min-width:1536px){.xl\:article-grid{grid-template-columns:[screen-start] .5rem [screen-inset-start] 1fr [page-start] 3rem [page-inset-start] minmax(4rem,9rem) [body-outset-start] 3rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(8ch,10ch) [middle-start] minmax(8ch,10ch) [gutter-left-end] minmax(8ch,10ch) minmax(8ch,10ch) [gutter-right-start] minmax(8ch,10ch) [middle-end] 
minmax(8ch,10ch) [body-inset-end] 1rem [body-end gutter-right-end] 3rem [body-outset-end] minmax(5rem,13rem) [page-inset-end] 3rem [page-end] 1fr [screen-inset-end] .5rem [screen-end]}}.xl\:article-left-grid{display:grid;grid-template-columns:[screen-start screen-inset-start] .5rem [page-start page-inset-start body-outset-start body-start gutter-left-start body-inset-start middle-start] 1fr 1fr [gutter-left-end] 1fr 1fr [gutter-right-start] 1fr 1fr [middle-end body-inset-end body-end gutter-right-end body-outset-end page-inset-end page-end] .5rem [screen-inset-end screen-end];align-content:flex-start}@media (min-width:768px){.xl\:article-left-grid{grid-template-columns:[screen-start] .5rem [screen-inset-start page-start page-inset-start body-outset-start] 1rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(5ch,15ch) [middle-start] minmax(5ch,15ch) [gutter-left-end] minmax(5ch,15ch) minmax(5ch,15ch) [gutter-right-start] minmax(5ch,15ch) [middle-end] minmax(5ch,15ch) [body-inset-end] 1rem [body-end gutter-right-end body-outset-end page-inset-end] 1rem [page-end screen-inset-end] .5rem [screen-end]}}@media (min-width:1024px){.xl\:article-left-grid{grid-template-columns:[screen-start] .5rem [screen-inset-start page-start page-inset-start body-outset-start] 1rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(5ch,12ch) [middle-start] minmax(5ch,12ch) [gutter-left-end] minmax(5ch,12ch) minmax(5ch,12ch) [gutter-right-start] minmax(5ch,12ch) [middle-end] minmax(5ch,12ch) [body-inset-end] 1rem [body-end] 1fr [gutter-right-end] 1rem [body-outset-end] minmax(10rem,18rem) [page-inset-end] 1rem [page-end] 1fr [screen-inset-end] .5rem [screen-end]}}@media (min-width:1280px){.xl\:article-left-grid{grid-template-columns:[screen-start] .5rem [screen-inset-start page-start page-inset-start body-outset-start] 3rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(8ch,12ch) [middle-start] minmax(8ch,12ch) [gutter-left-end] minmax(8ch,12ch) minmax(8ch,12ch) [gutter-right-start] minmax(8ch,12ch) [middle-end] minmax(8ch,12ch) [body-inset-end] 1rem [body-end] 1fr [gutter-right-end] 1rem [body-outset-end] minmax(10rem,18rem) [page-inset-end] 1rem [page-end] 1fr [screen-inset-end] .5rem [screen-end]}}@media (min-width:1536px){.xl\:article-left-grid{grid-template-columns:[screen-start] .5rem [screen-inset-start] 1fr [page-start page-inset-start body-outset-start] 3rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(8ch,12ch) [middle-start] minmax(8ch,12ch) [gutter-left-end] minmax(8ch,12ch) minmax(8ch,12ch) [gutter-right-start] minmax(8ch,12ch) [middle-end] minmax(8ch,12ch) [body-inset-end] 1rem [body-end] 1fr [gutter-right-end] 1rem [body-outset-end] minmax(10rem,18rem) [page-inset-end] 1rem [page-end] 1fr [screen-inset-end] .5rem [screen-end]}}.xl\:article-center-grid{display:grid;grid-template-columns:[screen-start screen-inset-start] .5rem [page-start page-inset-start body-outset-start body-start gutter-left-start body-inset-start middle-start] 1fr 1fr [gutter-left-end] 1fr 1fr [gutter-right-start] 1fr 1fr [middle-end body-inset-end body-end gutter-right-end body-outset-end page-inset-end page-end] .5rem [screen-inset-end screen-end];align-content:flex-start}@media (min-width:768px){.xl\:article-center-grid{grid-template-columns:[screen-start] .25rem [screen-inset-start page-start page-inset-start body-outset-start] 1fr [body-start gutter-left-start] 1rem [body-inset-start] minmax(2ch,10ch) [middle-start] minmax(2ch,10ch) [gutter-left-end] minmax(2ch,10ch) 
6}.float-right{float:right}.m-0{margin:0}.m-1{margin:.25rem}.mx-1{margin-left:.25rem;margin-right:.25rem}.mx-2{margin-left:.5rem;margin-right:.5rem}.mx-3{margin-left:.75rem;margin-right:.75rem}.mx-auto{margin-left:auto;margin-right:auto}.my-1{margin-top:.25rem;margin-bottom:.25rem}.my-10{margin-top:2.5rem;margin-bottom:2.5rem}.my-2{margin-top:.5rem;margin-bottom:.5rem}.my-3{margin-top:.75rem;margin-bottom:.75rem}.my-4{margin-top:1rem;margin-bottom:1rem}.my-5{margin-top:1.25rem;margin-bottom:1.25rem}.my-8,.my-\[2rem\]{margin-top:2rem;margin-bottom:2rem}.-mr-1{margin-right:-.25rem}.mb-0{margin-bottom:0}.mb-1{margin-bottom:.25rem}.mb-10{margin-bottom:2.5rem}.mb-2{margin-bottom:.5rem}.mb-2\.5{margin-bottom:.625rem}.mb-3{margin-bottom:.75rem}.mb-4{margin-bottom:1rem}.mb-5{margin-bottom:1.25rem}.mb-8{margin-bottom:2rem}.mb-\[1rem\]{margin-bottom:1rem}.ml-1{margin-left:.25rem}.ml-2{margin-left:.5rem}.ml-3{margin-left:.75rem}.ml-4{margin-left:1rem}.mr-1{margin-right:.25rem}.mr-2{margin-right:.5rem}.mr-3{margin-right:.75rem}.mt-0{margin-top:0}.mt-0\.5{margin-top:.125rem}.mt-1{margin-top:.25rem}.mt-2{margin-top:.5rem}.mt-3{margin-top:.75rem}.mt-4{margin-top:1rem}.mt-5{margin-top:1.25rem}.mt-9{margin-top:2.25rem}.\!block{display:block!important}.block{display:block}.inline-block{display:inline-block}.inline{display:inline}.flex{display:flex}.inline-flex{display:inline-flex}.table{display:table}.grid{display:grid}.contents{display:contents}.\!hidden{display:none!important}.hidden{display:none}.aspect-square{aspect-ratio:1/1}.h-0{height:0}.h-10{height:2.5rem}.h-11{height:2.75rem}.h-4{height:1rem}.h-5{height:1.25rem}.h-6{height:1.5rem}.h-8{height:2rem}.h-9{height:2.25rem}.h-\[0px\]{height:0}.h-\[10px\]{height:10px}.h-\[150px\]{height:150px}.h-\[22px\]{height:22px}.h-\[2px\]{height:2px}.h-\[60px\]{height:60px}.h-full{height:100%}.h-screen{height:100vh}.max-h-\[15rem\]{max-height:15rem}.max-h-\[300px\]{max-height:300px}.max-h-\[4rem\]{max-height:4rem}.max-h-\[5rem\]{max-height:5rem}.min-h-1{min-height:.25rem}.min-h-\[2em\]{min-height:2em}.w-10{width:2.5rem}.w-4{width:1rem}.w-48{width:12rem}.w-5{width:1.25rem}.w-6{width:1.5rem}.w-8{width:2rem}.w-\[0px\]{width:0}.w-\[10px\]{width:10px}.w-\[22px\]{width:22px}.w-\[300px\]{width:300px}.w-\[400px\]{width:400px}.w-\[500px\]{width:500px}.w-auto{width:auto}.w-fit{width:-moz-fit-content;width:fit-content}.w-full{width:100%}.w-max{width:-moz-max-content;width:max-content}.w-screen{width:100vw}.min-w-0{min-width:0}.min-w-\[400px\]{min-width:400px}.max-w-\[1440px\]{max-width:1440px}.max-w-\[200px\]{max-width:200px}.max-w-\[350px\]{max-width:350px}.max-w-\[80vw\]{max-width:80vw}.max-w-\[90\%\]{max-width:90%}.max-w-full{max-width:100%}.flex-1{flex:1 1 0%}.flex-none{flex:none}.shrink-0{flex-shrink:0}.flex-grow,.grow{flex-grow:1}.grow-0{flex-grow:0}.border-collapse{border-collapse:collapse}.origin-top-left{transform-origin:top left}.origin-top-right{transform-origin:top right}.-translate-y-\[0\.1em\]{--tw-translate-y:-.1em;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.-translate-y-\[1px\],.-translate-y-px{--tw-translate-y:-1px}.-translate-y-\[1px\],.-translate-y-px,.translate-y-2{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) 
scaleY(var(--tw-scale-y))}.translate-y-2{--tw-translate-y:.5rem}.translate-y-6{--tw-translate-y:1.5rem}.scale-100,.translate-y-6{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.scale-100{--tw-scale-x:1;--tw-scale-y:1}.scale-95{--tw-scale-x:.95;--tw-scale-y:.95}.scale-95,.scale-x-100{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.scale-x-100{--tw-scale-x:1}.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}@keyframes load{0%{width:0}to{width:50%}}.animate-load{animation:load 2.5s ease-out}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}@keyframes spin{to{transform:rotate(1turn)}}.animate-spin{animation:spin 1s linear infinite}.cursor-help{cursor:help}.cursor-not-allowed{cursor:not-allowed}.cursor-pointer{cursor:pointer}.select-none{-webkit-user-select:none;-moz-user-select:none;user-select:none}.resize-none{resize:none}.resize{resize:both}.list-none{list-style-type:none}.grid-cols-1{grid-template-columns:repeat(1,minmax(0,1fr))}.grid-cols-10{grid-template-columns:repeat(10,minmax(0,1fr))}.grid-cols-11{grid-template-columns:repeat(11,minmax(0,1fr))}.grid-cols-12{grid-template-columns:repeat(12,minmax(0,1fr))}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-3{grid-template-columns:repeat(3,minmax(0,1fr))}.grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}.grid-cols-5{grid-template-columns:repeat(5,minmax(0,1fr))}.grid-cols-6{grid-template-columns:repeat(6,minmax(0,1fr))}.grid-cols-7{grid-template-columns:repeat(7,minmax(0,1fr))}.grid-cols-8{grid-template-columns:repeat(8,minmax(0,1fr))}.grid-cols-9{grid-template-columns:repeat(9,minmax(0,1fr))}.grid-rows-\[3rem_1fr\]{grid-template-rows:3rem 1fr}.flex-row{flex-direction:row}.flex-col{flex-direction:column}.flex-wrap{flex-wrap:wrap}.flex-nowrap{flex-wrap:nowrap}.items-center{align-items:center}.items-stretch{align-items:stretch}.justify-start{justify-content:flex-start}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-0{gap:0}.gap-1{gap:.25rem}.gap-2{gap:.5rem}.gap-2\.5{gap:.625rem}.gap-4{gap:1rem}.gap-x-1{-moz-column-gap:.25rem;column-gap:.25rem}.gap-y-1{row-gap:.25rem}.gap-y-2{row-gap:.5rem}.space-x-1>:not([hidden])~:not([hidden]){--tw-space-x-reverse:0;margin-right:calc(.25rem*var(--tw-space-x-reverse));margin-left:calc(.25rem*(1 - var(--tw-space-x-reverse)))}.space-x-4>:not([hidden])~:not([hidden]){--tw-space-x-reverse:0;margin-right:calc(1rem*var(--tw-space-x-reverse));margin-left:calc(1rem*(1 - var(--tw-space-x-reverse)))}.divide-y>:not([hidden])~:not([hidden]){--tw-divide-y-reverse:0;border-top-width:calc(1px*(1 - var(--tw-divide-y-reverse)));border-bottom-width:calc(1px*var(--tw-divide-y-reverse))}.divide-gray-100>:not([hidden])~:not([hidden]){--tw-divide-opacity:1;border-color:rgb(243 244 
246/var(--tw-divide-opacity))}.self-start{align-self:flex-start}.self-center{align-self:center}.overflow-auto{overflow:auto}.overflow-hidden{overflow:hidden}.overflow-x-auto{overflow-x:auto}.overflow-y-auto{overflow-y:auto}.overflow-y-hidden{overflow-y:hidden}.overflow-y-visible{overflow-y:visible}.overflow-y-scroll{overflow-y:scroll}.truncate{overflow:hidden;white-space:nowrap}.text-ellipsis,.truncate{text-overflow:ellipsis}.whitespace-pre-wrap{white-space:pre-wrap}.break-words{overflow-wrap:break-word}.rounded{border-radius:.25rem}.rounded-full{border-radius:9999px}.rounded-lg{border-radius:.5rem}.rounded-md{border-radius:.375rem}.rounded-sm{border-radius:.125rem}.border{border-width:1px}.border-y{border-top-width:1px}.border-b,.border-y{border-bottom-width:1px}.border-b-2{border-bottom-width:2px}.border-l{border-left-width:1px}.border-l-2{border-left-width:2px}.border-l-4{border-left-width:4px}.border-r{border-right-width:1px}.border-t{border-top-width:1px}.border-solid{border-style:solid}.border-dotted{border-style:dotted}.border-amber-500\/70{border-color:#f59e0bb3}.border-amber-600{--tw-border-opacity:1;border-color:rgb(217 119 6/var(--tw-border-opacity))}.border-blue-500{--tw-border-opacity:1;border-color:rgb(59 130 246/var(--tw-border-opacity))}.border-blue-500\/60{border-color:#3b82f699}.border-gray-100{--tw-border-opacity:1;border-color:rgb(243 244 246/var(--tw-border-opacity))}.border-gray-200{--tw-border-opacity:1;border-color:rgb(229 231 235/var(--tw-border-opacity))}.border-gray-300{--tw-border-opacity:1;border-color:rgb(209 213 219/var(--tw-border-opacity))}.border-gray-500\/60{border-color:#6b728099}.border-gray-800{--tw-border-opacity:1;border-color:rgb(31 41 55/var(--tw-border-opacity))}.border-green-500\/60{border-color:#22c55e99}.border-green-600{--tw-border-opacity:1;border-color:rgb(22 163 74/var(--tw-border-opacity))}.border-orange-500\/60{border-color:#f9731699}.border-purple-500\/60{border-color:#a855f799}.border-red-400{--tw-border-opacity:1;border-color:rgb(248 113 113/var(--tw-border-opacity))}.border-red-500{--tw-border-opacity:1;border-color:rgb(239 68 68/var(--tw-border-opacity))}.border-red-500\/60{border-color:#ef444499}.border-red-600{--tw-border-opacity:1;border-color:rgb(220 38 38/var(--tw-border-opacity))}.border-slate-400{--tw-border-opacity:1;border-color:rgb(148 163 184/var(--tw-border-opacity))}.border-slate-600{--tw-border-opacity:1;border-color:rgb(71 85 105/var(--tw-border-opacity))}.border-stone-200{--tw-border-opacity:1;border-color:rgb(231 229 228/var(--tw-border-opacity))}.border-stone-300{--tw-border-opacity:1;border-color:rgb(214 211 209/var(--tw-border-opacity))}.border-stone-400{--tw-border-opacity:1;border-color:rgb(168 162 158/var(--tw-border-opacity))}.border-stone-700{--tw-border-opacity:1;border-color:rgb(68 64 60/var(--tw-border-opacity))}.border-b-blue-600{--tw-border-opacity:1;border-bottom-color:rgb(37 99 235/var(--tw-border-opacity))}.border-b-gray-100{--tw-border-opacity:1;border-bottom-color:rgb(243 244 246/var(--tw-border-opacity))}.border-l-blue-400{--tw-border-opacity:1;border-left-color:rgb(96 165 250/var(--tw-border-opacity))}.border-l-blue-500{--tw-border-opacity:1;border-left-color:rgb(59 130 246/var(--tw-border-opacity))}.border-l-gray-300{--tw-border-opacity:1;border-left-color:rgb(209 213 219/var(--tw-border-opacity))}.border-l-gray-50{--tw-border-opacity:1;border-left-color:rgb(249 250 251/var(--tw-border-opacity))}.bg-\[\#656c85cc\]{background-color:#656c85cc}.bg-amber-50{--tw-bg-opacity:1;background-color:rgb(255 
251 235/var(--tw-bg-opacity))}.bg-amber-50\/80{background-color:#fffbebcc}.bg-black{--tw-bg-opacity:1;background-color:rgb(0 0 0/var(--tw-bg-opacity))}.bg-black\/80{background-color:#000c}.bg-blue-300\/30{background-color:#93c5fd4d}.bg-blue-50{--tw-bg-opacity:1;background-color:rgb(239 246 255/var(--tw-bg-opacity))}.bg-blue-50\/80{background-color:#eff6ffcc}.bg-blue-500{--tw-bg-opacity:1;background-color:rgb(59 130 246/var(--tw-bg-opacity))}.bg-blue-800{--tw-bg-opacity:1;background-color:rgb(30 64 175/var(--tw-bg-opacity))}.bg-blue-900{--tw-bg-opacity:1;background-color:rgb(30 58 138/var(--tw-bg-opacity))}.bg-gray-100{--tw-bg-opacity:1;background-color:rgb(243 244 246/var(--tw-bg-opacity))}.bg-gray-100\/80{background-color:#f3f4f6cc}.bg-gray-50{--tw-bg-opacity:1;background-color:rgb(249 250 251/var(--tw-bg-opacity))}.bg-gray-50\/10{background-color:#f9fafb1a}.bg-gray-50\/80{background-color:#f9fafbcc}.bg-green-50{--tw-bg-opacity:1;background-color:rgb(240 253 244/var(--tw-bg-opacity))}.bg-green-50\/80{background-color:#f0fdf4cc}.bg-inherit{background-color:inherit}.bg-orange-50\/80{background-color:#fff7edcc}.bg-orange-500{--tw-bg-opacity:1;background-color:rgb(249 115 22/var(--tw-bg-opacity))}.bg-orange-700{--tw-bg-opacity:1;background-color:rgb(194 65 12/var(--tw-bg-opacity))}.bg-purple-50\/80{background-color:#faf5ffcc}.bg-red-50{--tw-bg-opacity:1;background-color:rgb(254 242 242/var(--tw-bg-opacity))}.bg-red-50\/80{background-color:#fef1f1cc}.bg-red-500{--tw-bg-opacity:1;background-color:rgb(239 68 68/var(--tw-bg-opacity))}.bg-red-800{--tw-bg-opacity:1;background-color:rgb(153 27 27/var(--tw-bg-opacity))}.bg-slate-100{--tw-bg-opacity:1;background-color:rgb(241 245 249/var(--tw-bg-opacity))}.bg-slate-200{--tw-bg-opacity:1;background-color:rgb(226 232 240/var(--tw-bg-opacity))}.bg-slate-300\/30{background-color:#cbd5e14d}.bg-slate-50{--tw-bg-opacity:1;background-color:rgb(248 250 252/var(--tw-bg-opacity))}.bg-slate-50\/50{background-color:#f8fafc80}.bg-slate-500{--tw-bg-opacity:1;background-color:rgb(100 116 139/var(--tw-bg-opacity))}.bg-slate-700{--tw-bg-opacity:1;background-color:rgb(51 65 85/var(--tw-bg-opacity))}.bg-slate-800{--tw-bg-opacity:1;background-color:rgb(30 41 59/var(--tw-bg-opacity))}.bg-slate-900{--tw-bg-opacity:1;background-color:rgb(15 23 42/var(--tw-bg-opacity))}.bg-stone-200\/10{background-color:#e7e5e41a}.bg-stone-700{--tw-bg-opacity:1;background-color:rgb(68 64 60/var(--tw-bg-opacity))}.bg-stone-900{--tw-bg-opacity:1;background-color:rgb(28 25 23/var(--tw-bg-opacity))}.bg-transparent{background-color:transparent}.bg-white{--tw-bg-opacity:1;background-color:rgb(255 255 
255/var(--tw-bg-opacity))}.bg-white\/80{background-color:#fffc}.bg-cover{background-size:cover}.bg-top{background-position:top}.bg-no-repeat{background-repeat:no-repeat}.fill-blue-900{fill:#1e3a8a}.fill-green-600{fill:#16a34a}.fill-white{fill:#fff}.object-cover{-o-object-fit:cover;object-fit:cover}.object-left{-o-object-position:left;object-position:left}.object-top{-o-object-position:top;object-position:top}.p-0\.5{padding:.125rem}.p-1{padding:.25rem}.p-2{padding:.5rem}.p-2\.5{padding:.625rem}.p-3{padding:.75rem}.p-4{padding:1rem}.p-5{padding:1.25rem}.p-6{padding:1.5rem}.px-1{padding-left:.25rem;padding-right:.25rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-4{padding-left:1rem;padding-right:1rem}.px-6{padding-left:1.5rem;padding-right:1.5rem}.py-0\.5{padding-top:.125rem;padding-bottom:.125rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.py-3{padding-top:.75rem;padding-bottom:.75rem}.py-6{padding-top:1.5rem;padding-bottom:1.5rem}.pb-2{padding-bottom:.5rem}.pb-6{padding-bottom:1.5rem}.pb-\[1rem\]{padding-bottom:1rem}.pl-0{padding-left:0}.pl-10{padding-left:2.5rem}.pl-12{padding-left:3rem}.pl-2{padding-left:.5rem}.pl-3{padding-left:.75rem}.pl-4{padding-left:1rem}.pl-8{padding-left:2rem}.pr-2{padding-right:.5rem}.pr-\[2px\]{padding-right:2px}.ps-10{padding-inline-start:2.5rem}.pt-10{padding-top:2.5rem}.pt-3{padding-top:.75rem}.pt-4{padding-top:1rem}.pt-6{padding-top:1.5rem}.pt-9{padding-top:2.25rem}.pt-\[40px\]{padding-top:40px}.pt-\[80px\]{padding-top:80px}.text-left{text-align:left}.text-center{text-align:center}.text-right{text-align:right}.align-top{vertical-align:top}.align-middle{vertical-align:middle}.font-mono{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace}.text-\[15px\]{font-size:15px}.text-base{font-size:1rem;line-height:1.5rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xl{font-size:1.25rem;line-height:1.75rem}.text-xs{font-size:.75rem;line-height:1rem}.font-bold{font-weight:700}.font-extralight{font-weight:200}.font-light{font-weight:300}.font-medium{font-weight:500}.font-normal{font-weight:400}.font-semibold{font-weight:600}.font-thin{font-weight:100}.uppercase{text-transform:uppercase}.capitalize{text-transform:capitalize}.italic{font-style:italic}.leading-3{line-height:.75rem}.leading-6{line-height:1.5rem}.leading-\[0\]{line-height:0}.leading-\[19px\]{line-height:19px}.leading-none{line-height:1}.tracking-tight{letter-spacing:-.025em}.text-amber-600{--tw-text-opacity:1;color:rgb(217 119 6/var(--tw-text-opacity))}.text-black{--tw-text-opacity:1;color:rgb(0 0 0/var(--tw-text-opacity))}.text-blue-200{--tw-text-opacity:1;color:rgb(191 219 254/var(--tw-text-opacity))}.text-blue-400{--tw-text-opacity:1;color:rgb(96 165 250/var(--tw-text-opacity))}.text-blue-500{--tw-text-opacity:1;color:rgb(59 130 246/var(--tw-text-opacity))}.text-blue-600{--tw-text-opacity:1;color:rgb(37 99 235/var(--tw-text-opacity))}.text-blue-800{--tw-text-opacity:1;color:rgb(30 64 175/var(--tw-text-opacity))}.text-gray-100{--tw-text-opacity:1;color:rgb(243 244 246/var(--tw-text-opacity))}.text-gray-200{--tw-text-opacity:1;color:rgb(229 231 235/var(--tw-text-opacity))}.text-gray-400{--tw-text-opacity:1;color:rgb(156 163 175/var(--tw-text-opacity))}.text-gray-500{--tw-text-opacity:1;color:rgb(107 114 128/var(--tw-text-opacity))}.text-gray-600{--tw-text-opacity:1;color:rgb(75 85 
99/var(--tw-text-opacity))}.text-gray-700{--tw-text-opacity:1;color:rgb(55 65 81/var(--tw-text-opacity))}.text-gray-900{--tw-text-opacity:1;color:rgb(17 24 39/var(--tw-text-opacity))}.text-green-500{--tw-text-opacity:1;color:rgb(34 197 94/var(--tw-text-opacity))}.text-green-600{--tw-text-opacity:1;color:rgb(22 163 74/var(--tw-text-opacity))}.text-green-700{--tw-text-opacity:1;color:rgb(21 128 61/var(--tw-text-opacity))}.text-inherit{color:inherit}.text-neutral-700{--tw-text-opacity:1;color:rgb(64 64 64/var(--tw-text-opacity))}.text-neutral-900{--tw-text-opacity:1;color:rgb(23 23 23/var(--tw-text-opacity))}.text-orange-600{--tw-text-opacity:1;color:rgb(234 88 12/var(--tw-text-opacity))}.text-purple-600{--tw-text-opacity:1;color:rgb(147 51 234/var(--tw-text-opacity))}.text-purple-700{--tw-text-opacity:1;color:rgb(126 34 206/var(--tw-text-opacity))}.text-red-500{--tw-text-opacity:1;color:rgb(239 68 68/var(--tw-text-opacity))}.text-red-600{--tw-text-opacity:1;color:rgb(220 38 38/var(--tw-text-opacity))}.text-slate-300{--tw-text-opacity:1;color:rgb(203 213 225/var(--tw-text-opacity))}.text-slate-400{--tw-text-opacity:1;color:rgb(148 163 184/var(--tw-text-opacity))}.text-slate-50{--tw-text-opacity:1;color:rgb(248 250 252/var(--tw-text-opacity))}.text-slate-500{--tw-text-opacity:1;color:rgb(100 116 139/var(--tw-text-opacity))}.text-slate-600{--tw-text-opacity:1;color:rgb(71 85 105/var(--tw-text-opacity))}.text-slate-700{--tw-text-opacity:1;color:rgb(51 65 85/var(--tw-text-opacity))}.text-slate-900{--tw-text-opacity:1;color:rgb(15 23 42/var(--tw-text-opacity))}.text-stone-100{--tw-text-opacity:1;color:rgb(245 245 244/var(--tw-text-opacity))}.text-stone-200{--tw-text-opacity:1;color:rgb(231 229 228/var(--tw-text-opacity))}.text-stone-500{--tw-text-opacity:1;color:rgb(120 113 108/var(--tw-text-opacity))}.text-stone-700{--tw-text-opacity:1;color:rgb(68 64 60/var(--tw-text-opacity))}.text-stone-800{--tw-text-opacity:1;color:rgb(41 37 36/var(--tw-text-opacity))}.text-stone-900{--tw-text-opacity:1;color:rgb(28 25 23/var(--tw-text-opacity))}.text-violet-200{--tw-text-opacity:1;color:rgb(221 214 254/var(--tw-text-opacity))}.text-white{--tw-text-opacity:1;color:rgb(255 255 255/var(--tw-text-opacity))}.text-yellow-600{--tw-text-opacity:1;color:rgb(202 138 4/var(--tw-text-opacity))}.text-zinc-600{--tw-text-opacity:1;color:rgb(82 82 91/var(--tw-text-opacity))}.underline{text-decoration-line:underline}.no-underline{text-decoration-line:none}.placeholder-gray-400::-moz-placeholder{--tw-placeholder-opacity:1;color:rgb(156 163 175/var(--tw-placeholder-opacity))}.placeholder-gray-400::placeholder{--tw-placeholder-opacity:1;color:rgb(156 163 175/var(--tw-placeholder-opacity))}.opacity-0{opacity:0}.opacity-10{opacity:.1}.opacity-100{opacity:1}.opacity-50{opacity:.5}.opacity-60{opacity:.6}.opacity-70{opacity:.7}.opacity-80{opacity:.8}.opacity-90{opacity:.9}.shadow{--tw-shadow:0 1px 3px 0 rgba(0,0,0,.1),0 1px 2px -1px rgba(0,0,0,.1);--tw-shadow-colored:0 1px 3px 0 var(--tw-shadow-color),0 1px 2px -1px var(--tw-shadow-color)}.shadow,.shadow-2xl{box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.shadow-2xl{--tw-shadow:0 25px 50px -12px rgba(0,0,0,.25);--tw-shadow-colored:0 25px 50px -12px var(--tw-shadow-color)}.shadow-\[0px_2px_0px_0px_rgba\(0\,0\,0\,0\.08\)\]{--tw-shadow:0px 2px 0px 0px rgba(0,0,0,.08);--tw-shadow-colored:0px 2px 0px 0px var(--tw-shadow-color)}.shadow-\[0px_2px_0px_0px_rgba\(0\,0\,0\,0\.08\)\],.shadow-inner{box-shadow:var(--tw-ring-offset-shadow,0 0 
#0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.shadow-inner{--tw-shadow:inset 0 2px 4px 0 rgba(0,0,0,.05);--tw-shadow-colored:inset 0 2px 4px 0 var(--tw-shadow-color)}.shadow-lg{--tw-shadow:0 10px 15px -3px rgba(0,0,0,.1),0 4px 6px -4px rgba(0,0,0,.1);--tw-shadow-colored:0 10px 15px -3px var(--tw-shadow-color),0 4px 6px -4px var(--tw-shadow-color)}.shadow-lg,.shadow-md{box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.shadow-md{--tw-shadow:0 4px 6px -1px rgba(0,0,0,.1),0 2px 4px -2px rgba(0,0,0,.1);--tw-shadow-colored:0 4px 6px -1px var(--tw-shadow-color),0 2px 4px -2px var(--tw-shadow-color)}.shadow-sm{--tw-shadow:0 1px 2px 0 rgba(0,0,0,.05);--tw-shadow-colored:0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.outline-none{outline:2px solid transparent;outline-offset:2px}.\!outline{outline-style:solid!important}.outline{outline-style:solid}.outline-2{outline-width:2px}.outline-blue-200{outline-color:#bfdbfe}.ring-1{--tw-ring-offset-shadow:var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow:var(--tw-ring-inset) 0 0 0 calc(1px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow,0 0 #0000)}.ring-black{--tw-ring-opacity:1;--tw-ring-color:rgb(0 0 0/var(--tw-ring-opacity))}.ring-blue-500{--tw-ring-opacity:1;--tw-ring-color:rgb(59 130 246/var(--tw-ring-opacity))}.ring-opacity-5{--tw-ring-opacity:.05}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.backdrop-blur{--tw-backdrop-blur:blur(8px);-webkit-backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia);backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) 
var(--tw-backdrop-sepia)}.transition{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-all{transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-colors{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-opacity{transition-property:opacity;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-transform{transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.duration-100{transition-duration:.1s}.duration-200{transition-duration:.2s}.duration-300{transition-duration:.3s}.duration-500{transition-duration:.5s}.duration-700{transition-duration:.7s}.duration-75{transition-duration:75ms}.ease-in{transition-timing-function:cubic-bezier(.4,0,1,1)}.ease-in-out{transition-timing-function:cubic-bezier(.4,0,.2,1)}.ease-out{transition-timing-function:cubic-bezier(0,0,.2,1)}.smallcaps{font-variant:small-caps}details>summary{list-style:none;transition:margin .15s ease-out}details>summary::-webkit-details-marker,details>summary::marker{display:none}details[open]>summary .details-toggle{transform:rotate(90deg) translate(-5px) translateY(-5px)}details[open]>summary{margin-bottom:10px}details .details-body{overflow:auto}details[open] .details-body{margin-top:-10px}cite{font-style:normal}.cite-group.parenthetical>:after{content:";\a0"}.cite-group.narrative>:after,.xref-group>:after{content:",\a0"}.cite-group>:last-of-type:after,.xref-group>:last-of-type:after{content:""}.cite-group.parenthetical:before{content:var(--cite-group-open,"(")}.cite-group.parenthetical:after{content:var(--cite-group-close,")")}.xref-group.parenthetical:before{content:var(--xref-group-open,"(")}.xref-group.parenthetical:after{content:var(--xref-group-close,")")}figure.fig-quote figcaption{text-align:right}figure.fig-quote figcaption>p:before{content:"\2014";padding-right:.5em}figure.fig-quote.pull-quote>blockquote{font-size:1.4rem}figure.fig-code>div{margin:0}figure.subcontainer{margin-bottom:0}figure.subcontainer figcaption{margin-top:.25em;text-align:center}figure figcaption>p,figure img{margin-top:0!important;margin-bottom:0!important}.text-spacer:after{content:"\a0\2219\a0"}.text-comma:after{content:",\a0\a0"}pre>code>span[data-line-number]{display:block;position:relative;line-height:1.4rem;padding-right:1rem}pre>code>span[data-highlight=true]:after{content:" ";position:absolute;right:-.8rem;top:0;width:calc(100% + 1.6rem);opacity:.1;pointer-events:none;background:#5ca5ee}pre>code>span>.linenumber{color:gray}pre>code>span[data-highlight=true]>.linenumber{color:#5ca5ee}.dark .hljs{background:#1e1e1e!important;color:#dcdcdc}.dark .hljs-keyword,.dark .hljs-literal,.dark .hljs-name,.dark .hljs-symbol{color:#569cd6}.dark .hljs-link{color:#569cd6;text-decoration:underline}.dark .hljs-built_in,.dark .hljs-type{color:#4ec9b0}.dark .hljs-class,.dark .hljs-number{color:#b8d7a3}.dark 
.hljs-meta .hljs-string,.dark .hljs-string{color:#d69d85}.dark .hljs-regexp,.dark .hljs-template-tag{color:#9a5334}.dark .hljs-formula,.dark .hljs-function,.dark .hljs-params,.dark .hljs-subst,.dark .hljs-title{color:#dcdcdc}.dark .hljs-comment,.dark .hljs-quote{color:#57a64a;font-style:italic}.dark .hljs-doctag{color:#608b4e}.dark .hljs-meta,.dark .hljs-meta .hljs-keyword,.dark .hljs-tag{color:#9b9b9b}.dark .hljs-template-variable,.dark .hljs-variable{color:#bd63c5}.dark .hljs-attr,.dark .hljs-attribute{color:#9cdcfe}.dark .hljs-section{color:gold}.dark .hljs-emphasis{font-style:italic}.dark .hljs-strong{font-weight:700}.dark .hljs-bullet,.dark .hljs-selector-attr,.dark .hljs-selector-class,.dark .hljs-selector-id,.dark .hljs-selector-pseudo,.dark .hljs-selector-tag{color:#d7ba7d}.dark .hljs-addition{background-color:#144212;display:inline-block;width:100%}.dark .hljs-deletion{background-color:#600;display:inline-block;width:100%}.dark .hljs-code{color:unset}.xml .hljs-meta{color:silver;background:transparent}.hljs-comment,.hljs-quote{color:#007400}.hljs-attribute,.hljs-keyword,.hljs-literal,.hljs-name,.hljs-selector-tag,.hljs-tag{color:#aa0d91}.hljs-template-variable,.hljs-variable{color:#3f6e74}.hljs-code,.hljs-meta .hljs-string,.hljs-string{color:#c41a16}.hljs-link,.hljs-regexp{color:#0e0eff}.hljs-bullet,.hljs-number,.hljs-symbol,.hljs-title{color:#1c00cf}.hljs-meta,.hljs-section{color:#643820}.hljs-built_in,.hljs-class .hljs-title,.hljs-params,.hljs-title.class_,.hljs-type{color:#5c2699}.hljs-attr{color:#836c28}.hljs-subst{color:#000}.hljs-formula{background-color:#eee;font-style:italic}.hljs-addition{background-color:#baeeba}.hljs-deletion{background-color:#ffc8bd}.hljs-selector-class,.hljs-selector-id{color:#9b703f}.hljs-doctag,.hljs-strong{font-weight:700}.hljs-emphasis{font-style:italic}.katex-display{margin:0!important}.katex .eqn-num{opacity:0;-webkit-user-select:none;-moz-user-select:none;user-select:none;pointer-events:none}.font-system{font-family:Menlo,Consolas,DejaVu Sans Mono,monospace}.jupyter-error{background-color:#fdd}.jp-OutputPrompt{display:none}table.dataframe{border:none;border-collapse:collapse;border-spacing:0;color:#000;font-size:1em;table-layout:fixed;margin:0!important}.dataframe thead{border-bottom:1px solid #000;vertical-align:bottom}.dataframe td,.dataframe th,.dataframe tr{text-align:right;vertical-align:middle;padding:.5em;line-height:normal;white-space:normal;max-width:none;border:none}.dataframe th{font-weight:700}.dataframe tbody tr:nth-child(odd){background:#f5f5f5}.dataframe tbody 
tr:hover{background:rgba(66,165,245,.2)}html.dark{--jp-ui-font-color0:#fff;--jp-ui-font-color1:hsla(0,0%,100%,.87);--jp-ui-font-color2:hsla(0,0%,100%,.54);--jp-ui-font-color3:hsla(0,0%,100%,.38);--jp-ui-inverse-font-color0:#000;--jp-ui-inverse-font-color1:rgba(0,0,0,.8);--jp-ui-inverse-font-color2:rgba(0,0,0,.5);--jp-ui-inverse-font-color3:rgba(0,0,0,.3);--jp-content-font-color0:#fff;--jp-content-font-color1:#fff;--jp-content-font-color2:hsla(0,0%,100%,.7);--jp-content-font-color3:hsla(0,0%,100%,.5);--jp-layout-color0:#111;--jp-layout-color1:var(--md-grey-900);--jp-layout-color2:var(--md-grey-800);--jp-layout-color3:var(--md-grey-700);--jp-layout-color4:var(--md-grey-600)}.sphinx-desc-addname,.sphinx-desc-inline,.sphinx-desc-name,.sphinx-desc-optional,.sphinx-desc-parameterlist,.sphinx-desc-returns,.sphinx-desc-sig-element,.sphinx-desc-sig-keyword,.sphinx-desc-sig-keyword-type,.sphinx-desc-sig-literal-char,.sphinx-desc-sig-literal-number,.sphinx-desc-sig-literal-string,.sphinx-desc-sig-name,.sphinx-desc-sig-operator,.sphinx-desc-sig-punctuation,.sphinx-desc-sig-space,.sphinx-desc-signature-line,.sphinx-desc-type,.sphinx-desc-type-parameter{white-space:pre}.sphinx-desc-name{font-size:1.1em;font-weight:700}.sphinx-desc-signature{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;font-weight:300}.sphinx-desc-returns:before{--tw-content:" \2192 ";content:var(--tw-content)}dl>dt:has([class^=sphinx-desc-]){font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;font-weight:300}dl>dt:has([class^=sphinx-desc-])>em{white-space:pre}dl:has(>dt>[class^=sphinx-desc-])>dd>dl:not(:has(>dt>[class^=sphinx-desc-])){display:grid;grid-template-columns:fit-content(30%) auto}dl:has(>dt>[class^=sphinx-desc-])>dd>dl:not(:has(>dt>[class^=sphinx-desc-]))>dd>p{margin:unset!important}dl:has(>dt>[class^=sphinx-desc-])>dd>dl:not(:has(>dt>[class^=sphinx-desc-]))>:is(dt,dd){margin:unset!important}.myst-grid>*{margin:0!important}.hover-card-content{animation-duration:.6s;animation-timing-function:cubic-bezier(.16,1,.3,1);z-index:10}.hover-card-content[data-side=top]{animation-name:slideUp}.hover-card-content[data-side=bottom]{animation-name:slideDown}@keyframes slideUp{0%{opacity:0;transform:translateY(10px)}to{opacity:1;transform:translateY(0)}}@keyframes slideDown{0%{opacity:0;transform:translateY(-10px)}to{opacity:1;transform:translateY(0)}}.hover-document{border-radius:.25rem;border-width:1px;--tw-border-opacity:1;border-color:rgb(249 250 251/var(--tw-border-opacity));--tw-bg-opacity:1;background-color:rgb(255 255 255/var(--tw-bg-opacity));font-size:.875rem;line-height:1.25rem;--tw-shadow:0 20px 25px -5px rgba(0,0,0,.1),0 8px 10px -6px rgba(0,0,0,.1);--tw-shadow-colored:0 20px 25px -5px var(--tw-shadow-color),0 8px 10px -6px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow,0 0 #0000),var(--tw-ring-shadow,0 0 #0000),var(--tw-shadow)}.hover-document:is(.dark *){--tw-bg-opacity:1;background-color:rgb(30 41 59/var(--tw-bg-opacity))}.hover-link{font-weight:400;--tw-text-opacity:1;color:rgb(29 78 216/var(--tw-text-opacity));text-decoration-color:#e2e8f0;text-underline-offset:2px}.hover-link:hover{--tw-text-opacity:1;color:rgb(59 130 246/var(--tw-text-opacity))}.hover-link:is(.dark *){--tw-text-opacity:1;color:rgb(219 234 
254/var(--tw-text-opacity));text-decoration-color:#475569}p[data-line-number].line:before{content:attr(data-line-number);position:absolute;left:0;font-family:monospace;width:1.25em;text-align:right;-webkit-user-select:none;-moz-user-select:none;user-select:none;color:gray;overflow:hidden}p.line{position:relative;margin:0}.collapsible-content{overflow:hidden}.collapsible-content[data-state=open]{animation:open-content .3s ease-out}.collapsible-content[data-state=closed]{animation:close-content .3s ease-out}@keyframes open-content{0%{height:0}to{height:var(--radix-collapsible-content-height)}}@keyframes close-content{0%{height:var(--radix-collapsible-content-height)}to{height:0}}input[type=search]::-webkit-search-cancel-button,input[type=search]::-webkit-search-decoration,input[type=search]::-webkit-search-results-button,input[type=search]::-webkit-search-results-decoration{display:none}@media (min-width:1024px){.lg\:col-margin-right{grid-column:body}@media (min-width:1024px){.lg\:col-margin-right{grid-column:body-end/page-end}}}@media (min-width:1280px){.xl\:article-grid{display:grid;grid-template-columns:[screen-start screen-inset-start] .5rem [page-start page-inset-start body-outset-start body-start gutter-left-start body-inset-start middle-start] 1fr 1fr [gutter-left-end] 1fr 1fr [gutter-right-start] 1fr 1fr [middle-end body-inset-end body-end gutter-right-end body-outset-end page-inset-end page-end] .5rem [screen-inset-end screen-end];align-content:flex-start}@media (min-width:768px){.xl\:article-grid{grid-template-columns:[screen-start] .25rem [screen-inset-start page-start page-inset-start body-outset-start] 1fr [body-start gutter-left-start] 1rem [body-inset-start] minmax(2ch,10ch) [middle-start] minmax(2ch,10ch) [gutter-left-end] minmax(2ch,10ch) minmax(2ch,10ch) [gutter-right-start] minmax(2ch,10ch) [middle-end] minmax(2ch,10ch) [body-inset-end] 1rem [body-end gutter-right-end] 1fr [body-outset-end page-inset-end page-end screen-inset-end] .25rem [screen-end]}}@media (min-width:1024px){.xl\:article-grid{grid-template-columns:[screen-start] .25rem [screen-inset-start page-start] 1rem [page-inset-start body-outset-start] 1fr [body-start gutter-left-start] 1rem [body-inset-start] minmax(8ch,10ch) [middle-start] minmax(8ch,10ch) [gutter-left-end] minmax(8ch,10ch) minmax(8ch,10ch) [gutter-right-start] minmax(8ch,10ch) [middle-end] minmax(8ch,10ch) [body-inset-end] 1rem [body-end gutter-right-end] 3rem [body-outset-end] minmax(5rem,13rem) [page-inset-end] 3rem [page-end] 1fr [screen-inset-end] .25rem [screen-end]}}@media (min-width:1280px){.xl\:article-grid{grid-template-columns:[screen-start] .25rem [screen-inset-start] 1fr [page-start] 3rem [page-inset-start] minmax(4rem,9rem) [body-outset-start] 3rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(8ch,10ch) [middle-start] minmax(8ch,10ch) [gutter-left-end] minmax(8ch,10ch) minmax(8ch,10ch) [gutter-right-start] minmax(8ch,10ch) [middle-end] minmax(8ch,10ch) [body-inset-end] 1rem [body-end gutter-right-end] 3rem [body-outset-end] minmax(5rem,13rem) [page-inset-end] 3rem [page-end] 1fr [screen-inset-end] .25rem [screen-end]}}@media (min-width:1536px){.xl\:article-grid{grid-template-columns:[screen-start] .5rem [screen-inset-start] 1fr [page-start] 3rem [page-inset-start] minmax(4rem,9rem) [body-outset-start] 3rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(8ch,10ch) [middle-start] minmax(8ch,10ch) [gutter-left-end] minmax(8ch,10ch) minmax(8ch,10ch) [gutter-right-start] minmax(8ch,10ch) [middle-end] 
minmax(8ch,10ch) [body-inset-end] 1rem [body-end gutter-right-end] 3rem [body-outset-end] minmax(5rem,13rem) [page-inset-end] 3rem [page-end] 1fr [screen-inset-end] .5rem [screen-end]}}.xl\:article-left-grid{display:grid;grid-template-columns:[screen-start screen-inset-start] .5rem [page-start page-inset-start body-outset-start body-start gutter-left-start body-inset-start middle-start] 1fr 1fr [gutter-left-end] 1fr 1fr [gutter-right-start] 1fr 1fr [middle-end body-inset-end body-end gutter-right-end body-outset-end page-inset-end page-end] .5rem [screen-inset-end screen-end];align-content:flex-start}@media (min-width:768px){.xl\:article-left-grid{grid-template-columns:[screen-start] .5rem [screen-inset-start page-start page-inset-start body-outset-start] 1rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(5ch,15ch) [middle-start] minmax(5ch,15ch) [gutter-left-end] minmax(5ch,15ch) minmax(5ch,15ch) [gutter-right-start] minmax(5ch,15ch) [middle-end] minmax(5ch,15ch) [body-inset-end] 1rem [body-end gutter-right-end body-outset-end page-inset-end] 1rem [page-end screen-inset-end] .5rem [screen-end]}}@media (min-width:1024px){.xl\:article-left-grid{grid-template-columns:[screen-start] .5rem [screen-inset-start page-start page-inset-start body-outset-start] 1rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(5ch,12ch) [middle-start] minmax(5ch,12ch) [gutter-left-end] minmax(5ch,12ch) minmax(5ch,12ch) [gutter-right-start] minmax(5ch,12ch) [middle-end] minmax(5ch,12ch) [body-inset-end] 1rem [body-end] 1fr [gutter-right-end] 1rem [body-outset-end] minmax(10rem,18rem) [page-inset-end] 1rem [page-end] 1fr [screen-inset-end] .5rem [screen-end]}}@media (min-width:1280px){.xl\:article-left-grid{grid-template-columns:[screen-start] .5rem [screen-inset-start page-start page-inset-start body-outset-start] 3rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(8ch,12ch) [middle-start] minmax(8ch,12ch) [gutter-left-end] minmax(8ch,12ch) minmax(8ch,12ch) [gutter-right-start] minmax(8ch,12ch) [middle-end] minmax(8ch,12ch) [body-inset-end] 1rem [body-end] 1fr [gutter-right-end] 1rem [body-outset-end] minmax(10rem,18rem) [page-inset-end] 1rem [page-end] 1fr [screen-inset-end] .5rem [screen-end]}}@media (min-width:1536px){.xl\:article-left-grid{grid-template-columns:[screen-start] .5rem [screen-inset-start] 1fr [page-start page-inset-start body-outset-start] 3rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(8ch,12ch) [middle-start] minmax(8ch,12ch) [gutter-left-end] minmax(8ch,12ch) minmax(8ch,12ch) [gutter-right-start] minmax(8ch,12ch) [middle-end] minmax(8ch,12ch) [body-inset-end] 1rem [body-end] 1fr [gutter-right-end] 1rem [body-outset-end] minmax(10rem,18rem) [page-inset-end] 1rem [page-end] 1fr [screen-inset-end] .5rem [screen-end]}}.xl\:article-center-grid{display:grid;grid-template-columns:[screen-start screen-inset-start] .5rem [page-start page-inset-start body-outset-start body-start gutter-left-start body-inset-start middle-start] 1fr 1fr [gutter-left-end] 1fr 1fr [gutter-right-start] 1fr 1fr [middle-end body-inset-end body-end gutter-right-end body-outset-end page-inset-end page-end] .5rem [screen-inset-end screen-end];align-content:flex-start}@media (min-width:768px){.xl\:article-center-grid{grid-template-columns:[screen-start] .25rem [screen-inset-start page-start page-inset-start body-outset-start] 1fr [body-start gutter-left-start] 1rem [body-inset-start] minmax(2ch,10ch) [middle-start] minmax(2ch,10ch) [gutter-left-end] minmax(2ch,10ch) 
minmax(2ch,10ch) [gutter-right-start] minmax(2ch,10ch) [middle-end] minmax(2ch,10ch) [body-inset-end] 1rem [body-end gutter-right-end] 1fr [body-outset-end page-inset-end page-end screen-inset-end] .25rem [screen-end]}}@media (min-width:1024px){.xl\:article-center-grid{grid-template-columns:[screen-start] .5rem [screen-inset-start page-start] 2rem [page-inset-start] 2fr [body-outset-start gutter-outset-left-start] 1rem [body-start gutter-left-start] 2rem [body-inset-start gutter-left-start] minmax(8ch,10ch) [gutter-left-end middle-start] minmax(8ch,10ch) minmax(8ch,10ch) [] minmax(8ch,10ch) [] minmax(8ch,10ch) [middle-end gutter-right-start gutter-page-right-start] minmax(8ch,10ch) [body-inset-end gutter-right-end] 2rem [body-end] 1rem [body-outset-end] 2fr [page-inset-end] 2rem [page-end screen-inset-end] .5rem [screen-end]}}@media (min-width:1280px){.xl\:article-center-grid{grid-template-columns:[screen-start] .25rem [screen-inset-start] 1fr [page-start] 3rem [page-inset-start] minmax(4rem,9rem) [body-outset-start] 3rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(8ch,10ch) [middle-start] minmax(8ch,10ch) [gutter-left-end] minmax(8ch,10ch) minmax(8ch,10ch) [gutter-right-start] minmax(8ch,10ch) [middle-end] minmax(8ch,10ch) [body-inset-end] 1rem [body-end gutter-right-end] 3rem [body-outset-end] minmax(4rem,9rem) [page-inset-end] 3rem [page-end] 1fr [screen-inset-end] .25rem [screen-end]}}@media (min-width:1536px){.xl\:article-center-grid{grid-template-columns:[screen-start] .5rem [screen-inset-start] 1fr [page-start] 3rem [page-inset-start] minmax(4rem,9rem) [body-outset-start] 3rem [body-start gutter-left-start] 1rem [body-inset-start] minmax(8ch,10ch) [middle-start] minmax(8ch,10ch) [gutter-left-end] minmax(8ch,10ch) minmax(8ch,10ch) [gutter-right-start] minmax(8ch,10ch) [middle-end] minmax(8ch,10ch) [body-inset-end] 1rem [body-end gutter-right-end] 3rem [body-outset-end] minmax(4rem,9rem) [page-inset-end] 3rem [page-end] 1fr [screen-inset-end] .5rem [screen-end]}}.xl\:article-center-grid>*,.xl\:article-grid>*,.xl\:article-left-grid>*{grid-column:body}.xl\:article-center-grid>*,.xl\:article-grid>*,.xl\:article-left-grid>*{margin-top:0!important}.xl\:col-margin-left{grid-column:body}@media (min-width:1280px){.xl\:col-margin-left{grid-column:page/body-start}}}.before\:content-\[\'\.\.\._\'\]:before{--tw-content:"... 
";content:var(--tw-content)}.after\:mr-1:after{content:var(--tw-content);margin-right:.25rem}.after\:content-\[\'\,\'\]:after{--tw-content:",";content:var(--tw-content)}.after\:content-\[\'_\.\.\.\'\]:after{--tw-content:" ...";content:var(--tw-content)}.focus-within\:z-40:focus-within{z-index:40}.focus-within\:h-auto:focus-within{height:auto}.focus-within\:w-auto:focus-within{width:auto}.focus-within\:p-2:focus-within{padding:.5rem}.focus-within\:ring-1:focus-within{--tw-ring-offset-shadow:var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow:var(--tw-ring-inset) 0 0 0 calc(1px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow,0 0 #0000)}.hover\:border-blue-500:hover{--tw-border-opacity:1;border-color:rgb(59 130 246/var(--tw-border-opacity))}.hover\:border-blue-600:hover{--tw-border-opacity:1;border-color:rgb(37 99 235/var(--tw-border-opacity))}.hover\:border-transparent:hover{border-color:transparent}.hover\:border-l-blue-500:hover{--tw-border-opacity:1;border-left-color:rgb(59 130 246/var(--tw-border-opacity))}.hover\:bg-gray-100:hover{--tw-bg-opacity:1;background-color:rgb(243 244 246/var(--tw-bg-opacity))}.hover\:bg-neutral-100:hover{--tw-bg-opacity:1;background-color:rgb(245 245 245/var(--tw-bg-opacity))}.hover\:bg-slate-200:hover{--tw-bg-opacity:1;background-color:rgb(226 232 240/var(--tw-bg-opacity))}.hover\:bg-slate-300\/30:hover{background-color:#cbd5e14d}.hover\:bg-slate-800:hover{--tw-bg-opacity:1;background-color:rgb(30 41 59/var(--tw-bg-opacity))}.hover\:bg-stone-700:hover{--tw-bg-opacity:1;background-color:rgb(68 64 60/var(--tw-bg-opacity))}.hover\:font-light:hover{font-weight:300}.hover\:font-semibold:hover{font-weight:600}.hover\:text-\[\#1DA1F2\]:hover{--tw-text-opacity:1;color:rgb(29 161 242/var(--tw-text-opacity))}.hover\:text-\[\#599F46\]:hover{--tw-text-opacity:1;color:rgb(89 159 70/var(--tw-text-opacity))}.hover\:text-\[\#A9C751\]:hover{--tw-text-opacity:1;color:rgb(169 199 81/var(--tw-text-opacity))}.hover\:text-\[\#E18435\]:hover{--tw-text-opacity:1;color:rgb(225 132 53/var(--tw-text-opacity))}.hover\:text-black:hover{--tw-text-opacity:1;color:rgb(0 0 0/var(--tw-text-opacity))}.hover\:text-blue-400:hover{--tw-text-opacity:1;color:rgb(96 165 250/var(--tw-text-opacity))}.hover\:text-blue-500:hover{--tw-text-opacity:1;color:rgb(59 130 246/var(--tw-text-opacity))}.hover\:text-blue-600:hover{--tw-text-opacity:1;color:rgb(37 99 235/var(--tw-text-opacity))}.hover\:text-blue-700:hover{--tw-text-opacity:1;color:rgb(29 78 216/var(--tw-text-opacity))}.hover\:text-gray-700:hover{--tw-text-opacity:1;color:rgb(55 65 81/var(--tw-text-opacity))}.hover\:text-green-500:hover{--tw-text-opacity:1;color:rgb(34 197 94/var(--tw-text-opacity))}.hover\:text-inherit:hover{color:inherit}.hover\:text-stone-500:hover{--tw-text-opacity:1;color:rgb(120 113 108/var(--tw-text-opacity))}.hover\:text-stone-900:hover{--tw-text-opacity:1;color:rgb(28 25 23/var(--tw-text-opacity))}.hover\:text-violet-100:hover{--tw-text-opacity:1;color:rgb(237 233 254/var(--tw-text-opacity))}.hover\:text-white:hover{--tw-text-opacity:1;color:rgb(255 255 255/var(--tw-text-opacity))}.hover\:underline:hover{text-decoration-line:underline}.hover\:no-underline:hover{text-decoration-line:none}.hover\:opacity-10:hover{opacity:.1}.hover\:opacity-100:hover{opacity:1}.hover\:shadow-\[inset_0_0_0px_30px_\#00000003\]:hover{--tw-shadow:inset 0 0 0px 30px #00000003;--tw-shadow-colored:inset 0 0 0px 30px 
/* [remainder of minified Tailwind utility classes omitted] */
/*! tailwindcss v3.4.10 | MIT License | https://tailwindcss.com */

diff --git a/build/_shared/chunk-P4DJOY6Q.js b/build/_shared/chunk-JLDGA2DL.js
similarity index 92%
rename from build/_shared/chunk-P4DJOY6Q.js
rename to build/_shared/chunk-JLDGA2DL.js
index 35eb1b0..4b63193 100644
--- a/build/_shared/chunk-P4DJOY6Q.js
+++ b/build/_shared/chunk-JLDGA2DL.js
[minified JavaScript bundle diff omitted: rebuilt chunk, largely regenerated identifier names]
n=qb.createTitle(this.renderer,i.title);Be.insert(this._titles,e,n),this.parent.node.appendChild(n),i.node.setAttribute("role","region"),i.node.setAttribute("aria-labelledby",n.id),super.attachWidget(e,i)}moveWidget(e,i,n){Be.move(this._titles,e,i),super.moveWidget(e,i,n)}detachWidget(e,i){let n=Be.removeAt(this._titles,e);this.parent.node.removeChild(n),super.detachWidget(e,i)}updateItemPosition(e,i,n,r,s,o,a){let l=this._titles[e].style;l.top=`${r}px`,l.left=`${n}px`,l.height=`${this.widgetOffset}px`,i?l.width=`${s}px`:l.width=`${o}px`,super.updateItemPosition(e,i,n,r,s,o,a)}};(function(t){function e(i,n,r=!0){let s=i.createSectionTitle(n);return s.style.position="absolute",s.style.contain="strict",s.setAttribute("aria-label",`${n.label} Section`),s.setAttribute("aria-expanded",r?"true":"false"),s.setAttribute("aria-controls",n.owner.id),r&&s.classList.add("lm-mod-expanded"),s}t.createTitle=e})(qb||(qb={}));Ub=class extends me{constructor(e={}){super(),this.addClass("lm-Panel"),this.layout=tC.createLayout(e)}get widgets(){return this.layout.widgets}addWidget(e){this.layout.addWidget(e)}insertWidget(e,i){this.layout.insertWidget(e,i)}};(function(t){function e(i){return i.layout||new yf}t.createLayout=e})(tC||(tC={}));Eu=class extends Ub{constructor(e={}){super({layout:iC.createLayout(e)}),this._handleMoved=new Te(this),this._pressData=null,this.addClass("lm-SplitPanel")}dispose(){this._releaseMouse(),super.dispose()}get orientation(){return this.layout.orientation}set orientation(e){this.layout.orientation=e}get alignment(){return this.layout.alignment}set alignment(e){this.layout.alignment=e}get spacing(){return this.layout.spacing}set spacing(e){this.layout.spacing=e}get renderer(){return this.layout.renderer}get handleMoved(){return this._handleMoved}get handles(){return this.layout.handles}relativeSizes(){return this.layout.relativeSizes()}setRelativeSizes(e,i=!0){this.layout.setRelativeSizes(e,i)}handleEvent(e){switch(e.type){case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"keydown":this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("pointerdown",this)}onAfterDetach(e){this.node.removeEventListener("pointerdown",this),this._releaseMouse()}onChildAdded(e){e.child.addClass("lm-SplitPanel-child"),this._releaseMouse()}onChildRemoved(e){e.child.removeClass("lm-SplitPanel-child"),this._releaseMouse()}_evtKeyDown(e){this._pressData&&(e.preventDefault(),e.stopPropagation()),e.keyCode===27&&this._releaseMouse()}_evtPointerDown(e){if(e.button!==0)return;let i=this.layout,n=Be.findFirstIndex(i.handles,c=>c.contains(e.target));if(n===-1)return;e.preventDefault(),e.stopPropagation(),document.addEventListener("pointerup",this,!0),document.addEventListener("pointermove",this,!0),document.addEventListener("keydown",this,!0),document.addEventListener("contextmenu",this,!0);let r,s=i.handles[n],o=s.getBoundingClientRect();i.orientation==="horizontal"?r=e.clientX-o.left:r=e.clientY-o.top;let a=window.getComputedStyle(s),l=an.overrideCursor(a.cursor);this._pressData={index:n,delta:r,override:l}}_evtPointerMove(e){e.preventDefault(),e.stopPropagation();let 
i,n=this.layout,r=this.node.getBoundingClientRect();n.orientation==="horizontal"?i=e.clientX-r.left-this._pressData.delta:i=e.clientY-r.top-this._pressData.delta,n.moveHandle(this._pressData.index,i)}_evtPointerUp(e){e.button===0&&(e.preventDefault(),e.stopPropagation(),this._releaseMouse())}_releaseMouse(){this._pressData&&(this._pressData.override.dispose(),this._pressData=null,this._handleMoved.emit(),document.removeEventListener("keydown",this,!0),document.removeEventListener("pointerup",this,!0),document.removeEventListener("pointermove",this,!0),document.removeEventListener("contextmenu",this,!0))}};(function(t){class e{createHandle(){let s=document.createElement("div");return s.className="lm-SplitPanel-handle",s}}t.Renderer=e,t.defaultRenderer=new e;function i(r){return bo.getStretch(r)}t.getStretch=i;function n(r,s){bo.setStretch(r,s)}t.setStretch=n})(Eu||(Eu={}));(function(t){function e(i){return i.layout||new bo({renderer:i.renderer||Eu.defaultRenderer,orientation:i.orientation,alignment:i.alignment,spacing:i.spacing})}t.createLayout=e})(iC||(iC={}));ng=class extends Eu{constructor(e={}){super({...e,layout:nC.createLayout(e)}),this._widgetSizesCache=new WeakMap,this._expansionToggled=new Te(this),this.addClass("lm-AccordionPanel")}get renderer(){return this.layout.renderer}get titleSpace(){return this.layout.titleSpace}set titleSpace(e){this.layout.titleSpace=e}get titles(){return this.layout.titles}get expansionToggled(){return this._expansionToggled}addWidget(e){super.addWidget(e),e.title.changed.connect(this._onTitleChanged,this)}collapse(e){let i=this.layout.widgets[e];i&&!i.isHidden&&this._toggleExpansion(e)}expand(e){let i=this.layout.widgets[e];i&&i.isHidden&&this._toggleExpansion(e)}insertWidget(e,i){super.insertWidget(e,i),i.title.changed.connect(this._onTitleChanged,this)}handleEvent(e){switch(super.handleEvent(e),e.type){case"click":this._evtClick(e);break;case"keydown":this._eventKeyDown(e);break}}onBeforeAttach(e){this.node.addEventListener("click",this),this.node.addEventListener("keydown",this),super.onBeforeAttach(e)}onAfterDetach(e){super.onAfterDetach(e),this.node.removeEventListener("click",this),this.node.removeEventListener("keydown",this)}_onTitleChanged(e){let i=Be.findFirstIndex(this.widgets,n=>n.contains(e.owner));i>=0&&(this.layout.updateTitle(i,e.owner),this.update())}_computeWidgetSize(e){let i=this.layout,n=i.widgets[e];if(!n)return;let r=n.isHidden,s=i.absoluteSizes(),o=(r?-1:1)*this.spacing,a=s.reduce((c,u)=>c+u),l=[...s];if(r){let c=this._widgetSizesCache.get(n);if(!c)return;l[e]+=c;let u=l.map(d=>d-c>0).lastIndexOf(!0);u===-1?l.forEach((d,f)=>{f!==e&&(l[f]-=s[f]/a*(c-o))}):l[u]-=c-o}else{let c=s[e];this._widgetSizesCache.set(n,c),l[e]=0;let u=l.map(d=>d>0).lastIndexOf(!0);if(u===-1)return;l[u]=s[u]+c+o}return l.map(c=>c/(a+o))}_evtClick(e){let i=e.target;if(i){let n=Be.findFirstIndex(this.titles,r=>r.contains(i));n>=0&&(e.preventDefault(),e.stopPropagation(),this._toggleExpansion(n))}}_eventKeyDown(e){if(e.defaultPrevented)return;let i=e.target,n=!1;if(i){let r=Be.findFirstIndex(this.titles,s=>s.contains(i));if(r>=0){let s=e.keyCode.toString();if(e.key.match(/Space|Enter/)||s.match(/13|32/))i.click(),n=!0;else if(this.orientation==="horizontal"?e.key.match(/ArrowLeft|ArrowRight/)||s.match(/37|39/):e.key.match(/ArrowUp|ArrowDown/)||s.match(/38|40/)){let o=e.key.match(/ArrowLeft|ArrowUp/)||s.match(/37|38/)?-1:1,a=this.titles.length,l=(r+a+o)%a;this.titles[l].focus(),n=!0}else 
e.key==="End"||s==="35"?(this.titles[this.titles.length-1].focus(),n=!0):(e.key==="Home"||s==="36")&&(this.titles[0].focus(),n=!0)}n&&e.preventDefault()}}_toggleExpansion(e){let i=this.titles[e],n=this.layout.widgets[e],r=this._computeWidgetSize(e);r&&this.setRelativeSizes(r,!1),n.isHidden?(i.classList.add("lm-mod-expanded"),i.setAttribute("aria-expanded","true"),n.show()):(i.classList.remove("lm-mod-expanded"),i.setAttribute("aria-expanded","false"),n.hide()),this._expansionToggled.emit(e)}};(function(t){class e extends Eu.Renderer{constructor(){super(),this.titleClassName="lm-AccordionPanel-title",this._titleID=0,this._titleKeys=new WeakMap,this._uuid=++e._nInstance}createCollapseIcon(n){return document.createElement("span")}createSectionTitle(n){let r=document.createElement("h3");r.setAttribute("tabindex","0"),r.id=this.createTitleKey(n),r.className=this.titleClassName;for(let a in n.dataset)r.dataset[a]=n.dataset[a];let s=r.appendChild(this.createCollapseIcon(n));s.className="lm-AccordionPanel-titleCollapser";let o=r.appendChild(document.createElement("span"));return o.className="lm-AccordionPanel-titleLabel",o.textContent=n.label,o.title=n.caption||n.label,r}createTitleKey(n){let r=this._titleKeys.get(n);return r===void 0&&(r=`title-key-${this._uuid}-${this._titleID++}`,this._titleKeys.set(n,r)),r}}e._nInstance=0,t.Renderer=e,t.defaultRenderer=new e})(ng||(ng={}));(function(t){function e(i){return i.layout||new eC({renderer:i.renderer||ng.defaultRenderer,orientation:i.orientation,alignment:i.alignment,spacing:i.spacing,titleSpace:i.titleSpace})}t.createLayout=e})(nC||(nC={}));os=class extends yf{constructor(e={}){super(),this._fixed=0,this._spacing=4,this._dirty=!1,this._sizers=[],this._items=[],this._box=null,this._alignment="start",this._direction="top-to-bottom",e.direction!==void 0&&(this._direction=e.direction),e.alignment!==void 0&&(this._alignment=e.alignment),e.spacing!==void 0&&(this._spacing=ig.clampDimension(e.spacing))}dispose(){for(let e of this._items)e.dispose();this._box=null,this._items.length=0,this._sizers.length=0,super.dispose()}get direction(){return this._direction}set direction(e){this._direction!==e&&(this._direction=e,this.parent&&(this.parent.dataset.direction=e,this.parent.fit()))}get alignment(){return this._alignment}set alignment(e){this._alignment!==e&&(this._alignment=e,this.parent&&(this.parent.dataset.alignment=e,this.parent.update()))}get spacing(){return this._spacing}set spacing(e){e=ig.clampDimension(e),this._spacing!==e&&(this._spacing=e,this.parent&&this.parent.fit())}init(){this.parent.dataset.direction=this.direction,this.parent.dataset.alignment=this.alignment,super.init()}attachWidget(e,i){Be.insert(this._items,e,new Cu(i)),Be.insert(this._sizers,e,new na),this.parent.isAttached&&Ae.sendMessage(i,me.Msg.BeforeAttach),this.parent.node.appendChild(i.node),this.parent.isAttached&&Ae.sendMessage(i,me.Msg.AfterAttach),this.parent.fit()}moveWidget(e,i,n){Be.move(this._items,e,i),Be.move(this._sizers,e,i),this.parent.update()}detachWidget(e,i){let 
n=Be.removeAt(this._items,e);Be.removeAt(this._sizers,e),this.parent.isAttached&&Ae.sendMessage(i,me.Msg.BeforeDetach),this.parent.node.removeChild(i.node),this.parent.isAttached&&Ae.sendMessage(i,me.Msg.AfterDetach),n.dispose(),this.parent.fit()}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_fit(){let e=0;for(let a=0,l=this._items.length;a0)switch(this._alignment){case"start":break;case"center":c=0,u=l/2;break;case"end":c=0,u=l;break;case"justify":c=l/n,u=0;break;default:throw"unreachable"}for(let d=0,f=this._items.length;d0,coerce:(r,s)=>Math.max(0,Math.floor(s)),changed:n}),t.sizeBasisProperty=new pt({name:"sizeBasis",create:()=>0,coerce:(r,s)=>Math.max(0,Math.floor(s)),changed:n});function e(r){return r==="left-to-right"||r==="right-to-left"}t.isHorizontal=e;function i(r){return Math.max(0,Math.floor(r))}t.clampSpacing=i;function n(r){r.parent&&r.parent.layout instanceof os&&r.parent.fit()}})(Su||(Su={}));Vb=class extends Ub{constructor(e={}){super({layout:rC.createLayout(e)}),this.addClass("lm-BoxPanel")}get direction(){return this.layout.direction}set direction(e){this.layout.direction=e}get alignment(){return this.layout.alignment}set alignment(e){this.layout.alignment=e}get spacing(){return this.layout.spacing}set spacing(e){this.layout.spacing=e}onChildAdded(e){e.child.addClass("lm-BoxPanel-child")}onChildRemoved(e){e.child.removeClass("lm-BoxPanel-child")}};(function(t){function e(s){return os.getStretch(s)}t.getStretch=e;function i(s,o){os.setStretch(s,o)}t.setStretch=i;function n(s){return os.getSizeBasis(s)}t.getSizeBasis=n;function r(s,o){os.setSizeBasis(s,o)}t.setSizeBasis=r})(Vb||(Vb={}));(function(t){function e(i){return i.layout||new os(i)}t.createLayout=e})(rC||(rC={}));wf=class extends me{constructor(e){super({node:il.createNode()}),this._activeIndex=-1,this._items=[],this._results=null,this.addClass("lm-CommandPalette"),this.setFlag(me.Flag.DisallowLayout),this.commands=e.commands,this.renderer=e.renderer||wf.defaultRenderer,this.commands.commandChanged.connect(this._onGenericChange,this),this.commands.keyBindingChanged.connect(this._onGenericChange,this)}dispose(){this._items.length=0,this._results=null,super.dispose()}get searchNode(){return this.node.getElementsByClassName("lm-CommandPalette-search")[0]}get inputNode(){return this.node.getElementsByClassName("lm-CommandPalette-input")[0]}get contentNode(){return this.node.getElementsByClassName("lm-CommandPalette-content")[0]}get items(){return this._items}addItem(e){let i=il.createItem(this.commands,e);return this._items.push(i),this.refresh(),i}addItems(e){let i=e.map(n=>il.createItem(this.commands,n));return i.forEach(n=>this._items.push(n)),this.refresh(),i}removeItem(e){this.removeItemAt(this._items.indexOf(e))}removeItemAt(e){Be.removeAt(this._items,e)&&this.refresh()}clearItems(){this._items.length!==0&&(this._items.length=0,this.refresh())}refresh(){if(this._results=null,this.inputNode.value!==""){let e=this.node.getElementsByClassName("lm-close-icon")[0];e.style.display="inherit"}else{let 
e=this.node.getElementsByClassName("lm-close-icon")[0];e.style.display="none"}this.update()}handleEvent(e){switch(e.type){case"click":this._evtClick(e);break;case"keydown":this._evtKeyDown(e);break;case"input":this.refresh();break;case"focus":case"blur":this._toggleFocused();break}}onBeforeAttach(e){this.node.addEventListener("click",this),this.node.addEventListener("keydown",this),this.node.addEventListener("input",this),this.node.addEventListener("focus",this,!0),this.node.addEventListener("blur",this,!0)}onAfterDetach(e){this.node.removeEventListener("click",this),this.node.removeEventListener("keydown",this),this.node.removeEventListener("input",this),this.node.removeEventListener("focus",this,!0),this.node.removeEventListener("blur",this,!0)}onAfterShow(e){this.update(),super.onAfterShow(e)}onActivateRequest(e){if(this.isAttached){let i=this.inputNode;i.focus(),i.select()}}onUpdateRequest(e){if(this.isHidden)return;let i=this.inputNode.value,n=this.contentNode,r=this._results;if(r||(r=this._results=il.search(this._items,i),this._activeIndex=i?Be.findFirstIndex(r,il.canActivate):-1),!i&&r.length===0){Zt.render(null,n);return}if(i&&r.length===0){let l=this.renderer.renderEmptyMessage({query:i});Zt.render(l,n);return}let s=this.renderer,o=this._activeIndex,a=new Array(r.length);for(let l=0,c=r.length;l=r.length)n.scrollTop=0;else{let l=n.children[o];ei.scrollIntoViewIfNeeded(n,l)}}_evtClick(e){if(e.button!==0)return;if(e.target.classList.contains("lm-close-icon")){this.inputNode.value="",this.refresh();return}let i=Be.findFirstIndex(this.contentNode.children,n=>n.contains(e.target));i!==-1&&(e.preventDefault(),e.stopPropagation(),this._execute(i))}_evtKeyDown(e){if(!(e.altKey||e.ctrlKey||e.metaKey||e.shiftKey))switch(e.keyCode){case 13:e.preventDefault(),e.stopPropagation(),this._execute(this._activeIndex);break;case 38:e.preventDefault(),e.stopPropagation(),this._activatePreviousItem();break;case 40:e.preventDefault(),e.stopPropagation(),this._activateNextItem();break}}_activateNextItem(){if(!this._results||this._results.length===0)return;let e=this._activeIndex,i=this._results.length,n=eC-x),b=S.slice(0,A),M=S.slice(A);for(let C=0,x=M.length;Cp.command===h&&sl.JSONExt.deepEqual(p.args,m))||null}}})(il||(il={}));ra=class extends me{constructor(e){super({node:gr.createNode()}),this._childIndex=-1,this._activeIndex=-1,this._openTimerID=0,this._closeTimerID=0,this._items=[],this._childMenu=null,this._parentMenu=null,this._aboutToClose=new Te(this),this._menuRequested=new Te(this),this.addClass("lm-Menu"),this.setFlag(me.Flag.DisallowLayout),this.commands=e.commands,this.renderer=e.renderer||ra.defaultRenderer}dispose(){this.close(),this._items.length=0,super.dispose()}get aboutToClose(){return this._aboutToClose}get menuRequested(){return this._menuRequested}get parentMenu(){return this._parentMenu}get childMenu(){return this._childMenu}get rootMenu(){let e=this;for(;e._parentMenu;)e=e._parentMenu;return e}get leafMenu(){let e=this;for(;e._childMenu;)e=e._childMenu;return e}get contentNode(){return this.node.getElementsByClassName("lm-Menu-content")[0]}get activeItem(){return this._items[this._activeIndex]||null}set activeItem(e){this.activeIndex=e?this._items.indexOf(e):-1}get activeIndex(){return this._activeIndex}set 
activeIndex(e){(e<0||e>=this._items.length)&&(e=-1),e!==-1&&!gr.canActivate(this._items[e])&&(e=-1),this._activeIndex!==e&&(this._activeIndex=e,this._activeIndex>=0&&this.contentNode.childNodes[this._activeIndex]&&this.contentNode.childNodes[this._activeIndex].focus(),this.update())}get items(){return this._items}activateNextItem(){let e=this._items.length,i=this._activeIndex,n=i{this.activeIndex=a}})}Zt.render(o,this.contentNode)}onCloseRequest(e){this._cancelOpenTimer(),this._cancelCloseTimer(),this.activeIndex=-1;let i=this._childMenu;i&&(this._childIndex=-1,this._childMenu=null,i._parentMenu=null,i.close());let n=this._parentMenu;n&&(this._parentMenu=null,n._childIndex=-1,n._childMenu=null,n.activate()),this.isAttached&&this._aboutToClose.emit(void 0),super.onCloseRequest(e)}_evtKeyDown(e){e.preventDefault(),e.stopPropagation();let i=e.keyCode;if(i===13){this.triggerActiveItem();return}if(i===27){this.close();return}if(i===37){this._parentMenu?this.close():this._menuRequested.emit("previous");return}if(i===38){this.activatePreviousItem();return}if(i===39){let o=this.activeItem;o&&o.type==="submenu"?this.triggerActiveItem():this.rootMenu._menuRequested.emit("next");return}if(i===40){this.activateNextItem();return}let n=ks().keyForKeydownEvent(e);if(!n)return;let r=this._activeIndex+1,s=gr.findMnemonic(this._items,n,r);s.index!==-1&&!s.multiple?(this.activeIndex=s.index,this.triggerActiveItem()):s.index!==-1?this.activeIndex=s.index:s.auto!==-1&&(this.activeIndex=s.auto)}_evtMouseUp(e){e.button===0&&(e.preventDefault(),e.stopPropagation(),this.triggerActiveItem())}_evtMouseMove(e){let i=Be.findFirstIndex(this.contentNode.children,r=>ei.hitTest(r,e.clientX,e.clientY));if(i===this._activeIndex)return;if(this.activeIndex=i,i=this.activeIndex,i===this._childIndex){this._cancelOpenTimer(),this._cancelCloseTimer();return}this._childIndex!==-1&&this._startCloseTimer(),this._cancelOpenTimer();let n=this.activeItem;!n||n.type!=="submenu"||!n.submenu||this._startOpenTimer()}_evtMouseEnter(e){for(let i=this._parentMenu;i;i=i._parentMenu)i._cancelOpenTimer(),i._cancelCloseTimer(),i.activeIndex=i._childIndex}_evtMouseLeave(e){if(this._cancelOpenTimer(),!this._childMenu){this.activeIndex=-1;return}let{clientX:i,clientY:n}=e;if(ei.hitTest(this._childMenu.node,i,n)){this._cancelCloseTimer();return}this.activeIndex=-1,this._startCloseTimer()}_evtMouseDown(e){this._parentMenu||(gr.hitTestMenus(this,e.clientX,e.clientY)?(e.preventDefault(),e.stopPropagation()):this.close())}_openChildMenu(e=!1){let i=this.activeItem;if(!i||i.type!=="submenu"||!i.submenu){this._closeChildMenu();return}let n=i.submenu;if(n===this._childMenu)return;ra.saveWindowData(),this._closeChildMenu(),this._childMenu=n,this._childIndex=this._activeIndex,n._parentMenu=this,Ae.sendMessage(this,me.Msg.UpdateRequest);let r=this.contentNode.children[this._activeIndex];gr.openSubmenu(n,r),e&&(n.activeIndex=-1,n.activateNextItem()),n.activate()}_closeChildMenu(){this._childMenu&&this._childMenu.close()}_startOpenTimer(){this._openTimerID===0&&(this._openTimerID=window.setTimeout(()=>{this._openTimerID=0,this._openChildMenu()},gr.TIMER_DELAY))}_startCloseTimer(){this._closeTimerID===0&&(this._closeTimerID=window.setTimeout(()=>{this._closeTimerID=0,this._closeChildMenu()},gr.TIMER_DELAY))}_cancelOpenTimer(){this._openTimerID!==0&&(clearTimeout(this._openTimerID),this._openTimerID=0)}_cancelCloseTimer(){this._closeTimerID!==0&&(clearTimeout(this._closeTimerID),this._closeTimerID=0)}static saveWindowData(){gr.saveWindowData()}};(function(t){class 
e{renderItem(n){let r=this.createItemClass(n),s=this.createItemDataset(n),o=this.createItemARIA(n);return Ce.li({className:r,dataset:s,tabindex:"0",onfocus:n.onfocus,...o},this.renderIcon(n),this.renderLabel(n),this.renderShortcut(n),this.renderSubmenu(n))}renderIcon(n){let r=this.createIconClass(n);return Ce.div({className:r},n.item.icon,n.item.iconLabel)}renderLabel(n){let r=this.formatLabel(n);return Ce.div({className:"lm-Menu-itemLabel"},r)}renderShortcut(n){let r=this.formatShortcut(n);return Ce.div({className:"lm-Menu-itemShortcut"},r)}renderSubmenu(n){return Ce.div({className:"lm-Menu-itemSubmenuIcon"})}createItemClass(n){let r="lm-Menu-item";n.item.isEnabled||(r+=" lm-mod-disabled"),n.item.isToggled&&(r+=" lm-mod-toggled"),n.item.isVisible||(r+=" lm-mod-hidden"),n.active&&(r+=" lm-mod-active"),n.collapsed&&(r+=" lm-mod-collapsed");let s=n.item.className;return s&&(r+=` ${s}`),r}createItemDataset(n){let r,{type:s,command:o,dataset:a}=n.item;return s==="command"?r={...a,type:s,command:o}:r={...a,type:s},r}createIconClass(n){let r="lm-Menu-itemIcon",s=n.item.iconClass;return s?`${r} ${s}`:r}createItemARIA(n){let r={};switch(n.item.type){case"separator":r.role="presentation";break;case"submenu":r["aria-haspopup"]="true",n.item.isEnabled||(r["aria-disabled"]="true");break;default:n.item.isEnabled||(r["aria-disabled"]="true"),r.role="menuitem"}return r}formatLabel(n){let{label:r,mnemonic:s}=n.item;if(s<0||s>=r.length)return r;let o=r.slice(0,s),a=r.slice(s+1),l=r[s],c=Ce.span({className:"lm-Menu-itemMnemonic"},l);return[o,c,a]}formatShortcut(n){let r=n.item.keyBinding;return r?er.formatKeystroke(r.keys):null}}t.Renderer=e,t.defaultRenderer=new e})(ra||(ra={}));(function(t){t.TIMER_DELAY=300,t.SUBMENU_OVERLAP=3;let e=null,i=0;function n(){return i>0?(i--,e):u()}function r(){e=u(),i++}t.saveWindowData=r;function s(){let p=document.createElement("div"),_=document.createElement("ul");return _.className="lm-Menu-content",p.appendChild(_),_.setAttribute("role","menu"),p.tabIndex=0,p}t.createNode=s;function o(p){return p.type!=="separator"&&p.isEnabled&&p.isVisible}t.canActivate=o;function a(p,_){return new m(p.commands,_)}t.createItem=a;function l(p,_,y){for(let S=p;S;S=S.childMenu)if(ei.hitTest(S.node,_,y))return!0;return!1}t.hitTestMenus=l;function c(p){let _=new Array(p.length);Be.fill(_,!1);let y=0,S=p.length;for(;y=0;--T){let A=p[T];if(A.isVisible){if(A.type!=="separator")break;_[T]=!0}}let O=!1;for(;++yM+x&&(_=M+x-Z),!T&&y+X>C+w&&(y>C+w?y=C+w-X:y=y-X),B.transform=`translate(${Math.max(0,_)}px, ${Math.max(0,y)}px`,B.opacity="1"}t.openRootMenu=d;function f(p,_){let y=n(),S=y.pageXOffset,T=y.pageYOffset,O=y.clientWidth,A=y.clientHeight;Ae.sendMessage(p,me.Msg.UpdateRequest);let b=A,M=p.node,C=M.style;C.opacity="0",C.maxHeight=`${b}px`,me.attach(p,document.body);let{width:x,height:w}=M.getBoundingClientRect(),E=ei.boxSizing(p.node),N=_.getBoundingClientRect(),B=N.right-t.SUBMENU_OVERLAP;B+x>S+O&&(B=N.left+t.SUBMENU_OVERLAP-x);let Z=N.top-E.borderTop-E.paddingTop;Z+w>T+A&&(Z=N.bottom+E.borderBottom+E.paddingBottom-w),C.transform=`translate(${Math.max(0,B)}px, ${Math.max(0,Z)}px`,C.opacity="1"}t.openSubmenu=f;function h(p,_,y){let S=-1,T=-1,O=!1,A=_.toUpperCase();for(let b=0,M=p.length;b=0&&ES.command===_&&sl.JSONExt.deepEqual(S.args,y))||null}return null}}})(gr||(gr={}));(function(t){function e(o,a){let l=n(o.selector),c=o.rank!==void 0?o.rank:1/0;return{...o,selector:l,rank:c,id:a}}t.createItem=e;function i(o,a,l,c){let u=a.target;if(!u)return null;let 
d=a.currentTarget;if(!d||!d.contains(u)&&(u=document.elementFromPoint(a.clientX,a.clientY),!u||!d.contains(u)))return null;let f=[],h=o.slice();for(;u!==null;){let m=[];for(let p=0,_=h.length;p<_;++p){let y=h[p];y&&wu.matches(u,y.selector)&&(m.push(y),h[p]=null)}if(m.length!==0&&(l&&m.sort(c?s:r),f.push(...m)),u===d)break;u=u.parentElement}return l||f.sort(c?s:r),f}t.matchItems=i;function n(o){if(o.indexOf(",")!==-1)throw new Error(`Selector cannot contain commas: ${o}`);if(!wu.isValid(o))throw new Error(`Invalid selector: ${o}`);return o}function r(o,a){let l=o.rank,c=a.rank;return l!==c?l=this._titles.length)&&(e=-1),this._currentIndex===e)return;let i=this._currentIndex,n=this._titles[i]||null,r=e,s=this._titles[r]||null;this._currentIndex=r,this._previousTitle=n,this.update(),this._currentChanged.emit({previousIndex:i,previousTitle:n,currentIndex:r,currentTitle:s})}get name(){return this._name}set name(e){this._name=e,e?this.contentNode.setAttribute("aria-label",e):this.contentNode.removeAttribute("aria-label")}get orientation(){return this._orientation}set orientation(e){this._orientation!==e&&(this._releaseMouse(),this._orientation=e,this.dataset.orientation=e,this.contentNode.setAttribute("aria-orientation",e))}get addButtonEnabled(){return this._addButtonEnabled}set addButtonEnabled(e){this._addButtonEnabled!==e&&(this._addButtonEnabled=e,e?this.addButtonNode.classList.remove("lm-mod-hidden"):this.addButtonNode.classList.add("lm-mod-hidden"))}get titles(){return this._titles}get contentNode(){return this.node.getElementsByClassName("lm-TabBar-content")[0]}get addButtonNode(){return this.node.getElementsByClassName("lm-TabBar-addButton")[0]}addTab(e){return this.insertTab(this._titles.length,e)}insertTab(e,i){this._releaseMouse();let n=qs.asTitle(i),r=this._titles.indexOf(n),s=Math.max(0,Math.min(e,this._titles.length));return r===-1?(Be.insert(this._titles,s,n),n.changed.connect(this._onTitleChanged,this),this.update(),this._adjustCurrentForInsert(s,n),n):(s===this._titles.length&&s--,r===s||(Be.move(this._titles,r,s),this.update(),this._adjustCurrentForMove(r,s)),n)}removeTab(e){this.removeTabAt(this._titles.indexOf(e))}removeTabAt(e){this._releaseMouse();let i=Be.removeAt(this._titles,e);i&&(i.changed.disconnect(this._onTitleChanged,this),i===this._previousTitle&&(this._previousTitle=null),this.update(),this._adjustCurrentForRemove(e,i))}clearTabs(){if(this._titles.length===0)return;this._releaseMouse();for(let n of this._titles)n.changed.disconnect(this._onTitleChanged,this);let e=this.currentIndex,i=this.currentTitle;this._currentIndex=-1,this._previousTitle=null,this._titles.length=0,this.update(),e!==-1&&this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}releaseMouse(){this._releaseMouse()}handleEvent(e){switch(e.type){case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"dblclick":this._evtDblClick(e);break;case"keydown":e.eventPhase===Event.CAPTURING_PHASE?this._evtKeyDownCapturing(e):this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("pointerdown",this),this.node.addEventListener("dblclick",this),this.node.addEventListener("keydown",this)}onAfterDetach(e){this.node.removeEventListener("pointerdown",this),this.node.removeEventListener("dblclick",this),this.node.removeEventListener("keydown",this),this._releaseMouse()}onUpdateRequest(e){var i;let 
n=this._titles,r=this.renderer,s=this.currentTitle,o=new Array(n.length),a=(i=this._getCurrentTabindex())!==null&&i!==void 0?i:this._currentIndex>-1?this._currentIndex:0;for(let l=0,c=n.length;lei.hitTest(o,e.clientX,e.clientY));if(n===-1)return;let r=this.titles[n],s=i[n].querySelector(".lm-TabBar-tabLabel");if(s&&s.contains(e.target)){let o=r.label||"",a=s.innerHTML;s.innerHTML="";let l=document.createElement("input");l.classList.add("lm-TabBar-tabInput"),l.value=o,s.appendChild(l);let c=()=>{l.removeEventListener("blur",c),s.innerHTML=a,this.node.addEventListener("keydown",this)};l.addEventListener("dblclick",u=>u.stopPropagation()),l.addEventListener("blur",c),l.addEventListener("keydown",u=>{u.key==="Enter"?(l.value!==""&&(r.label=r.caption=l.value),c()):u.key==="Escape"&&c()}),this.node.removeEventListener("keydown",this),l.select(),l.focus(),s.children.length>0&&s.children[0].focus()}}_evtKeyDownCapturing(e){e.eventPhase===Event.CAPTURING_PHASE&&(e.preventDefault(),e.stopPropagation(),e.key==="Escape"&&this._releaseMouse())}_evtKeyDown(e){var i,n,r;if(!(e.key==="Tab"||e.eventPhase===Event.CAPTURING_PHASE)){if(e.key==="Enter"||e.key==="Spacebar"||e.key===" "){let s=document.activeElement;if(this.addButtonEnabled&&this.addButtonNode.contains(s))e.preventDefault(),e.stopPropagation(),this._addRequested.emit();else{let o=Be.findFirstIndex(this.contentNode.children,a=>a.contains(s));o>=0&&(e.preventDefault(),e.stopPropagation(),this.currentIndex=o)}}else if(WV.includes(e.key)){let s=[...this.contentNode.children];if(this.addButtonEnabled&&s.push(this.addButtonNode),s.length<=1)return;e.preventDefault(),e.stopPropagation();let o=s.indexOf(document.activeElement);o===-1&&(o=this._currentIndex);let a;e.key==="ArrowRight"&&this._orientation==="horizontal"||e.key==="ArrowDown"&&this._orientation==="vertical"?a=(i=s[o+1])!==null&&i!==void 0?i:s[0]:e.key==="ArrowLeft"&&this._orientation==="horizontal"||e.key==="ArrowUp"&&this._orientation==="vertical"?a=(n=s[o-1])!==null&&n!==void 0?n:s[s.length-1]:e.key==="Home"?a=s[0]:e.key==="End"&&(a=s[s.length-1]),a&&((r=s[o])===null||r===void 0||r.setAttribute("tabindex","-1"),a?.setAttribute("tabindex","0"),a.focus())}}}_evtPointerDown(e){if(e.button!==0&&e.button!==1||this._dragData||e.target.classList.contains("lm-TabBar-tabInput"))return;let i=this.addButtonEnabled&&this.addButtonNode.contains(e.target),n=this.contentNode.children,r=Be.findFirstIndex(n,o=>ei.hitTest(o,e.clientX,e.clientY));if(r===-1&&!i||(e.preventDefault(),e.stopPropagation(),this._dragData={tab:n[r],index:r,pressX:e.clientX,pressY:e.clientY,tabPos:-1,tabSize:-1,tabPressPos:-1,targetIndex:-1,tabLayout:null,contentRect:null,override:null,dragActive:!1,dragAborted:!1,detachRequested:!1},this.document.addEventListener("pointerup",this,!0),e.button===1||i))return;let s=n[r].querySelector(this.renderer.closeIconSelector);s&&s.contains(e.target)||(this.tabsMovable&&(this.document.addEventListener("pointermove",this,!0),this.document.addEventListener("keydown",this,!0),this.document.addEventListener("contextmenu",this,!0)),this.allowDeselect&&this.currentIndex===r?this.currentIndex=-1:this.currentIndex=r,this.currentIndex!==-1&&this._tabActivateRequested.emit({index:this.currentIndex,title:this.currentTitle}))}_evtPointerMove(e){let i=this._dragData;if(!i)return;e.preventDefault(),e.stopPropagation();let n=this.contentNode.children;if(!(!i.dragActive&&!qs.dragExceeded(i,e))){if(!i.dragActive){let 
r=i.tab.getBoundingClientRect();this._orientation==="horizontal"?(i.tabPos=i.tab.offsetLeft,i.tabSize=r.width,i.tabPressPos=i.pressX-r.left):(i.tabPos=i.tab.offsetTop,i.tabSize=r.height,i.tabPressPos=i.pressY-r.top),i.tabPressOffset={x:i.pressX-r.left,y:i.pressY-r.top},i.tabLayout=qs.snapTabLayout(n,this._orientation),i.contentRect=this.contentNode.getBoundingClientRect(),i.override=an.overrideCursor("default"),i.tab.classList.add("lm-mod-dragging"),this.addClass("lm-mod-dragging"),i.dragActive=!0}if(!i.detachRequested&&qs.detachExceeded(i,e)){i.detachRequested=!0;let r=i.index,s=e.clientX,o=e.clientY,a=n[r],l=this._titles[r];if(this._tabDetachRequested.emit({index:r,title:l,tab:a,clientX:s,clientY:o,offset:i.tabPressOffset}),i.dragAborted)return}qs.layoutTabs(n,i,e,this._orientation)}}_evtPointerUp(e){if(e.button!==0&&e.button!==1)return;let i=this._dragData;if(!i)return;if(e.preventDefault(),e.stopPropagation(),this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),!i.dragActive){if(this._dragData=null,this.addButtonEnabled&&this.addButtonNode.contains(e.target)){this._addRequested.emit(void 0);return}let s=this.contentNode.children,o=Be.findFirstIndex(s,c=>ei.hitTest(c,e.clientX,e.clientY));if(o!==i.index)return;let a=this._titles[o];if(!a.closable)return;if(e.button===1){this._tabCloseRequested.emit({index:o,title:a});return}let l=s[o].querySelector(this.renderer.closeIconSelector);if(l&&l.contains(e.target)){this._tabCloseRequested.emit({index:o,title:a});return}return}if(e.button!==0)return;qs.finalizeTabPosition(i,this._orientation),i.tab.classList.remove("lm-mod-dragging");let n=qs.parseTransitionDuration(i.tab);setTimeout(()=>{if(i.dragAborted)return;this._dragData=null,qs.resetTabPositions(this.contentNode.children,this._orientation),i.override.dispose(),this.removeClass("lm-mod-dragging");let r=i.index,s=i.targetIndex;s===-1||r===s||(Be.move(this._titles,r,s),this._adjustCurrentForMove(r,s),this._tabMoved.emit({fromIndex:r,toIndex:s,title:this._titles[s]}),Ae.sendMessage(this,me.Msg.UpdateRequest))},n)}_releaseMouse(){let e=this._dragData;e&&(this._dragData=null,this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),e.dragAborted=!0,e.dragActive&&(qs.resetTabPositions(this.contentNode.children,this._orientation),e.override.dispose(),e.tab.classList.remove("lm-mod-dragging"),this.removeClass("lm-mod-dragging")))}_adjustCurrentForInsert(e,i){let n=this.currentTitle,r=this._currentIndex,s=this.insertBehavior;if(s==="select-tab"||s==="select-tab-if-needed"&&r===-1){this._currentIndex=e,this._previousTitle=n,this._currentChanged.emit({previousIndex:r,previousTitle:n,currentIndex:e,currentTitle:i});return}r>=e&&this._currentIndex++}_adjustCurrentForMove(e,i){this._currentIndex===e?this._currentIndex=i:this._currentIndex=i?this._currentIndex++:this._currentIndex>e&&this._currentIndex<=i&&this._currentIndex--}_adjustCurrentForRemove(e,i){let 
n=this._currentIndex,r=this.removeBehavior;if(n!==e){n>e&&this._currentIndex--;return}if(this._titles.length===0){this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null});return}if(r==="select-tab-after"){this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-tab-before"){this._currentIndex=Math.max(0,e-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-previous-tab"){this._previousTitle?(this._currentIndex=this._titles.indexOf(this._previousTitle),this._previousTitle=null):this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}_onTitleChanged(e){this.update()}};(function(t){class e{constructor(){this.closeIconSelector=".lm-TabBar-tabCloseIcon",this._tabID=0,this._tabKeys=new WeakMap,this._uuid=++e._nInstance}renderTab(n){let r=n.title.caption,s=this.createTabKey(n),o=s,a=this.createTabStyle(n),l=this.createTabClass(n),c=this.createTabDataset(n),u=this.createTabARIA(n);return n.title.closable?Ce.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n),this.renderCloseIcon(n)):Ce.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n))}renderIcon(n){let{title:r}=n,s=this.createIconClass(n);return Ce.div({className:s},r.icon,r.iconLabel)}renderLabel(n){return Ce.div({className:"lm-TabBar-tabLabel"},n.title.label)}renderCloseIcon(n){return Ce.div({className:"lm-TabBar-tabCloseIcon"})}createTabKey(n){let r=this._tabKeys.get(n.title);return r===void 0&&(r=`tab-key-${this._uuid}-${this._tabID++}`,this._tabKeys.set(n.title,r)),r}createTabStyle(n){return{zIndex:`${n.zIndex}`}}createTabClass(n){let r="lm-TabBar-tab";return n.title.className&&(r+=` ${n.title.className}`),n.title.closable&&(r+=" lm-mod-closable"),n.current&&(r+=" lm-mod-current"),r}createTabDataset(n){return n.title.dataset}createTabARIA(n){var r;return{role:"tab","aria-selected":n.current.toString(),tabindex:`${(r=n.tabIndex)!==null&&r!==void 0?r:"-1"}`}}createIconClass(n){let r="lm-TabBar-tabIcon",s=n.title.iconClass;return s?`${r} ${s}`:r}}e._nInstance=0,t.Renderer=e,t.defaultRenderer=new e,t.addButtonSelector=".lm-TabBar-addButton"})(gc||(gc={}));(function(t){t.DRAG_THRESHOLD=5,t.DETACH_THRESHOLD=20;function e(){let u=document.createElement("div"),d=document.createElement("ul");d.setAttribute("role","tablist"),d.className="lm-TabBar-content",u.appendChild(d);let f=document.createElement("div");return f.className="lm-TabBar-addButton lm-mod-hidden",f.setAttribute("tabindex","-1"),f.setAttribute("role","button"),u.appendChild(f),u}t.createNode=e;function i(u){return u instanceof eg?u:new eg(u)}t.asTitle=i;function n(u){let d=window.getComputedStyle(u);return 1e3*(parseFloat(d.transitionDuration)||0)}t.parseTransitionDuration=n;function r(u,d){let f=new Array(u.length);for(let h=0,m=u.length;h=t.DRAG_THRESHOLD||h>=t.DRAG_THRESHOLD}t.dragExceeded=s;function o(u,d){let f=u.contentRect;return d.clientX=f.right+t.DETACH_THRESHOLD||d.clientY=f.bottom+t.DETACH_THRESHOLD}t.detachExceeded=o;function a(u,d,f,h){let 
m,p,_,y;h==="horizontal"?(m=d.pressX,p=f.clientX-d.contentRect.left,_=f.clientX,y=d.contentRect.width):(m=d.pressY,p=f.clientY-d.contentRect.top,_=f.clientY,y=d.contentRect.height);let S=d.index,T=p-d.tabPressPos,O=T+d.tabSize;for(let A=0,b=u.length;A>1);if(Ad.index&&O>x)M=`${-d.tabSize-C.margin}px`,S=Math.max(S,A);else if(A===d.index){let w=_-m,E=y-(d.tabPos+d.tabSize);M=`${Math.max(-d.tabPos,Math.min(w,E))}px`}else M="";h==="horizontal"?u[A].style.left=M:u[A].style.top=M}d.targetIndex=S}t.layoutTabs=a;function l(u,d){let f;d==="horizontal"?f=u.contentRect.width:f=u.contentRect.height;let h;if(u.targetIndex===u.index)h=0;else if(u.targetIndex>u.index){let _=u.tabLayout[u.targetIndex];h=_.pos+_.size-u.tabSize-u.tabPos}else h=u.tabLayout[u.targetIndex].pos-u.tabPos;let m=f-(u.tabPos+u.tabSize),p=Math.max(-u.tabPos,Math.min(h,m));d==="horizontal"?u.tab.style.left=`${p}px`:u.tab.style.top=`${p}px`}t.finalizeTabPosition=l;function c(u,d){for(let f of u)d==="horizontal"?f.style.left="":f.style.top=""}t.resetTabPositions=c})(qs||(qs={}));sC=class extends sa{constructor(e){super(),this._spacing=4,this._dirty=!1,this._root=null,this._box=null,this._items=new Map,this.renderer=e.renderer,e.spacing!==void 0&&(this._spacing=ig.clampDimension(e.spacing)),this._document=e.document||document,this._hiddenMode=e.hiddenMode!==void 0?e.hiddenMode:me.HiddenMode.Display}dispose(){let e=this[Symbol.iterator]();this._items.forEach(i=>{i.dispose()}),this._box=null,this._root=null,this._items.clear();for(let i of e)i.dispose();super.dispose()}get hiddenMode(){return this._hiddenMode}set hiddenMode(e){if(this._hiddenMode!==e){this._hiddenMode=e;for(let i of this.tabBars())if(i.titles.length>1)for(let n of i.titles)n.owner.hiddenMode=this._hiddenMode}}get spacing(){return this._spacing}set spacing(e){e=ig.clampDimension(e),this._spacing!==e&&(this._spacing=e,this.parent&&this.parent.fit())}get isEmpty(){return this._root===null}[Symbol.iterator](){return this._root?this._root.iterAllWidgets():vf()}widgets(){return this._root?this._root.iterUserWidgets():vf()}selectedWidgets(){return this._root?this._root.iterSelectedWidgets():vf()}tabBars(){return this._root?this._root.iterTabBars():vf()}handles(){return this._root?this._root.iterHandles():vf()}moveHandle(e,i,n){let r=e.classList.contains("lm-mod-hidden");if(!this._root||r)return;let s=this._root.findSplitNode(e);if(!s)return;let o;s.node.orientation==="horizontal"?o=i-e.offsetLeft:o=n-e.offsetTop,o!==0&&(s.node.holdSizes(),ss.adjust(s.node.sizers,s.index,o),this.parent&&this.parent.update())}saveLayout(){return this._root?(this._root.holdAllSizes(),{main:this._root.createConfig()}):{main:null}}restoreLayout(e){let i=new Set,n;e.main?n=en.normalizeAreaConfig(e.main,i):n=null;let r=this.widgets(),s=this.tabBars(),o=this.handles();this._root=null;for(let a of r)i.has(a)||(a.parent=null);for(let a of s)a.dispose();for(let a of o)a.parentNode&&a.parentNode.removeChild(a);for(let a of i)a.parent=this.parent;n?this._root=en.realizeAreaConfig(n,{createTabBar:a=>this._createTabBar(),createHandle:()=>this._createHandle()},this._document):this._root=null,this.parent&&(i.forEach(a=>{this.attachWidget(a)}),this.parent.fit())}addWidget(e,i={}){let n=i.ref||null,r=i.mode||"tab-after",s=null;if(this._root&&n&&(s=this._root.findTabNode(n)),n&&!s)throw new Error("Reference widget is not in the 
layout.");switch(e.parent=this.parent,r){case"tab-after":this._insertTab(e,n,s,!0);break;case"tab-before":this._insertTab(e,n,s,!1);break;case"split-top":this._insertSplit(e,n,s,"vertical",!1);break;case"split-left":this._insertSplit(e,n,s,"horizontal",!1);break;case"split-right":this._insertSplit(e,n,s,"horizontal",!0);break;case"split-bottom":this._insertSplit(e,n,s,"vertical",!0);break;case"merge-top":this._insertSplit(e,n,s,"vertical",!1,!0);break;case"merge-left":this._insertSplit(e,n,s,"horizontal",!1,!0);break;case"merge-right":this._insertSplit(e,n,s,"horizontal",!0,!0);break;case"merge-bottom":this._insertSplit(e,n,s,"vertical",!0,!0);break}this.parent&&(this.attachWidget(e),this.parent.fit())}removeWidget(e){this._removeWidget(e),this.parent&&(this.detachWidget(e),this.parent.fit())}hitTestTabAreas(e,i){if(!this._root||!this.parent||!this.parent.isVisible)return null;this._box||(this._box=ei.boxSizing(this.parent.node));let n=this.parent.node.getBoundingClientRect(),r=e-n.left-this._box.borderLeft,s=i-n.top-this._box.borderTop,o=this._root.hitTestTabNodes(r,s);if(!o)return null;let{tabBar:a,top:l,left:c,width:u,height:d}=o,f=this._box.borderLeft+this._box.borderRight,h=this._box.borderTop+this._box.borderBottom,m=n.width-f-(c+u),p=n.height-h-(l+d);return{tabBar:a,x:r,y:s,top:l,left:c,right:m,bottom:p,width:u,height:d}}init(){super.init();for(let e of this)this.attachWidget(e);for(let e of this.handles())this.parent.node.appendChild(e);this.parent.fit()}attachWidget(e){this.parent.node!==e.node.parentNode&&(this._items.set(e,new Cu(e)),this.parent.isAttached&&Ae.sendMessage(e,me.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&Ae.sendMessage(e,me.Msg.AfterAttach))}detachWidget(e){if(this.parent.node!==e.node.parentNode)return;this.parent.isAttached&&Ae.sendMessage(e,me.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&Ae.sendMessage(e,me.Msg.AfterDetach);let i=this._items.get(e);i&&(this._items.delete(e),i.dispose())}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_removeWidget(e){if(!this._root)return;let i=this._root.findTabNode(e);if(!i)return;if(en.removeAria(e),i.tabBar.titles.length>1){if(i.tabBar.removeTab(e.title),this._hiddenMode===me.HiddenMode.Scale&&i.tabBar.titles.length==1){let f=i.tabBar.titles[0].owner;f.hiddenMode=me.HiddenMode.Display}return}if(i.tabBar.dispose(),this._root===i){this._root=null;return}this._root.holdAllSizes();let n=i.parent;i.parent=null;let r=Be.removeFirstOf(n.children,i),s=Be.removeAt(n.handles,r);if(Be.removeAt(n.sizers,r),s.parentNode&&s.parentNode.removeChild(s),n.children.length>1){n.syncHandles();return}let o=n.parent;n.parent=null;let a=n.children[0],l=n.handles[0];if(n.children.length=0,n.handles.length=0,n.sizers.length=0,l.parentNode&&l.parentNode.removeChild(l),this._root===n){a.parent=null,this._root=a;return}let c=o,u=c.children.indexOf(n);if(a instanceof en.TabLayoutNode){a.parent=c,c.children[u]=a;return}let d=Be.removeAt(c.handles,u);Be.removeAt(c.children,u),Be.removeAt(c.sizers,u),d.parentNode&&d.parentNode.removeChild(d);for(let f=0,h=a.children.length;f=this._left+this._width||m=this._top+this._height?null:this}createConfig(){let 
h=this.tabBar.titles.map(p=>p.owner),m=this.tabBar.currentIndex;return{type:"tab-area",widgets:h,currentIndex:m}}holdAllSizes(){}fit(h,m){let p=0,_=0,y=1/0,S=1/0,T=m.get(this.tabBar),O=this.tabBar.currentTitle,A=O?m.get(O.owner):void 0,[b,M]=this.sizers;return T&&T.fit(),A&&A.fit(),T&&!T.isHidden?(p=Math.max(p,T.minWidth),_+=T.minHeight,b.minSize=T.minHeight,b.maxSize=T.maxHeight):(b.minSize=0,b.maxSize=0),A&&!A.isHidden?(p=Math.max(p,A.minWidth),_+=A.minHeight,M.minSize=A.minHeight,M.maxSize=1/0):(M.minSize=0,M.maxSize=1/0),{minWidth:p,minHeight:_,maxWidth:y,maxHeight:S}}update(h,m,p,_,y,S){this._top=m,this._left=h,this._width=p,this._height=_;let T=S.get(this.tabBar),O=this.tabBar.currentTitle,A=O?S.get(O.owner):void 0;if(ss.calc(this.sizers,_),T&&!T.isHidden){let b=this.sizers[0].size;T.update(h,m,p,b),m+=b}if(A&&!A.isHidden){let b=this.sizers[1].size;A.update(h,m,p,b)}}}t.TabLayoutNode=r;class s{constructor(h){this.parent=null,this.normalized=!1,this.children=[],this.sizers=[],this.handles=[],this.orientation=h}*iterAllWidgets(){for(let h of this.children)yield*h.iterAllWidgets()}*iterUserWidgets(){for(let h of this.children)yield*h.iterUserWidgets()}*iterSelectedWidgets(){for(let h of this.children)yield*h.iterSelectedWidgets()}*iterTabBars(){for(let h of this.children)yield*h.iterTabBars()}*iterHandles(){yield*this.handles;for(let h of this.children)yield*h.iterHandles()}findTabNode(h){for(let m=0,p=this.children.length;m_.createConfig());return{type:"split-area",orientation:h,children:p,sizes:m}}syncHandles(){this.handles.forEach((h,m)=>{h.setAttribute("data-orientation",this.orientation),m===this.handles.length-1?h.classList.add("lm-mod-hidden"):h.classList.remove("lm-mod-hidden")})}holdSizes(){for(let h of this.sizers)h.sizeHint=h.size}holdAllSizes(){for(let h of this.children)h.holdAllSizes();this.holdSizes()}normalizeSizes(){let h=this.sizers.length;if(h===0)return;this.holdSizes();let m=this.sizers.reduce((p,_)=>p+_.sizeHint,0);if(m===0)for(let p of this.sizers)p.size=p.sizeHint=1/h;else for(let p of this.sizers)p.size=p.sizeHint/=m;this.normalized=!0}createNormalizedSizes(){let h=this.sizers.length;if(h===0)return[];let m=this.sizers.map(_=>_.size),p=m.reduce((_,y)=>_+y,0);if(p===0)for(let _=m.length-1;_>-1;_--)m[_]=1/h;else for(let _=m.length-1;_>-1;_--)m[_]/=p;return m}fit(h,m){let p=this.orientation==="horizontal",_=Math.max(0,this.children.length-1)*h,y=p?_:0,S=p?0:_,T=1/0,O=1/0;for(let A=0,b=this.children.length;A=m.length)&&(p=0),{type:"tab-area",widgets:m,currentIndex:p}}function c(f,h){let m=f.orientation,p=[],_=[];for(let y=0,S=f.children.length;y{let S=n(_,h,m),T=e(f.sizes[y]),O=h.createHandle();p.children.push(S),p.handles.push(O),p.sizers.push(T),S.parent=p}),p.syncHandles(),p.normalizeSizes(),p}})(en||(en={}));Mu=class extends me{constructor(e={}){super(),this._drag=null,this._tabsMovable=!0,this._tabsConstrained=!1,this._addButtonEnabled=!1,this._pressData=null,this._layoutModified=new Te(this),this._addRequested=new Te(this),this.addClass("lm-DockPanel"),this._document=e.document||document,this._mode=e.mode||"multiple-document",this._renderer=e.renderer||Mu.defaultRenderer,this._edges=e.edges||qi.DEFAULT_EDGES,e.tabsMovable!==void 0&&(this._tabsMovable=e.tabsMovable),e.tabsConstrained!==void 0&&(this._tabsConstrained=e.tabsConstrained),e.addButtonEnabled!==void 0&&(this._addButtonEnabled=e.addButtonEnabled),this.dataset.mode=this._mode;let i={createTabBar:()=>this._createTabBar(),createHandle:()=>this._createHandle()};this.layout=new 
pe{constructor(e){super({node:al.createNode()}),this._activeIndex=-1,this._items=[],this._results=null,this.addClass("lm-CommandPalette"),this.setFlag(pe.Flag.DisallowLayout),this.commands=e.commands,this.renderer=e.renderer||kf.defaultRenderer,this.commands.commandChanged.connect(this._onGenericChange,this),this.commands.keyBindingChanged.connect(this._onGenericChange,this)}dispose(){this._items.length=0,this._results=null,super.dispose()}get searchNode(){return this.node.getElementsByClassName("lm-CommandPalette-search")[0]}get inputNode(){return this.node.getElementsByClassName("lm-CommandPalette-input")[0]}get contentNode(){return this.node.getElementsByClassName("lm-CommandPalette-content")[0]}get items(){return this._items}addItem(e){let i=al.createItem(this.commands,e);return this._items.push(i),this.refresh(),i}addItems(e){let i=e.map(n=>al.createItem(this.commands,n));return i.forEach(n=>this._items.push(n)),this.refresh(),i}removeItem(e){this.removeItemAt(this._items.indexOf(e))}removeItemAt(e){He.removeAt(this._items,e)&&this.refresh()}clearItems(){this._items.length!==0&&(this._items.length=0,this.refresh())}refresh(){if(this._results=null,this.inputNode.value!==""){let e=this.node.getElementsByClassName("lm-close-icon")[0];e.style.display="inherit"}else{let e=this.node.getElementsByClassName("lm-close-icon")[0];e.style.display="none"}this.update()}handleEvent(e){switch(e.type){case"click":this._evtClick(e);break;case"keydown":this._evtKeyDown(e);break;case"input":this.refresh();break;case"focus":case"blur":this._toggleFocused();break}}onBeforeAttach(e){this.node.addEventListener("click",this),this.node.addEventListener("keydown",this),this.node.addEventListener("input",this),this.node.addEventListener("focus",this,!0),this.node.addEventListener("blur",this,!0)}onAfterDetach(e){this.node.removeEventListener("click",this),this.node.removeEventListener("keydown",this),this.node.removeEventListener("input",this),this.node.removeEventListener("focus",this,!0),this.node.removeEventListener("blur",this,!0)}onAfterShow(e){this.update(),super.onAfterShow(e)}onActivateRequest(e){if(this.isAttached){let i=this.inputNode;i.focus(),i.select()}}onUpdateRequest(e){if(this.isHidden)return;let i=this.inputNode.value,n=this.contentNode,r=this._results;if(r||(r=this._results=al.search(this._items,i),this._activeIndex=i?He.findFirstIndex(r,al.canActivate):-1),!i&&r.length===0){Zt.render(null,n);return}if(i&&r.length===0){let l=this.renderer.renderEmptyMessage({query:i});Zt.render(l,n);return}let s=this.renderer,o=this._activeIndex,a=new Array(r.length);for(let l=0,c=r.length;l=r.length)n.scrollTop=0;else{let l=n.children[o];ti.scrollIntoViewIfNeeded(n,l)}}_evtClick(e){if(e.button!==0)return;if(e.target.classList.contains("lm-close-icon")){this.inputNode.value="",this.refresh();return}let i=He.findFirstIndex(this.contentNode.children,n=>n.contains(e.target));i!==-1&&(e.preventDefault(),e.stopPropagation(),this._execute(i))}_evtKeyDown(e){if(!(e.altKey||e.ctrlKey||e.metaKey||e.shiftKey))switch(e.keyCode){case 13:e.preventDefault(),e.stopPropagation(),this._execute(this._activeIndex);break;case 38:e.preventDefault(),e.stopPropagation(),this._activatePreviousItem();break;case 40:e.preventDefault(),e.stopPropagation(),this._activateNextItem();break}}_activateNextItem(){if(!this._results||this._results.length===0)return;let e=this._activeIndex,i=this._results.length,n=eC-x),b=S.slice(0,A),M=S.slice(A);for(let C=0,x=M.length;Cp.command===h&&ul.JSONExt.deepEqual(p.args,m))||null}}})(al||(al={}));aa=class 
extends pe{constructor(e){super({node:vr.createNode()}),this._childIndex=-1,this._activeIndex=-1,this._openTimerID=0,this._closeTimerID=0,this._items=[],this._childMenu=null,this._parentMenu=null,this._aboutToClose=new Te(this),this._menuRequested=new Te(this),this.addClass("lm-Menu"),this.setFlag(pe.Flag.DisallowLayout),this.commands=e.commands,this.renderer=e.renderer||aa.defaultRenderer}dispose(){this.close(),this._items.length=0,super.dispose()}get aboutToClose(){return this._aboutToClose}get menuRequested(){return this._menuRequested}get parentMenu(){return this._parentMenu}get childMenu(){return this._childMenu}get rootMenu(){let e=this;for(;e._parentMenu;)e=e._parentMenu;return e}get leafMenu(){let e=this;for(;e._childMenu;)e=e._childMenu;return e}get contentNode(){return this.node.getElementsByClassName("lm-Menu-content")[0]}get activeItem(){return this._items[this._activeIndex]||null}set activeItem(e){this.activeIndex=e?this._items.indexOf(e):-1}get activeIndex(){return this._activeIndex}set activeIndex(e){(e<0||e>=this._items.length)&&(e=-1),e!==-1&&!vr.canActivate(this._items[e])&&(e=-1),this._activeIndex!==e&&(this._activeIndex=e,this._activeIndex>=0&&this.contentNode.childNodes[this._activeIndex]&&this.contentNode.childNodes[this._activeIndex].focus(),this.update())}get items(){return this._items}activateNextItem(){let e=this._items.length,i=this._activeIndex,n=i{this.activeIndex=a}})}Zt.render(o,this.contentNode)}onCloseRequest(e){this._cancelOpenTimer(),this._cancelCloseTimer(),this.activeIndex=-1;let i=this._childMenu;i&&(this._childIndex=-1,this._childMenu=null,i._parentMenu=null,i.close());let n=this._parentMenu;n&&(this._parentMenu=null,n._childIndex=-1,n._childMenu=null,n.activate()),this.isAttached&&this._aboutToClose.emit(void 0),super.onCloseRequest(e)}_evtKeyDown(e){e.preventDefault(),e.stopPropagation();let i=e.keyCode;if(i===13){this.triggerActiveItem();return}if(i===27){this.close();return}if(i===37){this._parentMenu?this.close():this._menuRequested.emit("previous");return}if(i===38){this.activatePreviousItem();return}if(i===39){let o=this.activeItem;o&&o.type==="submenu"?this.triggerActiveItem():this.rootMenu._menuRequested.emit("next");return}if(i===40){this.activateNextItem();return}let n=ks().keyForKeydownEvent(e);if(!n)return;let r=this._activeIndex+1,s=vr.findMnemonic(this._items,n,r);s.index!==-1&&!s.multiple?(this.activeIndex=s.index,this.triggerActiveItem()):s.index!==-1?this.activeIndex=s.index:s.auto!==-1&&(this.activeIndex=s.auto)}_evtMouseUp(e){e.button===0&&(e.preventDefault(),e.stopPropagation(),this.triggerActiveItem())}_evtMouseMove(e){let i=He.findFirstIndex(this.contentNode.children,r=>ti.hitTest(r,e.clientX,e.clientY));if(i===this._activeIndex)return;if(this.activeIndex=i,i=this.activeIndex,i===this._childIndex){this._cancelOpenTimer(),this._cancelCloseTimer();return}this._childIndex!==-1&&this._startCloseTimer(),this._cancelOpenTimer();let n=this.activeItem;!n||n.type!=="submenu"||!n.submenu||this._startOpenTimer()}_evtMouseEnter(e){for(let i=this._parentMenu;i;i=i._parentMenu)i._cancelOpenTimer(),i._cancelCloseTimer(),i.activeIndex=i._childIndex}_evtMouseLeave(e){if(this._cancelOpenTimer(),!this._childMenu){this.activeIndex=-1;return}let{clientX:i,clientY:n}=e;if(ti.hitTest(this._childMenu.node,i,n)){this._cancelCloseTimer();return}this.activeIndex=-1,this._startCloseTimer()}_evtMouseDown(e){this._parentMenu||(vr.hitTestMenus(this,e.clientX,e.clientY)?(e.preventDefault(),e.stopPropagation()):this.close())}_openChildMenu(e=!1){let 
i=this.activeItem;if(!i||i.type!=="submenu"||!i.submenu){this._closeChildMenu();return}let n=i.submenu;if(n===this._childMenu)return;aa.saveWindowData(),this._closeChildMenu(),this._childMenu=n,this._childIndex=this._activeIndex,n._parentMenu=this,Oe.sendMessage(this,pe.Msg.UpdateRequest);let r=this.contentNode.children[this._activeIndex];vr.openSubmenu(n,r),e&&(n.activeIndex=-1,n.activateNextItem()),n.activate()}_closeChildMenu(){this._childMenu&&this._childMenu.close()}_startOpenTimer(){this._openTimerID===0&&(this._openTimerID=window.setTimeout(()=>{this._openTimerID=0,this._openChildMenu()},vr.TIMER_DELAY))}_startCloseTimer(){this._closeTimerID===0&&(this._closeTimerID=window.setTimeout(()=>{this._closeTimerID=0,this._closeChildMenu()},vr.TIMER_DELAY))}_cancelOpenTimer(){this._openTimerID!==0&&(clearTimeout(this._openTimerID),this._openTimerID=0)}_cancelCloseTimer(){this._closeTimerID!==0&&(clearTimeout(this._closeTimerID),this._closeTimerID=0)}static saveWindowData(){vr.saveWindowData()}};(function(t){class e{renderItem(n){let r=this.createItemClass(n),s=this.createItemDataset(n),o=this.createItemARIA(n);return Ce.li({className:r,dataset:s,tabindex:"0",onfocus:n.onfocus,...o},this.renderIcon(n),this.renderLabel(n),this.renderShortcut(n),this.renderSubmenu(n))}renderIcon(n){let r=this.createIconClass(n);return Ce.div({className:r},n.item.icon,n.item.iconLabel)}renderLabel(n){let r=this.formatLabel(n);return Ce.div({className:"lm-Menu-itemLabel"},r)}renderShortcut(n){let r=this.formatShortcut(n);return Ce.div({className:"lm-Menu-itemShortcut"},r)}renderSubmenu(n){return Ce.div({className:"lm-Menu-itemSubmenuIcon"})}createItemClass(n){let r="lm-Menu-item";n.item.isEnabled||(r+=" lm-mod-disabled"),n.item.isToggled&&(r+=" lm-mod-toggled"),n.item.isVisible||(r+=" lm-mod-hidden"),n.active&&(r+=" lm-mod-active"),n.collapsed&&(r+=" lm-mod-collapsed");let s=n.item.className;return s&&(r+=` ${s}`),r}createItemDataset(n){let r,{type:s,command:o,dataset:a}=n.item;return s==="command"?r={...a,type:s,command:o}:r={...a,type:s},r}createIconClass(n){let r="lm-Menu-itemIcon",s=n.item.iconClass;return s?`${r} ${s}`:r}createItemARIA(n){let r={};switch(n.item.type){case"separator":r.role="presentation";break;case"submenu":r["aria-haspopup"]="true",n.item.isEnabled||(r["aria-disabled"]="true");break;default:n.item.isEnabled||(r["aria-disabled"]="true"),r.role="menuitem"}return r}formatLabel(n){let{label:r,mnemonic:s}=n.item;if(s<0||s>=r.length)return r;let o=r.slice(0,s),a=r.slice(s+1),l=r[s],c=Ce.span({className:"lm-Menu-itemMnemonic"},l);return[o,c,a]}formatShortcut(n){let r=n.item.keyBinding;return r?er.formatKeystroke(r.keys):null}}t.Renderer=e,t.defaultRenderer=new e})(aa||(aa={}));(function(t){t.TIMER_DELAY=300,t.SUBMENU_OVERLAP=3;let e=null,i=0;function n(){return i>0?(i--,e):u()}function r(){e=u(),i++}t.saveWindowData=r;function s(){let p=document.createElement("div"),_=document.createElement("ul");return _.className="lm-Menu-content",p.appendChild(_),_.setAttribute("role","menu"),p.tabIndex=0,p}t.createNode=s;function o(p){return p.type!=="separator"&&p.isEnabled&&p.isVisible}t.canActivate=o;function a(p,_){return new m(p.commands,_)}t.createItem=a;function l(p,_,y){for(let S=p;S;S=S.childMenu)if(ti.hitTest(S.node,_,y))return!0;return!1}t.hitTestMenus=l;function c(p){let _=new Array(p.length);He.fill(_,!1);let y=0,S=p.length;for(;y=0;--T){let A=p[T];if(A.isVisible){if(A.type!=="separator")break;_[T]=!0}}let 
O=!1;for(;++yM+x&&(_=M+x-Z),!T&&y+X>C+w&&(y>C+w?y=C+w-X:y=y-X),B.transform=`translate(${Math.max(0,_)}px, ${Math.max(0,y)}px`,B.opacity="1"}t.openRootMenu=d;function f(p,_){let y=n(),S=y.pageXOffset,T=y.pageYOffset,O=y.clientWidth,A=y.clientHeight;Oe.sendMessage(p,pe.Msg.UpdateRequest);let b=A,M=p.node,C=M.style;C.opacity="0",C.maxHeight=`${b}px`,pe.attach(p,document.body);let{width:x,height:w}=M.getBoundingClientRect(),E=ti.boxSizing(p.node),N=_.getBoundingClientRect(),B=N.right-t.SUBMENU_OVERLAP;B+x>S+O&&(B=N.left+t.SUBMENU_OVERLAP-x);let Z=N.top-E.borderTop-E.paddingTop;Z+w>T+A&&(Z=N.bottom+E.borderBottom+E.paddingBottom-w),C.transform=`translate(${Math.max(0,B)}px, ${Math.max(0,Z)}px`,C.opacity="1"}t.openSubmenu=f;function h(p,_,y){let S=-1,T=-1,O=!1,A=_.toUpperCase();for(let b=0,M=p.length;b=0&&ES.command===_&&ul.JSONExt.deepEqual(S.args,y))||null}return null}}})(vr||(vr={}));(function(t){function e(o,a){let l=n(o.selector),c=o.rank!==void 0?o.rank:1/0;return{...o,selector:l,rank:c,id:a}}t.createItem=e;function i(o,a,l,c){let u=a.target;if(!u)return null;let d=a.currentTarget;if(!d||!d.contains(u)&&(u=document.elementFromPoint(a.clientX,a.clientY),!u||!d.contains(u)))return null;let f=[],h=o.slice();for(;u!==null;){let m=[];for(let p=0,_=h.length;p<_;++p){let y=h[p];y&&Tu.matches(u,y.selector)&&(m.push(y),h[p]=null)}if(m.length!==0&&(l&&m.sort(c?s:r),f.push(...m)),u===d)break;u=u.parentElement}return l||f.sort(c?s:r),f}t.matchItems=i;function n(o){if(o.indexOf(",")!==-1)throw new Error(`Selector cannot contain commas: ${o}`);if(!Tu.isValid(o))throw new Error(`Invalid selector: ${o}`);return o}function r(o,a){let l=o.rank,c=a.rank;return l!==c?l=this._titles.length)&&(e=-1),this._currentIndex===e)return;let i=this._currentIndex,n=this._titles[i]||null,r=e,s=this._titles[r]||null;this._currentIndex=r,this._previousTitle=n,this.update(),this._currentChanged.emit({previousIndex:i,previousTitle:n,currentIndex:r,currentTitle:s})}get name(){return this._name}set name(e){this._name=e,e?this.contentNode.setAttribute("aria-label",e):this.contentNode.removeAttribute("aria-label")}get orientation(){return this._orientation}set orientation(e){this._orientation!==e&&(this._releaseMouse(),this._orientation=e,this.dataset.orientation=e,this.contentNode.setAttribute("aria-orientation",e))}get addButtonEnabled(){return this._addButtonEnabled}set addButtonEnabled(e){this._addButtonEnabled!==e&&(this._addButtonEnabled=e,e?this.addButtonNode.classList.remove("lm-mod-hidden"):this.addButtonNode.classList.add("lm-mod-hidden"))}get titles(){return this._titles}get contentNode(){return this.node.getElementsByClassName("lm-TabBar-content")[0]}get addButtonNode(){return this.node.getElementsByClassName("lm-TabBar-addButton")[0]}addTab(e){return this.insertTab(this._titles.length,e)}insertTab(e,i){this._releaseMouse();let n=Us.asTitle(i),r=this._titles.indexOf(n),s=Math.max(0,Math.min(e,this._titles.length));return r===-1?(He.insert(this._titles,s,n),n.changed.connect(this._onTitleChanged,this),this.update(),this._adjustCurrentForInsert(s,n),n):(s===this._titles.length&&s--,r===s||(He.move(this._titles,r,s),this.update(),this._adjustCurrentForMove(r,s)),n)}removeTab(e){this.removeTabAt(this._titles.indexOf(e))}removeTabAt(e){this._releaseMouse();let i=He.removeAt(this._titles,e);i&&(i.changed.disconnect(this._onTitleChanged,this),i===this._previousTitle&&(this._previousTitle=null),this.update(),this._adjustCurrentForRemove(e,i))}clearTabs(){if(this._titles.length===0)return;this._releaseMouse();for(let n of 
this._titles)n.changed.disconnect(this._onTitleChanged,this);let e=this.currentIndex,i=this.currentTitle;this._currentIndex=-1,this._previousTitle=null,this._titles.length=0,this.update(),e!==-1&&this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}releaseMouse(){this._releaseMouse()}handleEvent(e){switch(e.type){case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"dblclick":this._evtDblClick(e);break;case"keydown":e.eventPhase===Event.CAPTURING_PHASE?this._evtKeyDownCapturing(e):this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("pointerdown",this),this.node.addEventListener("dblclick",this),this.node.addEventListener("keydown",this)}onAfterDetach(e){this.node.removeEventListener("pointerdown",this),this.node.removeEventListener("dblclick",this),this.node.removeEventListener("keydown",this),this._releaseMouse()}onUpdateRequest(e){var i;let n=this._titles,r=this.renderer,s=this.currentTitle,o=new Array(n.length),a=(i=this._getCurrentTabindex())!==null&&i!==void 0?i:this._currentIndex>-1?this._currentIndex:0;for(let l=0,c=n.length;lti.hitTest(o,e.clientX,e.clientY));if(n===-1)return;let r=this.titles[n],s=i[n].querySelector(".lm-TabBar-tabLabel");if(s&&s.contains(e.target)){let o=r.label||"",a=s.innerHTML;s.innerHTML="";let l=document.createElement("input");l.classList.add("lm-TabBar-tabInput"),l.value=o,s.appendChild(l);let c=()=>{l.removeEventListener("blur",c),s.innerHTML=a,this.node.addEventListener("keydown",this)};l.addEventListener("dblclick",u=>u.stopPropagation()),l.addEventListener("blur",c),l.addEventListener("keydown",u=>{u.key==="Enter"?(l.value!==""&&(r.label=r.caption=l.value),c()):u.key==="Escape"&&c()}),this.node.removeEventListener("keydown",this),l.select(),l.focus(),s.children.length>0&&s.children[0].focus()}}_evtKeyDownCapturing(e){e.eventPhase===Event.CAPTURING_PHASE&&(e.preventDefault(),e.stopPropagation(),e.key==="Escape"&&this._releaseMouse())}_evtKeyDown(e){var i,n,r;if(!(e.key==="Tab"||e.eventPhase===Event.CAPTURING_PHASE)){if(e.key==="Enter"||e.key==="Spacebar"||e.key===" "){let s=document.activeElement;if(this.addButtonEnabled&&this.addButtonNode.contains(s))e.preventDefault(),e.stopPropagation(),this._addRequested.emit();else{let o=He.findFirstIndex(this.contentNode.children,a=>a.contains(s));o>=0&&(e.preventDefault(),e.stopPropagation(),this.currentIndex=o)}}else if(qV.includes(e.key)){let s=[...this.contentNode.children];if(this.addButtonEnabled&&s.push(this.addButtonNode),s.length<=1)return;e.preventDefault(),e.stopPropagation();let o=s.indexOf(document.activeElement);o===-1&&(o=this._currentIndex);let a;e.key==="ArrowRight"&&this._orientation==="horizontal"||e.key==="ArrowDown"&&this._orientation==="vertical"?a=(i=s[o+1])!==null&&i!==void 0?i:s[0]:e.key==="ArrowLeft"&&this._orientation==="horizontal"||e.key==="ArrowUp"&&this._orientation==="vertical"?a=(n=s[o-1])!==null&&n!==void 0?n:s[s.length-1]:e.key==="Home"?a=s[0]:e.key==="End"&&(a=s[s.length-1]),a&&((r=s[o])===null||r===void 0||r.setAttribute("tabindex","-1"),a?.setAttribute("tabindex","0"),a.focus())}}}_evtPointerDown(e){if(e.button!==0&&e.button!==1||this._dragData||e.target.classList.contains("lm-TabBar-tabInput"))return;let 
i=this.addButtonEnabled&&this.addButtonNode.contains(e.target),n=this.contentNode.children,r=He.findFirstIndex(n,o=>ti.hitTest(o,e.clientX,e.clientY));if(r===-1&&!i||(e.preventDefault(),e.stopPropagation(),this._dragData={tab:n[r],index:r,pressX:e.clientX,pressY:e.clientY,tabPos:-1,tabSize:-1,tabPressPos:-1,targetIndex:-1,tabLayout:null,contentRect:null,override:null,dragActive:!1,dragAborted:!1,detachRequested:!1},this.document.addEventListener("pointerup",this,!0),e.button===1||i))return;let s=n[r].querySelector(this.renderer.closeIconSelector);s&&s.contains(e.target)||(this.tabsMovable&&(this.document.addEventListener("pointermove",this,!0),this.document.addEventListener("keydown",this,!0),this.document.addEventListener("contextmenu",this,!0)),this.allowDeselect&&this.currentIndex===r?this.currentIndex=-1:this.currentIndex=r,this.currentIndex!==-1&&this._tabActivateRequested.emit({index:this.currentIndex,title:this.currentTitle}))}_evtPointerMove(e){let i=this._dragData;if(!i)return;e.preventDefault(),e.stopPropagation();let n=this.contentNode.children;if(!(!i.dragActive&&!Us.dragExceeded(i,e))){if(!i.dragActive){let r=i.tab.getBoundingClientRect();this._orientation==="horizontal"?(i.tabPos=i.tab.offsetLeft,i.tabSize=r.width,i.tabPressPos=i.pressX-r.left):(i.tabPos=i.tab.offsetTop,i.tabSize=r.height,i.tabPressPos=i.pressY-r.top),i.tabPressOffset={x:i.pressX-r.left,y:i.pressY-r.top},i.tabLayout=Us.snapTabLayout(n,this._orientation),i.contentRect=this.contentNode.getBoundingClientRect(),i.override=an.overrideCursor("default"),i.tab.classList.add("lm-mod-dragging"),this.addClass("lm-mod-dragging"),i.dragActive=!0}if(!i.detachRequested&&Us.detachExceeded(i,e)){i.detachRequested=!0;let r=i.index,s=e.clientX,o=e.clientY,a=n[r],l=this._titles[r];if(this._tabDetachRequested.emit({index:r,title:l,tab:a,clientX:s,clientY:o,offset:i.tabPressOffset}),i.dragAborted)return}Us.layoutTabs(n,i,e,this._orientation)}}_evtPointerUp(e){if(e.button!==0&&e.button!==1)return;let i=this._dragData;if(!i)return;if(e.preventDefault(),e.stopPropagation(),this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),!i.dragActive){if(this._dragData=null,this.addButtonEnabled&&this.addButtonNode.contains(e.target)){this._addRequested.emit(void 0);return}let s=this.contentNode.children,o=He.findFirstIndex(s,c=>ti.hitTest(c,e.clientX,e.clientY));if(o!==i.index)return;let a=this._titles[o];if(!a.closable)return;if(e.button===1){this._tabCloseRequested.emit({index:o,title:a});return}let l=s[o].querySelector(this.renderer.closeIconSelector);if(l&&l.contains(e.target)){this._tabCloseRequested.emit({index:o,title:a});return}return}if(e.button!==0)return;Us.finalizeTabPosition(i,this._orientation),i.tab.classList.remove("lm-mod-dragging");let n=Us.parseTransitionDuration(i.tab);setTimeout(()=>{if(i.dragAborted)return;this._dragData=null,Us.resetTabPositions(this.contentNode.children,this._orientation),i.override.dispose(),this.removeClass("lm-mod-dragging");let r=i.index,s=i.targetIndex;s===-1||r===s||(He.move(this._titles,r,s),this._adjustCurrentForMove(r,s),this._tabMoved.emit({fromIndex:r,toIndex:s,title:this._titles[s]}),Oe.sendMessage(this,pe.Msg.UpdateRequest))},n)}_releaseMouse(){let 
e=this._dragData;e&&(this._dragData=null,this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),e.dragAborted=!0,e.dragActive&&(Us.resetTabPositions(this.contentNode.children,this._orientation),e.override.dispose(),e.tab.classList.remove("lm-mod-dragging"),this.removeClass("lm-mod-dragging")))}_adjustCurrentForInsert(e,i){let n=this.currentTitle,r=this._currentIndex,s=this.insertBehavior;if(s==="select-tab"||s==="select-tab-if-needed"&&r===-1){this._currentIndex=e,this._previousTitle=n,this._currentChanged.emit({previousIndex:r,previousTitle:n,currentIndex:e,currentTitle:i});return}r>=e&&this._currentIndex++}_adjustCurrentForMove(e,i){this._currentIndex===e?this._currentIndex=i:this._currentIndex=i?this._currentIndex++:this._currentIndex>e&&this._currentIndex<=i&&this._currentIndex--}_adjustCurrentForRemove(e,i){let n=this._currentIndex,r=this.removeBehavior;if(n!==e){n>e&&this._currentIndex--;return}if(this._titles.length===0){this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null});return}if(r==="select-tab-after"){this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-tab-before"){this._currentIndex=Math.max(0,e-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-previous-tab"){this._previousTitle?(this._currentIndex=this._titles.indexOf(this._previousTitle),this._previousTitle=null):this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}_onTitleChanged(e){this.update()}};(function(t){class e{constructor(){this.closeIconSelector=".lm-TabBar-tabCloseIcon",this._tabID=0,this._tabKeys=new WeakMap,this._uuid=++e._nInstance}renderTab(n){let r=n.title.caption,s=this.createTabKey(n),o=s,a=this.createTabStyle(n),l=this.createTabClass(n),c=this.createTabDataset(n),u=this.createTabARIA(n);return n.title.closable?Ce.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n),this.renderCloseIcon(n)):Ce.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n))}renderIcon(n){let{title:r}=n,s=this.createIconClass(n);return Ce.div({className:s},r.icon,r.iconLabel)}renderLabel(n){return Ce.div({className:"lm-TabBar-tabLabel"},n.title.label)}renderCloseIcon(n){return Ce.div({className:"lm-TabBar-tabCloseIcon"})}createTabKey(n){let r=this._tabKeys.get(n.title);return r===void 0&&(r=`tab-key-${this._uuid}-${this._tabID++}`,this._tabKeys.set(n.title,r)),r}createTabStyle(n){return{zIndex:`${n.zIndex}`}}createTabClass(n){let r="lm-TabBar-tab";return n.title.className&&(r+=` ${n.title.className}`),n.title.closable&&(r+=" lm-mod-closable"),n.current&&(r+=" lm-mod-current"),r}createTabDataset(n){return n.title.dataset}createTabARIA(n){var r;return{role:"tab","aria-selected":n.current.toString(),tabindex:`${(r=n.tabIndex)!==null&&r!==void 0?r:"-1"}`}}createIconClass(n){let r="lm-TabBar-tabIcon",s=n.title.iconClass;return s?`${r} 
${s}`:r}}e._nInstance=0,t.Renderer=e,t.defaultRenderer=new e,t.addButtonSelector=".lm-TabBar-addButton"})(vc||(vc={}));(function(t){t.DRAG_THRESHOLD=5,t.DETACH_THRESHOLD=20;function e(){let u=document.createElement("div"),d=document.createElement("ul");d.setAttribute("role","tablist"),d.className="lm-TabBar-content",u.appendChild(d);let f=document.createElement("div");return f.className="lm-TabBar-addButton lm-mod-hidden",f.setAttribute("tabindex","-1"),f.setAttribute("role","button"),u.appendChild(f),u}t.createNode=e;function i(u){return u instanceof rg?u:new rg(u)}t.asTitle=i;function n(u){let d=window.getComputedStyle(u);return 1e3*(parseFloat(d.transitionDuration)||0)}t.parseTransitionDuration=n;function r(u,d){let f=new Array(u.length);for(let h=0,m=u.length;h=t.DRAG_THRESHOLD||h>=t.DRAG_THRESHOLD}t.dragExceeded=s;function o(u,d){let f=u.contentRect;return d.clientX=f.right+t.DETACH_THRESHOLD||d.clientY=f.bottom+t.DETACH_THRESHOLD}t.detachExceeded=o;function a(u,d,f,h){let m,p,_,y;h==="horizontal"?(m=d.pressX,p=f.clientX-d.contentRect.left,_=f.clientX,y=d.contentRect.width):(m=d.pressY,p=f.clientY-d.contentRect.top,_=f.clientY,y=d.contentRect.height);let S=d.index,T=p-d.tabPressPos,O=T+d.tabSize;for(let A=0,b=u.length;A>1);if(Ad.index&&O>x)M=`${-d.tabSize-C.margin}px`,S=Math.max(S,A);else if(A===d.index){let w=_-m,E=y-(d.tabPos+d.tabSize);M=`${Math.max(-d.tabPos,Math.min(w,E))}px`}else M="";h==="horizontal"?u[A].style.left=M:u[A].style.top=M}d.targetIndex=S}t.layoutTabs=a;function l(u,d){let f;d==="horizontal"?f=u.contentRect.width:f=u.contentRect.height;let h;if(u.targetIndex===u.index)h=0;else if(u.targetIndex>u.index){let _=u.tabLayout[u.targetIndex];h=_.pos+_.size-u.tabSize-u.tabPos}else h=u.tabLayout[u.targetIndex].pos-u.tabPos;let m=f-(u.tabPos+u.tabSize),p=Math.max(-u.tabPos,Math.min(h,m));d==="horizontal"?u.tab.style.left=`${p}px`:u.tab.style.top=`${p}px`}t.finalizeTabPosition=l;function c(u,d){for(let f of u)d==="horizontal"?f.style.left="":f.style.top=""}t.resetTabPositions=c})(Us||(Us={}));_C=class extends la{constructor(e){super(),this._spacing=4,this._dirty=!1,this._root=null,this._box=null,this._items=new Map,this.renderer=e.renderer,e.spacing!==void 0&&(this._spacing=og.clampDimension(e.spacing)),this._document=e.document||document,this._hiddenMode=e.hiddenMode!==void 0?e.hiddenMode:pe.HiddenMode.Display}dispose(){let e=this[Symbol.iterator]();this._items.forEach(i=>{i.dispose()}),this._box=null,this._root=null,this._items.clear();for(let i of e)i.dispose();super.dispose()}get hiddenMode(){return this._hiddenMode}set hiddenMode(e){if(this._hiddenMode!==e){this._hiddenMode=e;for(let i of this.tabBars())if(i.titles.length>1)for(let n of i.titles)n.owner.hiddenMode=this._hiddenMode}}get spacing(){return this._spacing}set spacing(e){e=og.clampDimension(e),this._spacing!==e&&(this._spacing=e,this.parent&&this.parent.fit())}get isEmpty(){return this._root===null}[Symbol.iterator](){return this._root?this._root.iterAllWidgets():Mf()}widgets(){return this._root?this._root.iterUserWidgets():Mf()}selectedWidgets(){return this._root?this._root.iterSelectedWidgets():Mf()}tabBars(){return this._root?this._root.iterTabBars():Mf()}handles(){return this._root?this._root.iterHandles():Mf()}moveHandle(e,i,n){let r=e.classList.contains("lm-mod-hidden");if(!this._root||r)return;let s=this._root.findSplitNode(e);if(!s)return;let 
o;s.node.orientation==="horizontal"?o=i-e.offsetLeft:o=n-e.offsetTop,o!==0&&(s.node.holdSizes(),as.adjust(s.node.sizers,s.index,o),this.parent&&this.parent.update())}saveLayout(){return this._root?(this._root.holdAllSizes(),{main:this._root.createConfig()}):{main:null}}restoreLayout(e){let i=new Set,n;e.main?n=tn.normalizeAreaConfig(e.main,i):n=null;let r=this.widgets(),s=this.tabBars(),o=this.handles();this._root=null;for(let a of r)i.has(a)||(a.parent=null);for(let a of s)a.dispose();for(let a of o)a.parentNode&&a.parentNode.removeChild(a);for(let a of i)a.parent=this.parent;n?this._root=tn.realizeAreaConfig(n,{createTabBar:a=>this._createTabBar(),createHandle:()=>this._createHandle()},this._document):this._root=null,this.parent&&(i.forEach(a=>{this.attachWidget(a)}),this.parent.fit())}addWidget(e,i={}){let n=i.ref||null,r=i.mode||"tab-after",s=null;if(this._root&&n&&(s=this._root.findTabNode(n)),n&&!s)throw new Error("Reference widget is not in the layout.");switch(e.parent=this.parent,r){case"tab-after":this._insertTab(e,n,s,!0);break;case"tab-before":this._insertTab(e,n,s,!1);break;case"split-top":this._insertSplit(e,n,s,"vertical",!1);break;case"split-left":this._insertSplit(e,n,s,"horizontal",!1);break;case"split-right":this._insertSplit(e,n,s,"horizontal",!0);break;case"split-bottom":this._insertSplit(e,n,s,"vertical",!0);break;case"merge-top":this._insertSplit(e,n,s,"vertical",!1,!0);break;case"merge-left":this._insertSplit(e,n,s,"horizontal",!1,!0);break;case"merge-right":this._insertSplit(e,n,s,"horizontal",!0,!0);break;case"merge-bottom":this._insertSplit(e,n,s,"vertical",!0,!0);break}this.parent&&(this.attachWidget(e),this.parent.fit())}removeWidget(e){this._removeWidget(e),this.parent&&(this.detachWidget(e),this.parent.fit())}hitTestTabAreas(e,i){if(!this._root||!this.parent||!this.parent.isVisible)return null;this._box||(this._box=ti.boxSizing(this.parent.node));let n=this.parent.node.getBoundingClientRect(),r=e-n.left-this._box.borderLeft,s=i-n.top-this._box.borderTop,o=this._root.hitTestTabNodes(r,s);if(!o)return null;let{tabBar:a,top:l,left:c,width:u,height:d}=o,f=this._box.borderLeft+this._box.borderRight,h=this._box.borderTop+this._box.borderBottom,m=n.width-f-(c+u),p=n.height-h-(l+d);return{tabBar:a,x:r,y:s,top:l,left:c,right:m,bottom:p,width:u,height:d}}init(){super.init();for(let e of this)this.attachWidget(e);for(let e of this.handles())this.parent.node.appendChild(e);this.parent.fit()}attachWidget(e){this.parent.node!==e.node.parentNode&&(this._items.set(e,new ku(e)),this.parent.isAttached&&Oe.sendMessage(e,pe.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&Oe.sendMessage(e,pe.Msg.AfterAttach))}detachWidget(e){if(this.parent.node!==e.node.parentNode)return;this.parent.isAttached&&Oe.sendMessage(e,pe.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&Oe.sendMessage(e,pe.Msg.AfterDetach);let i=this._items.get(e);i&&(this._items.delete(e),i.dispose())}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_removeWidget(e){if(!this._root)return;let 
i=this._root.findTabNode(e);if(!i)return;if(tn.removeAria(e),i.tabBar.titles.length>1){if(i.tabBar.removeTab(e.title),this._hiddenMode===pe.HiddenMode.Scale&&i.tabBar.titles.length==1){let f=i.tabBar.titles[0].owner;f.hiddenMode=pe.HiddenMode.Display}return}if(i.tabBar.dispose(),this._root===i){this._root=null;return}this._root.holdAllSizes();let n=i.parent;i.parent=null;let r=He.removeFirstOf(n.children,i),s=He.removeAt(n.handles,r);if(He.removeAt(n.sizers,r),s.parentNode&&s.parentNode.removeChild(s),n.children.length>1){n.syncHandles();return}let o=n.parent;n.parent=null;let a=n.children[0],l=n.handles[0];if(n.children.length=0,n.handles.length=0,n.sizers.length=0,l.parentNode&&l.parentNode.removeChild(l),this._root===n){a.parent=null,this._root=a;return}let c=o,u=c.children.indexOf(n);if(a instanceof tn.TabLayoutNode){a.parent=c,c.children[u]=a;return}let d=He.removeAt(c.handles,u);He.removeAt(c.children,u),He.removeAt(c.sizers,u),d.parentNode&&d.parentNode.removeChild(d);for(let f=0,h=a.children.length;f=this._left+this._width||m=this._top+this._height?null:this}createConfig(){let h=this.tabBar.titles.map(p=>p.owner),m=this.tabBar.currentIndex;return{type:"tab-area",widgets:h,currentIndex:m}}holdAllSizes(){}fit(h,m){let p=0,_=0,y=1/0,S=1/0,T=m.get(this.tabBar),O=this.tabBar.currentTitle,A=O?m.get(O.owner):void 0,[b,M]=this.sizers;return T&&T.fit(),A&&A.fit(),T&&!T.isHidden?(p=Math.max(p,T.minWidth),_+=T.minHeight,b.minSize=T.minHeight,b.maxSize=T.maxHeight):(b.minSize=0,b.maxSize=0),A&&!A.isHidden?(p=Math.max(p,A.minWidth),_+=A.minHeight,M.minSize=A.minHeight,M.maxSize=1/0):(M.minSize=0,M.maxSize=1/0),{minWidth:p,minHeight:_,maxWidth:y,maxHeight:S}}update(h,m,p,_,y,S){this._top=m,this._left=h,this._width=p,this._height=_;let T=S.get(this.tabBar),O=this.tabBar.currentTitle,A=O?S.get(O.owner):void 0;if(as.calc(this.sizers,_),T&&!T.isHidden){let b=this.sizers[0].size;T.update(h,m,p,b),m+=b}if(A&&!A.isHidden){let b=this.sizers[1].size;A.update(h,m,p,b)}}}t.TabLayoutNode=r;class s{constructor(h){this.parent=null,this.normalized=!1,this.children=[],this.sizers=[],this.handles=[],this.orientation=h}*iterAllWidgets(){for(let h of this.children)yield*h.iterAllWidgets()}*iterUserWidgets(){for(let h of this.children)yield*h.iterUserWidgets()}*iterSelectedWidgets(){for(let h of this.children)yield*h.iterSelectedWidgets()}*iterTabBars(){for(let h of this.children)yield*h.iterTabBars()}*iterHandles(){yield*this.handles;for(let h of this.children)yield*h.iterHandles()}findTabNode(h){for(let m=0,p=this.children.length;m_.createConfig());return{type:"split-area",orientation:h,children:p,sizes:m}}syncHandles(){this.handles.forEach((h,m)=>{h.setAttribute("data-orientation",this.orientation),m===this.handles.length-1?h.classList.add("lm-mod-hidden"):h.classList.remove("lm-mod-hidden")})}holdSizes(){for(let h of this.sizers)h.sizeHint=h.size}holdAllSizes(){for(let h of this.children)h.holdAllSizes();this.holdSizes()}normalizeSizes(){let h=this.sizers.length;if(h===0)return;this.holdSizes();let m=this.sizers.reduce((p,_)=>p+_.sizeHint,0);if(m===0)for(let p of this.sizers)p.size=p.sizeHint=1/h;else for(let p of this.sizers)p.size=p.sizeHint/=m;this.normalized=!0}createNormalizedSizes(){let h=this.sizers.length;if(h===0)return[];let m=this.sizers.map(_=>_.size),p=m.reduce((_,y)=>_+y,0);if(p===0)for(let _=m.length-1;_>-1;_--)m[_]=1/h;else for(let _=m.length-1;_>-1;_--)m[_]/=p;return m}fit(h,m){let p=this.orientation==="horizontal",_=Math.max(0,this.children.length-1)*h,y=p?_:0,S=p?0:_,T=1/0,O=1/0;for(let 
A=0,b=this.children.length;A=m.length)&&(p=0),{type:"tab-area",widgets:m,currentIndex:p}}function c(f,h){let m=f.orientation,p=[],_=[];for(let y=0,S=f.children.length;y{let S=n(_,h,m),T=e(f.sizes[y]),O=h.createHandle();p.children.push(S),p.handles.push(O),p.sizers.push(T),S.parent=p}),p.syncHandles(),p.normalizeSizes(),p}})(tn||(tn={}));Lu=class extends pe{constructor(e={}){super(),this._drag=null,this._tabsMovable=!0,this._tabsConstrained=!1,this._addButtonEnabled=!1,this._pressData=null,this._layoutModified=new Te(this),this._addRequested=new Te(this),this.addClass("lm-DockPanel"),this._document=e.document||document,this._mode=e.mode||"multiple-document",this._renderer=e.renderer||Lu.defaultRenderer,this._edges=e.edges||Ui.DEFAULT_EDGES,e.tabsMovable!==void 0&&(this._tabsMovable=e.tabsMovable),e.tabsConstrained!==void 0&&(this._tabsConstrained=e.tabsConstrained),e.addButtonEnabled!==void 0&&(this._addButtonEnabled=e.addButtonEnabled),this.dataset.mode=this._mode;let i={createTabBar:()=>this._createTabBar(),createHandle:()=>this._createHandle()};this.layout=new _C({document:this._document,renderer:i,spacing:e.spacing,hiddenMode:e.hiddenMode}),this.overlay=e.overlay||new Lu.Overlay,this.node.appendChild(this.overlay.node)}dispose(){this._releaseMouse(),this.overlay.hide(0),this._drag&&this._drag.dispose(),super.dispose()}get hiddenMode(){return this.layout.hiddenMode}set hiddenMode(e){this.layout.hiddenMode=e}get layoutModified(){return this._layoutModified}get addRequested(){return this._addRequested}get renderer(){return this.layout.renderer}get spacing(){return this.layout.spacing}set spacing(e){this.layout.spacing=e}get mode(){return this._mode}set mode(e){if(this._mode===e)return;this._mode=e,this.dataset.mode=e;let i=this.layout;switch(e){case"multiple-document":for(let n of i.tabBars())n.show();break;case"single-document":i.restoreLayout(Ui.createSingleDocumentConfig(this));break;default:throw"unreachable"}Oe.postMessage(this,Ui.LayoutModified)}get tabsMovable(){return this._tabsMovable}set tabsMovable(e){this._tabsMovable=e;for(let i of this.tabBars())i.tabsMovable=e}get tabsConstrained(){return this._tabsConstrained}set tabsConstrained(e){this._tabsConstrained=e}get addButtonEnabled(){return this._addButtonEnabled}set addButtonEnabled(e){this._addButtonEnabled=e;for(let i of this.tabBars())i.addButtonEnabled=e}get isEmpty(){return this.layout.isEmpty}*widgets(){yield*this.layout.widgets()}*selectedWidgets(){yield*this.layout.selectedWidgets()}*tabBars(){yield*this.layout.tabBars()}*handles(){yield*this.layout.handles()}selectWidget(e){let i=uC(this.tabBars(),n=>n.titles.indexOf(e.title)!==-1);if(!i)throw new Error("Widget is not contained in the dock panel.");i.currentTitle=e.title}activateWidget(e){this.selectWidget(e),e.activate()}saveLayout(){return this.layout.saveLayout()}restoreLayout(e){this._mode="multiple-document",this.layout.restoreLayout(e),(Iu.IS_EDGE||Iu.IS_IE)&&Oe.flush(),Oe.postMessage(this,Ui.LayoutModified)}addWidget(e,i={}){this._mode==="single-document"?this.layout.addWidget(e):this.layout.addWidget(e,i),Oe.postMessage(this,Ui.LayoutModified)}processMessage(e){e.type==="layout-modified"?this._layoutModified.emit(void 
0):super.processMessage(e)}handleEvent(e){switch(e.type){case"lm-dragenter":this._evtDragEnter(e);break;case"lm-dragleave":this._evtDragLeave(e);break;case"lm-dragover":this._evtDragOver(e);break;case"lm-drop":this._evtDrop(e);break;case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"keydown":this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("lm-dragenter",this),this.node.addEventListener("lm-dragleave",this),this.node.addEventListener("lm-dragover",this),this.node.addEventListener("lm-drop",this),this.node.addEventListener("pointerdown",this)}onAfterDetach(e){this.node.removeEventListener("lm-dragenter",this),this.node.removeEventListener("lm-dragleave",this),this.node.removeEventListener("lm-dragover",this),this.node.removeEventListener("lm-drop",this),this.node.removeEventListener("pointerdown",this),this._releaseMouse()}onChildAdded(e){Ui.isGeneratedTabBarProperty.get(e.child)||e.child.addClass("lm-DockPanel-widget")}onChildRemoved(e){Ui.isGeneratedTabBarProperty.get(e.child)||(e.child.removeClass("lm-DockPanel-widget"),Oe.postMessage(this,Ui.LayoutModified))}_evtDragEnter(e){e.mimeData.hasData("application/vnd.lumino.widget-factory")&&(e.preventDefault(),e.stopPropagation())}_evtDragLeave(e){e.preventDefault(),!(this._tabsConstrained&&e.source!==this)&&(e.stopPropagation(),this.overlay.hide(1))}_evtDragOver(e){e.preventDefault(),this._tabsConstrained&&e.source!==this||this._showOverlay(e.clientX,e.clientY)==="invalid"?e.dropAction="none":(e.stopPropagation(),e.dropAction=e.proposedAction)}_evtDrop(e){if(e.preventDefault(),this.overlay.hide(0),e.proposedAction==="none"){e.dropAction="none";return}let{clientX:i,clientY:n}=e,{zone:r,target:s}=Ui.findDropTarget(this,i,n,this._edges);if(this._tabsConstrained&&e.source!==this||r==="invalid"){e.dropAction="none";return}let a=e.mimeData.getData("application/vnd.lumino.widget-factory");if(typeof a!="function"){e.dropAction="none";return}let l=a();if(!(l instanceof pe)){e.dropAction="none";return}if(l.contains(this)){e.dropAction="none";return}let c=s?Ui.getDropRef(s.tabBar):null;switch(r){case"root-all":this.addWidget(l);break;case"root-top":this.addWidget(l,{mode:"split-top"});break;case"root-left":this.addWidget(l,{mode:"split-left"});break;case"root-right":this.addWidget(l,{mode:"split-right"});break;case"root-bottom":this.addWidget(l,{mode:"split-bottom"});break;case"widget-all":this.addWidget(l,{mode:"tab-after",ref:c});break;case"widget-top":this.addWidget(l,{mode:"split-top",ref:c});break;case"widget-left":this.addWidget(l,{mode:"split-left",ref:c});break;case"widget-right":this.addWidget(l,{mode:"split-right",ref:c});break;case"widget-bottom":this.addWidget(l,{mode:"split-bottom",ref:c});break;case"widget-tab":this.addWidget(l,{mode:"tab-after",ref:c});break;default:throw"unreachable"}e.dropAction=e.proposedAction,e.stopPropagation(),this.activateWidget(l)}_evtKeyDown(e){e.preventDefault(),e.stopPropagation(),e.keyCode===27&&(this._releaseMouse(),Oe.postMessage(this,Ui.LayoutModified))}_evtPointerDown(e){if(e.button!==0)return;let i=this.layout,n=e.target,r=uC(i.handles(),u=>u.contains(n));if(!r)return;e.preventDefault(),e.stopPropagation(),this._document.addEventListener("keydown",this,!0),this._document.addEventListener("pointerup",this,!0),this._document.addEventListener("pointermove",this,!0),this._document.addEventListener("contextmenu",this,!0);let 
s=r.getBoundingClientRect(),o=e.clientX-s.left,a=e.clientY-s.top,l=window.getComputedStyle(r),c=an.overrideCursor(l.cursor,this._document);this._pressData={handle:r,deltaX:o,deltaY:a,override:c}}_evtPointerMove(e){if(!this._pressData)return;e.preventDefault(),e.stopPropagation();let i=this.node.getBoundingClientRect(),n=e.clientX-i.left-this._pressData.deltaX,r=e.clientY-i.top-this._pressData.deltaY;this.layout.moveHandle(this._pressData.handle,n,r)}_evtPointerUp(e){e.button===0&&(e.preventDefault(),e.stopPropagation(),this._releaseMouse(),Oe.postMessage(this,Ui.LayoutModified))}_releaseMouse(){this._pressData&&(this._pressData.override.dispose(),this._pressData=null,this._document.removeEventListener("keydown",this,!0),this._document.removeEventListener("pointerup",this,!0),this._document.removeEventListener("pointermove",this,!0),this._document.removeEventListener("contextmenu",this,!0))}_showOverlay(e,i){let{zone:n,target:r}=Ui.findDropTarget(this,e,i,this._edges);if(n==="invalid")return this.overlay.hide(100),n;let s,o,a,l,c=ti.boxSizing(this.node),u=this.node.getBoundingClientRect();switch(n){case"root-all":s=c.paddingTop,o=c.paddingLeft,a=c.paddingRight,l=c.paddingBottom;break;case"root-top":s=c.paddingTop,o=c.paddingLeft,a=c.paddingRight,l=u.height*Ui.GOLDEN_RATIO;break;case"root-left":s=c.paddingTop,o=c.paddingLeft,a=u.width*Ui.GOLDEN_RATIO,l=c.paddingBottom;break;case"root-right":s=c.paddingTop,o=u.width*Ui.GOLDEN_RATIO,a=c.paddingRight,l=c.paddingBottom;break;case"root-bottom":s=u.height*Ui.GOLDEN_RATIO,o=c.paddingLeft,a=c.paddingRight,l=c.paddingBottom;break;case"widget-all":s=r.top,o=r.left,a=r.right,l=r.bottom;break;case"widget-top":s=r.top,o=r.left,a=r.right,l=r.bottom+r.height/2;break;case"widget-left":s=r.top,o=r.left,a=r.right+r.width/2,l=r.bottom;break;case"widget-right":s=r.top,o=r.left+r.width/2,a=r.right,l=r.bottom;break;case"widget-bottom":s=r.top+r.height/2,o=r.left,a=r.right,l=r.bottom;break;case"widget-tab":{let d=r.tabBar.node.getBoundingClientRect().height;s=r.top,o=r.left,a=r.right,l=r.bottom+r.height-d;break}default:throw"unreachable"}return this.overlay.show({top:s,left:o,right:a,bottom:l}),n}_createTabBar(){let e=this._renderer.createTabBar(this._document);return Ui.isGeneratedTabBarProperty.set(e,!0),this._mode==="single-document"&&e.hide(),e.tabsMovable=this._tabsMovable,e.allowDeselect=!1,e.addButtonEnabled=this._addButtonEnabled,e.removeBehavior="select-previous-tab",e.insertBehavior="select-tab-if-needed",e.tabMoved.connect(this._onTabMoved,this),e.currentChanged.connect(this._onCurrentChanged,this),e.tabCloseRequested.connect(this._onTabCloseRequested,this),e.tabDetachRequested.connect(this._onTabDetachRequested,this),e.tabActivateRequested.connect(this._onTabActivateRequested,this),e.addRequested.connect(this._onTabAddRequested,this),e}_createHandle(){return this._renderer.createHandle()}_onTabMoved(){Oe.postMessage(this,Ui.LayoutModified)}_onCurrentChanged(e,i){let{previousTitle:n,currentTitle:r}=i;n&&n.owner.hide(),r&&r.owner.show(),(Iu.IS_EDGE||Iu.IS_IE)&&Oe.flush(),Oe.postMessage(this,Ui.LayoutModified)}_onTabAddRequested(e){this._addRequested.emit(e)}_onTabActivateRequested(e,i){i.title.owner.activate()}_onTabCloseRequested(e,i){i.title.owner.close()}_onTabDetachRequested(e,i){if(this._drag)return;e.releaseMouse();let{title:n,tab:r,clientX:s,clientY:o,offset:a}=i,l=new ul.MimeData,c=()=>n.owner;l.setData("application/vnd.lumino.widget-factory",c);let u=r.cloneNode(!0);a&&(u.style.top=`-${a.y}px`,u.style.left=`-${a.x}px`),this._drag=new 
O=!1;for(;++yM+x&&(_=M+x-Z),!T&&y+X>C+w&&(y>C+w?y=C+w-X:y=y-X),B.transform=`translate(${Math.max(0,_)}px, ${Math.max(0,y)}px`,B.opacity="1"}t.openRootMenu=d;function f(p,_){let y=n(),S=y.pageXOffset,T=y.pageYOffset,O=y.clientWidth,A=y.clientHeight;Ae.sendMessage(p,me.Msg.UpdateRequest);let b=A,M=p.node,C=M.style;C.opacity="0",C.maxHeight=`${b}px`,me.attach(p,document.body);let{width:x,height:w}=M.getBoundingClientRect(),E=ti.boxSizing(p.node),N=_.getBoundingClientRect(),B=N.right-t.SUBMENU_OVERLAP;B+x>S+O&&(B=N.left+t.SUBMENU_OVERLAP-x);let Z=N.top-E.borderTop-E.paddingTop;Z+w>T+A&&(Z=N.bottom+E.borderBottom+E.paddingBottom-w),C.transform=`translate(${Math.max(0,B)}px, ${Math.max(0,Z)}px`,C.opacity="1"}t.openSubmenu=f;function h(p,_,y){let S=-1,T=-1,O=!1,A=_.toUpperCase();for(let b=0,M=p.length;b=0&&ES.command===_&&sl.JSONExt.deepEqual(S.args,y))||null}return null}}})(gr||(gr={}));(function(t){function e(o,a){let l=n(o.selector),c=o.rank!==void 0?o.rank:1/0;return{...o,selector:l,rank:c,id:a}}t.createItem=e;function i(o,a,l,c){let u=a.target;if(!u)return null;let d=a.currentTarget;if(!d||!d.contains(u)&&(u=document.elementFromPoint(a.clientX,a.clientY),!u||!d.contains(u)))return null;let f=[],h=o.slice();for(;u!==null;){let m=[];for(let p=0,_=h.length;p<_;++p){let y=h[p];y&&wu.matches(u,y.selector)&&(m.push(y),h[p]=null)}if(m.length!==0&&(l&&m.sort(c?s:r),f.push(...m)),u===d)break;u=u.parentElement}return l||f.sort(c?s:r),f}t.matchItems=i;function n(o){if(o.indexOf(",")!==-1)throw new Error(`Selector cannot contain commas: ${o}`);if(!wu.isValid(o))throw new Error(`Invalid selector: ${o}`);return o}function r(o,a){let l=o.rank,c=a.rank;return l!==c?l=this._titles.length)&&(e=-1),this._currentIndex===e)return;let i=this._currentIndex,n=this._titles[i]||null,r=e,s=this._titles[r]||null;this._currentIndex=r,this._previousTitle=n,this.update(),this._currentChanged.emit({previousIndex:i,previousTitle:n,currentIndex:r,currentTitle:s})}get name(){return this._name}set name(e){this._name=e,e?this.contentNode.setAttribute("aria-label",e):this.contentNode.removeAttribute("aria-label")}get orientation(){return this._orientation}set orientation(e){this._orientation!==e&&(this._releaseMouse(),this._orientation=e,this.dataset.orientation=e,this.contentNode.setAttribute("aria-orientation",e))}get addButtonEnabled(){return this._addButtonEnabled}set addButtonEnabled(e){this._addButtonEnabled!==e&&(this._addButtonEnabled=e,e?this.addButtonNode.classList.remove("lm-mod-hidden"):this.addButtonNode.classList.add("lm-mod-hidden"))}get titles(){return this._titles}get contentNode(){return this.node.getElementsByClassName("lm-TabBar-content")[0]}get addButtonNode(){return this.node.getElementsByClassName("lm-TabBar-addButton")[0]}addTab(e){return this.insertTab(this._titles.length,e)}insertTab(e,i){this._releaseMouse();let n=qs.asTitle(i),r=this._titles.indexOf(n),s=Math.max(0,Math.min(e,this._titles.length));return r===-1?(Be.insert(this._titles,s,n),n.changed.connect(this._onTitleChanged,this),this.update(),this._adjustCurrentForInsert(s,n),n):(s===this._titles.length&&s--,r===s||(Be.move(this._titles,r,s),this.update(),this._adjustCurrentForMove(r,s)),n)}removeTab(e){this.removeTabAt(this._titles.indexOf(e))}removeTabAt(e){this._releaseMouse();let i=Be.removeAt(this._titles,e);i&&(i.changed.disconnect(this._onTitleChanged,this),i===this._previousTitle&&(this._previousTitle=null),this.update(),this._adjustCurrentForRemove(e,i))}clearTabs(){if(this._titles.length===0)return;this._releaseMouse();for(let n of 
this._titles)n.changed.disconnect(this._onTitleChanged,this);let e=this.currentIndex,i=this.currentTitle;this._currentIndex=-1,this._previousTitle=null,this._titles.length=0,this.update(),e!==-1&&this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}releaseMouse(){this._releaseMouse()}handleEvent(e){switch(e.type){case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"dblclick":this._evtDblClick(e);break;case"keydown":e.eventPhase===Event.CAPTURING_PHASE?this._evtKeyDownCapturing(e):this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("pointerdown",this),this.node.addEventListener("dblclick",this),this.node.addEventListener("keydown",this)}onAfterDetach(e){this.node.removeEventListener("pointerdown",this),this.node.removeEventListener("dblclick",this),this.node.removeEventListener("keydown",this),this._releaseMouse()}onUpdateRequest(e){var i;let n=this._titles,r=this.renderer,s=this.currentTitle,o=new Array(n.length),a=(i=this._getCurrentTabindex())!==null&&i!==void 0?i:this._currentIndex>-1?this._currentIndex:0;for(let l=0,c=n.length;lti.hitTest(o,e.clientX,e.clientY));if(n===-1)return;let r=this.titles[n],s=i[n].querySelector(".lm-TabBar-tabLabel");if(s&&s.contains(e.target)){let o=r.label||"",a=s.innerHTML;s.innerHTML="";let l=document.createElement("input");l.classList.add("lm-TabBar-tabInput"),l.value=o,s.appendChild(l);let c=()=>{l.removeEventListener("blur",c),s.innerHTML=a,this.node.addEventListener("keydown",this)};l.addEventListener("dblclick",u=>u.stopPropagation()),l.addEventListener("blur",c),l.addEventListener("keydown",u=>{u.key==="Enter"?(l.value!==""&&(r.label=r.caption=l.value),c()):u.key==="Escape"&&c()}),this.node.removeEventListener("keydown",this),l.select(),l.focus(),s.children.length>0&&s.children[0].focus()}}_evtKeyDownCapturing(e){e.eventPhase===Event.CAPTURING_PHASE&&(e.preventDefault(),e.stopPropagation(),e.key==="Escape"&&this._releaseMouse())}_evtKeyDown(e){var i,n,r;if(!(e.key==="Tab"||e.eventPhase===Event.CAPTURING_PHASE)){if(e.key==="Enter"||e.key==="Spacebar"||e.key===" "){let s=document.activeElement;if(this.addButtonEnabled&&this.addButtonNode.contains(s))e.preventDefault(),e.stopPropagation(),this._addRequested.emit();else{let o=Be.findFirstIndex(this.contentNode.children,a=>a.contains(s));o>=0&&(e.preventDefault(),e.stopPropagation(),this.currentIndex=o)}}else if(WV.includes(e.key)){let s=[...this.contentNode.children];if(this.addButtonEnabled&&s.push(this.addButtonNode),s.length<=1)return;e.preventDefault(),e.stopPropagation();let o=s.indexOf(document.activeElement);o===-1&&(o=this._currentIndex);let a;e.key==="ArrowRight"&&this._orientation==="horizontal"||e.key==="ArrowDown"&&this._orientation==="vertical"?a=(i=s[o+1])!==null&&i!==void 0?i:s[0]:e.key==="ArrowLeft"&&this._orientation==="horizontal"||e.key==="ArrowUp"&&this._orientation==="vertical"?a=(n=s[o-1])!==null&&n!==void 0?n:s[s.length-1]:e.key==="Home"?a=s[0]:e.key==="End"&&(a=s[s.length-1]),a&&((r=s[o])===null||r===void 0||r.setAttribute("tabindex","-1"),a?.setAttribute("tabindex","0"),a.focus())}}}_evtPointerDown(e){if(e.button!==0&&e.button!==1||this._dragData||e.target.classList.contains("lm-TabBar-tabInput"))return;let 
i=this.addButtonEnabled&&this.addButtonNode.contains(e.target),n=this.contentNode.children,r=Be.findFirstIndex(n,o=>ti.hitTest(o,e.clientX,e.clientY));if(r===-1&&!i||(e.preventDefault(),e.stopPropagation(),this._dragData={tab:n[r],index:r,pressX:e.clientX,pressY:e.clientY,tabPos:-1,tabSize:-1,tabPressPos:-1,targetIndex:-1,tabLayout:null,contentRect:null,override:null,dragActive:!1,dragAborted:!1,detachRequested:!1},this.document.addEventListener("pointerup",this,!0),e.button===1||i))return;let s=n[r].querySelector(this.renderer.closeIconSelector);s&&s.contains(e.target)||(this.tabsMovable&&(this.document.addEventListener("pointermove",this,!0),this.document.addEventListener("keydown",this,!0),this.document.addEventListener("contextmenu",this,!0)),this.allowDeselect&&this.currentIndex===r?this.currentIndex=-1:this.currentIndex=r,this.currentIndex!==-1&&this._tabActivateRequested.emit({index:this.currentIndex,title:this.currentTitle}))}_evtPointerMove(e){let i=this._dragData;if(!i)return;e.preventDefault(),e.stopPropagation();let n=this.contentNode.children;if(!(!i.dragActive&&!qs.dragExceeded(i,e))){if(!i.dragActive){let r=i.tab.getBoundingClientRect();this._orientation==="horizontal"?(i.tabPos=i.tab.offsetLeft,i.tabSize=r.width,i.tabPressPos=i.pressX-r.left):(i.tabPos=i.tab.offsetTop,i.tabSize=r.height,i.tabPressPos=i.pressY-r.top),i.tabPressOffset={x:i.pressX-r.left,y:i.pressY-r.top},i.tabLayout=qs.snapTabLayout(n,this._orientation),i.contentRect=this.contentNode.getBoundingClientRect(),i.override=an.overrideCursor("default"),i.tab.classList.add("lm-mod-dragging"),this.addClass("lm-mod-dragging"),i.dragActive=!0}if(!i.detachRequested&&qs.detachExceeded(i,e)){i.detachRequested=!0;let r=i.index,s=e.clientX,o=e.clientY,a=n[r],l=this._titles[r];if(this._tabDetachRequested.emit({index:r,title:l,tab:a,clientX:s,clientY:o,offset:i.tabPressOffset}),i.dragAborted)return}qs.layoutTabs(n,i,e,this._orientation)}}_evtPointerUp(e){if(e.button!==0&&e.button!==1)return;let i=this._dragData;if(!i)return;if(e.preventDefault(),e.stopPropagation(),this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),!i.dragActive){if(this._dragData=null,this.addButtonEnabled&&this.addButtonNode.contains(e.target)){this._addRequested.emit(void 0);return}let s=this.contentNode.children,o=Be.findFirstIndex(s,c=>ti.hitTest(c,e.clientX,e.clientY));if(o!==i.index)return;let a=this._titles[o];if(!a.closable)return;if(e.button===1){this._tabCloseRequested.emit({index:o,title:a});return}let l=s[o].querySelector(this.renderer.closeIconSelector);if(l&&l.contains(e.target)){this._tabCloseRequested.emit({index:o,title:a});return}return}if(e.button!==0)return;qs.finalizeTabPosition(i,this._orientation),i.tab.classList.remove("lm-mod-dragging");let n=qs.parseTransitionDuration(i.tab);setTimeout(()=>{if(i.dragAborted)return;this._dragData=null,qs.resetTabPositions(this.contentNode.children,this._orientation),i.override.dispose(),this.removeClass("lm-mod-dragging");let r=i.index,s=i.targetIndex;s===-1||r===s||(Be.move(this._titles,r,s),this._adjustCurrentForMove(r,s),this._tabMoved.emit({fromIndex:r,toIndex:s,title:this._titles[s]}),Ae.sendMessage(this,me.Msg.UpdateRequest))},n)}_releaseMouse(){let 
e=this._dragData;e&&(this._dragData=null,this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),e.dragAborted=!0,e.dragActive&&(qs.resetTabPositions(this.contentNode.children,this._orientation),e.override.dispose(),e.tab.classList.remove("lm-mod-dragging"),this.removeClass("lm-mod-dragging")))}_adjustCurrentForInsert(e,i){let n=this.currentTitle,r=this._currentIndex,s=this.insertBehavior;if(s==="select-tab"||s==="select-tab-if-needed"&&r===-1){this._currentIndex=e,this._previousTitle=n,this._currentChanged.emit({previousIndex:r,previousTitle:n,currentIndex:e,currentTitle:i});return}r>=e&&this._currentIndex++}_adjustCurrentForMove(e,i){this._currentIndex===e?this._currentIndex=i:this._currentIndex=i?this._currentIndex++:this._currentIndex>e&&this._currentIndex<=i&&this._currentIndex--}_adjustCurrentForRemove(e,i){let n=this._currentIndex,r=this.removeBehavior;if(n!==e){n>e&&this._currentIndex--;return}if(this._titles.length===0){this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null});return}if(r==="select-tab-after"){this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-tab-before"){this._currentIndex=Math.max(0,e-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-previous-tab"){this._previousTitle?(this._currentIndex=this._titles.indexOf(this._previousTitle),this._previousTitle=null):this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}_onTitleChanged(e){this.update()}};(function(t){class e{constructor(){this.closeIconSelector=".lm-TabBar-tabCloseIcon",this._tabID=0,this._tabKeys=new WeakMap,this._uuid=++e._nInstance}renderTab(n){let r=n.title.caption,s=this.createTabKey(n),o=s,a=this.createTabStyle(n),l=this.createTabClass(n),c=this.createTabDataset(n),u=this.createTabARIA(n);return n.title.closable?Ce.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n),this.renderCloseIcon(n)):Ce.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n))}renderIcon(n){let{title:r}=n,s=this.createIconClass(n);return Ce.div({className:s},r.icon,r.iconLabel)}renderLabel(n){return Ce.div({className:"lm-TabBar-tabLabel"},n.title.label)}renderCloseIcon(n){return Ce.div({className:"lm-TabBar-tabCloseIcon"})}createTabKey(n){let r=this._tabKeys.get(n.title);return r===void 0&&(r=`tab-key-${this._uuid}-${this._tabID++}`,this._tabKeys.set(n.title,r)),r}createTabStyle(n){return{zIndex:`${n.zIndex}`}}createTabClass(n){let r="lm-TabBar-tab";return n.title.className&&(r+=` ${n.title.className}`),n.title.closable&&(r+=" lm-mod-closable"),n.current&&(r+=" lm-mod-current"),r}createTabDataset(n){return n.title.dataset}createTabARIA(n){var r;return{role:"tab","aria-selected":n.current.toString(),tabindex:`${(r=n.tabIndex)!==null&&r!==void 0?r:"-1"}`}}createIconClass(n){let r="lm-TabBar-tabIcon",s=n.title.iconClass;return s?`${r} 
${s}`:r}}e._nInstance=0,t.Renderer=e,t.defaultRenderer=new e,t.addButtonSelector=".lm-TabBar-addButton"})(gc||(gc={}));(function(t){t.DRAG_THRESHOLD=5,t.DETACH_THRESHOLD=20;function e(){let u=document.createElement("div"),d=document.createElement("ul");d.setAttribute("role","tablist"),d.className="lm-TabBar-content",u.appendChild(d);let f=document.createElement("div");return f.className="lm-TabBar-addButton lm-mod-hidden",f.setAttribute("tabindex","-1"),f.setAttribute("role","button"),u.appendChild(f),u}t.createNode=e;function i(u){return u instanceof eg?u:new eg(u)}t.asTitle=i;function n(u){let d=window.getComputedStyle(u);return 1e3*(parseFloat(d.transitionDuration)||0)}t.parseTransitionDuration=n;function r(u,d){let f=new Array(u.length);for(let h=0,m=u.length;h=t.DRAG_THRESHOLD||h>=t.DRAG_THRESHOLD}t.dragExceeded=s;function o(u,d){let f=u.contentRect;return d.clientX=f.right+t.DETACH_THRESHOLD||d.clientY=f.bottom+t.DETACH_THRESHOLD}t.detachExceeded=o;function a(u,d,f,h){let m,p,_,y;h==="horizontal"?(m=d.pressX,p=f.clientX-d.contentRect.left,_=f.clientX,y=d.contentRect.width):(m=d.pressY,p=f.clientY-d.contentRect.top,_=f.clientY,y=d.contentRect.height);let S=d.index,T=p-d.tabPressPos,O=T+d.tabSize;for(let A=0,b=u.length;A>1);if(Ad.index&&O>x)M=`${-d.tabSize-C.margin}px`,S=Math.max(S,A);else if(A===d.index){let w=_-m,E=y-(d.tabPos+d.tabSize);M=`${Math.max(-d.tabPos,Math.min(w,E))}px`}else M="";h==="horizontal"?u[A].style.left=M:u[A].style.top=M}d.targetIndex=S}t.layoutTabs=a;function l(u,d){let f;d==="horizontal"?f=u.contentRect.width:f=u.contentRect.height;let h;if(u.targetIndex===u.index)h=0;else if(u.targetIndex>u.index){let _=u.tabLayout[u.targetIndex];h=_.pos+_.size-u.tabSize-u.tabPos}else h=u.tabLayout[u.targetIndex].pos-u.tabPos;let m=f-(u.tabPos+u.tabSize),p=Math.max(-u.tabPos,Math.min(h,m));d==="horizontal"?u.tab.style.left=`${p}px`:u.tab.style.top=`${p}px`}t.finalizeTabPosition=l;function c(u,d){for(let f of u)d==="horizontal"?f.style.left="":f.style.top=""}t.resetTabPositions=c})(qs||(qs={}));sC=class extends sa{constructor(e){super(),this._spacing=4,this._dirty=!1,this._root=null,this._box=null,this._items=new Map,this.renderer=e.renderer,e.spacing!==void 0&&(this._spacing=ig.clampDimension(e.spacing)),this._document=e.document||document,this._hiddenMode=e.hiddenMode!==void 0?e.hiddenMode:me.HiddenMode.Display}dispose(){let e=this[Symbol.iterator]();this._items.forEach(i=>{i.dispose()}),this._box=null,this._root=null,this._items.clear();for(let i of e)i.dispose();super.dispose()}get hiddenMode(){return this._hiddenMode}set hiddenMode(e){if(this._hiddenMode!==e){this._hiddenMode=e;for(let i of this.tabBars())if(i.titles.length>1)for(let n of i.titles)n.owner.hiddenMode=this._hiddenMode}}get spacing(){return this._spacing}set spacing(e){e=ig.clampDimension(e),this._spacing!==e&&(this._spacing=e,this.parent&&this.parent.fit())}get isEmpty(){return this._root===null}[Symbol.iterator](){return this._root?this._root.iterAllWidgets():bf()}widgets(){return this._root?this._root.iterUserWidgets():bf()}selectedWidgets(){return this._root?this._root.iterSelectedWidgets():bf()}tabBars(){return this._root?this._root.iterTabBars():bf()}handles(){return this._root?this._root.iterHandles():bf()}moveHandle(e,i,n){let r=e.classList.contains("lm-mod-hidden");if(!this._root||r)return;let s=this._root.findSplitNode(e);if(!s)return;let 
o;s.node.orientation==="horizontal"?o=i-e.offsetLeft:o=n-e.offsetTop,o!==0&&(s.node.holdSizes(),ss.adjust(s.node.sizers,s.index,o),this.parent&&this.parent.update())}saveLayout(){return this._root?(this._root.holdAllSizes(),{main:this._root.createConfig()}):{main:null}}restoreLayout(e){let i=new Set,n;e.main?n=en.normalizeAreaConfig(e.main,i):n=null;let r=this.widgets(),s=this.tabBars(),o=this.handles();this._root=null;for(let a of r)i.has(a)||(a.parent=null);for(let a of s)a.dispose();for(let a of o)a.parentNode&&a.parentNode.removeChild(a);for(let a of i)a.parent=this.parent;n?this._root=en.realizeAreaConfig(n,{createTabBar:a=>this._createTabBar(),createHandle:()=>this._createHandle()},this._document):this._root=null,this.parent&&(i.forEach(a=>{this.attachWidget(a)}),this.parent.fit())}addWidget(e,i={}){let n=i.ref||null,r=i.mode||"tab-after",s=null;if(this._root&&n&&(s=this._root.findTabNode(n)),n&&!s)throw new Error("Reference widget is not in the layout.");switch(e.parent=this.parent,r){case"tab-after":this._insertTab(e,n,s,!0);break;case"tab-before":this._insertTab(e,n,s,!1);break;case"split-top":this._insertSplit(e,n,s,"vertical",!1);break;case"split-left":this._insertSplit(e,n,s,"horizontal",!1);break;case"split-right":this._insertSplit(e,n,s,"horizontal",!0);break;case"split-bottom":this._insertSplit(e,n,s,"vertical",!0);break;case"merge-top":this._insertSplit(e,n,s,"vertical",!1,!0);break;case"merge-left":this._insertSplit(e,n,s,"horizontal",!1,!0);break;case"merge-right":this._insertSplit(e,n,s,"horizontal",!0,!0);break;case"merge-bottom":this._insertSplit(e,n,s,"vertical",!0,!0);break}this.parent&&(this.attachWidget(e),this.parent.fit())}removeWidget(e){this._removeWidget(e),this.parent&&(this.detachWidget(e),this.parent.fit())}hitTestTabAreas(e,i){if(!this._root||!this.parent||!this.parent.isVisible)return null;this._box||(this._box=ti.boxSizing(this.parent.node));let n=this.parent.node.getBoundingClientRect(),r=e-n.left-this._box.borderLeft,s=i-n.top-this._box.borderTop,o=this._root.hitTestTabNodes(r,s);if(!o)return null;let{tabBar:a,top:l,left:c,width:u,height:d}=o,f=this._box.borderLeft+this._box.borderRight,h=this._box.borderTop+this._box.borderBottom,m=n.width-f-(c+u),p=n.height-h-(l+d);return{tabBar:a,x:r,y:s,top:l,left:c,right:m,bottom:p,width:u,height:d}}init(){super.init();for(let e of this)this.attachWidget(e);for(let e of this.handles())this.parent.node.appendChild(e);this.parent.fit()}attachWidget(e){this.parent.node!==e.node.parentNode&&(this._items.set(e,new Cu(e)),this.parent.isAttached&&Ae.sendMessage(e,me.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&Ae.sendMessage(e,me.Msg.AfterAttach))}detachWidget(e){if(this.parent.node!==e.node.parentNode)return;this.parent.isAttached&&Ae.sendMessage(e,me.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&Ae.sendMessage(e,me.Msg.AfterDetach);let i=this._items.get(e);i&&(this._items.delete(e),i.dispose())}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_removeWidget(e){if(!this._root)return;let 
i=this._root.findTabNode(e);if(!i)return;if(en.removeAria(e),i.tabBar.titles.length>1){if(i.tabBar.removeTab(e.title),this._hiddenMode===me.HiddenMode.Scale&&i.tabBar.titles.length==1){let f=i.tabBar.titles[0].owner;f.hiddenMode=me.HiddenMode.Display}return}if(i.tabBar.dispose(),this._root===i){this._root=null;return}this._root.holdAllSizes();let n=i.parent;i.parent=null;let r=Be.removeFirstOf(n.children,i),s=Be.removeAt(n.handles,r);if(Be.removeAt(n.sizers,r),s.parentNode&&s.parentNode.removeChild(s),n.children.length>1){n.syncHandles();return}let o=n.parent;n.parent=null;let a=n.children[0],l=n.handles[0];if(n.children.length=0,n.handles.length=0,n.sizers.length=0,l.parentNode&&l.parentNode.removeChild(l),this._root===n){a.parent=null,this._root=a;return}let c=o,u=c.children.indexOf(n);if(a instanceof en.TabLayoutNode){a.parent=c,c.children[u]=a;return}let d=Be.removeAt(c.handles,u);Be.removeAt(c.children,u),Be.removeAt(c.sizers,u),d.parentNode&&d.parentNode.removeChild(d);for(let f=0,h=a.children.length;f=this._left+this._width||m=this._top+this._height?null:this}createConfig(){let h=this.tabBar.titles.map(p=>p.owner),m=this.tabBar.currentIndex;return{type:"tab-area",widgets:h,currentIndex:m}}holdAllSizes(){}fit(h,m){let p=0,_=0,y=1/0,S=1/0,T=m.get(this.tabBar),O=this.tabBar.currentTitle,A=O?m.get(O.owner):void 0,[b,M]=this.sizers;return T&&T.fit(),A&&A.fit(),T&&!T.isHidden?(p=Math.max(p,T.minWidth),_+=T.minHeight,b.minSize=T.minHeight,b.maxSize=T.maxHeight):(b.minSize=0,b.maxSize=0),A&&!A.isHidden?(p=Math.max(p,A.minWidth),_+=A.minHeight,M.minSize=A.minHeight,M.maxSize=1/0):(M.minSize=0,M.maxSize=1/0),{minWidth:p,minHeight:_,maxWidth:y,maxHeight:S}}update(h,m,p,_,y,S){this._top=m,this._left=h,this._width=p,this._height=_;let T=S.get(this.tabBar),O=this.tabBar.currentTitle,A=O?S.get(O.owner):void 0;if(ss.calc(this.sizers,_),T&&!T.isHidden){let b=this.sizers[0].size;T.update(h,m,p,b),m+=b}if(A&&!A.isHidden){let b=this.sizers[1].size;A.update(h,m,p,b)}}}t.TabLayoutNode=r;class s{constructor(h){this.parent=null,this.normalized=!1,this.children=[],this.sizers=[],this.handles=[],this.orientation=h}*iterAllWidgets(){for(let h of this.children)yield*h.iterAllWidgets()}*iterUserWidgets(){for(let h of this.children)yield*h.iterUserWidgets()}*iterSelectedWidgets(){for(let h of this.children)yield*h.iterSelectedWidgets()}*iterTabBars(){for(let h of this.children)yield*h.iterTabBars()}*iterHandles(){yield*this.handles;for(let h of this.children)yield*h.iterHandles()}findTabNode(h){for(let m=0,p=this.children.length;m_.createConfig());return{type:"split-area",orientation:h,children:p,sizes:m}}syncHandles(){this.handles.forEach((h,m)=>{h.setAttribute("data-orientation",this.orientation),m===this.handles.length-1?h.classList.add("lm-mod-hidden"):h.classList.remove("lm-mod-hidden")})}holdSizes(){for(let h of this.sizers)h.sizeHint=h.size}holdAllSizes(){for(let h of this.children)h.holdAllSizes();this.holdSizes()}normalizeSizes(){let h=this.sizers.length;if(h===0)return;this.holdSizes();let m=this.sizers.reduce((p,_)=>p+_.sizeHint,0);if(m===0)for(let p of this.sizers)p.size=p.sizeHint=1/h;else for(let p of this.sizers)p.size=p.sizeHint/=m;this.normalized=!0}createNormalizedSizes(){let h=this.sizers.length;if(h===0)return[];let m=this.sizers.map(_=>_.size),p=m.reduce((_,y)=>_+y,0);if(p===0)for(let _=m.length-1;_>-1;_--)m[_]=1/h;else for(let _=m.length-1;_>-1;_--)m[_]/=p;return m}fit(h,m){let p=this.orientation==="horizontal",_=Math.max(0,this.children.length-1)*h,y=p?_:0,S=p?0:_,T=1/0,O=1/0;for(let 
A=0,b=this.children.length;A=m.length)&&(p=0),{type:"tab-area",widgets:m,currentIndex:p}}function c(f,h){let m=f.orientation,p=[],_=[];for(let y=0,S=f.children.length;y{let S=n(_,h,m),T=e(f.sizes[y]),O=h.createHandle();p.children.push(S),p.handles.push(O),p.sizers.push(T),S.parent=p}),p.syncHandles(),p.normalizeSizes(),p}})(en||(en={}));Mu=class extends me{constructor(e={}){super(),this._drag=null,this._tabsMovable=!0,this._tabsConstrained=!1,this._addButtonEnabled=!1,this._pressData=null,this._layoutModified=new Te(this),this._addRequested=new Te(this),this.addClass("lm-DockPanel"),this._document=e.document||document,this._mode=e.mode||"multiple-document",this._renderer=e.renderer||Mu.defaultRenderer,this._edges=e.edges||qi.DEFAULT_EDGES,e.tabsMovable!==void 0&&(this._tabsMovable=e.tabsMovable),e.tabsConstrained!==void 0&&(this._tabsConstrained=e.tabsConstrained),e.addButtonEnabled!==void 0&&(this._addButtonEnabled=e.addButtonEnabled),this.dataset.mode=this._mode;let i={createTabBar:()=>this._createTabBar(),createHandle:()=>this._createHandle()};this.layout=new sC({document:this._document,renderer:i,spacing:e.spacing,hiddenMode:e.hiddenMode}),this.overlay=e.overlay||new Mu.Overlay,this.node.appendChild(this.overlay.node)}dispose(){this._releaseMouse(),this.overlay.hide(0),this._drag&&this._drag.dispose(),super.dispose()}get hiddenMode(){return this.layout.hiddenMode}set hiddenMode(e){this.layout.hiddenMode=e}get layoutModified(){return this._layoutModified}get addRequested(){return this._addRequested}get renderer(){return this.layout.renderer}get spacing(){return this.layout.spacing}set spacing(e){this.layout.spacing=e}get mode(){return this._mode}set mode(e){if(this._mode===e)return;this._mode=e,this.dataset.mode=e;let i=this.layout;switch(e){case"multiple-document":for(let n of i.tabBars())n.show();break;case"single-document":i.restoreLayout(qi.createSingleDocumentConfig(this));break;default:throw"unreachable"}Ae.postMessage(this,qi.LayoutModified)}get tabsMovable(){return this._tabsMovable}set tabsMovable(e){this._tabsMovable=e;for(let i of this.tabBars())i.tabsMovable=e}get tabsConstrained(){return this._tabsConstrained}set tabsConstrained(e){this._tabsConstrained=e}get addButtonEnabled(){return this._addButtonEnabled}set addButtonEnabled(e){this._addButtonEnabled=e;for(let i of this.tabBars())i.addButtonEnabled=e}get isEmpty(){return this.layout.isEmpty}*widgets(){yield*this.layout.widgets()}*selectedWidgets(){yield*this.layout.selectedWidgets()}*tabBars(){yield*this.layout.tabBars()}*handles(){yield*this.layout.handles()}selectWidget(e){let i=ZS(this.tabBars(),n=>n.titles.indexOf(e.title)!==-1);if(!i)throw new Error("Widget is not contained in the dock panel.");i.currentTitle=e.title}activateWidget(e){this.selectWidget(e),e.activate()}saveLayout(){return this.layout.saveLayout()}restoreLayout(e){this._mode="multiple-document",this.layout.restoreLayout(e),(yu.IS_EDGE||yu.IS_IE)&&Ae.flush(),Ae.postMessage(this,qi.LayoutModified)}addWidget(e,i={}){this._mode==="single-document"?this.layout.addWidget(e):this.layout.addWidget(e,i),Ae.postMessage(this,qi.LayoutModified)}processMessage(e){e.type==="layout-modified"?this._layoutModified.emit(void 
0):super.processMessage(e)}handleEvent(e){switch(e.type){case"lm-dragenter":this._evtDragEnter(e);break;case"lm-dragleave":this._evtDragLeave(e);break;case"lm-dragover":this._evtDragOver(e);break;case"lm-drop":this._evtDrop(e);break;case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"keydown":this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("lm-dragenter",this),this.node.addEventListener("lm-dragleave",this),this.node.addEventListener("lm-dragover",this),this.node.addEventListener("lm-drop",this),this.node.addEventListener("pointerdown",this)}onAfterDetach(e){this.node.removeEventListener("lm-dragenter",this),this.node.removeEventListener("lm-dragleave",this),this.node.removeEventListener("lm-dragover",this),this.node.removeEventListener("lm-drop",this),this.node.removeEventListener("pointerdown",this),this._releaseMouse()}onChildAdded(e){qi.isGeneratedTabBarProperty.get(e.child)||e.child.addClass("lm-DockPanel-widget")}onChildRemoved(e){qi.isGeneratedTabBarProperty.get(e.child)||(e.child.removeClass("lm-DockPanel-widget"),Ae.postMessage(this,qi.LayoutModified))}_evtDragEnter(e){e.mimeData.hasData("application/vnd.lumino.widget-factory")&&(e.preventDefault(),e.stopPropagation())}_evtDragLeave(e){e.preventDefault(),!(this._tabsConstrained&&e.source!==this)&&(e.stopPropagation(),this.overlay.hide(1))}_evtDragOver(e){e.preventDefault(),this._tabsConstrained&&e.source!==this||this._showOverlay(e.clientX,e.clientY)==="invalid"?e.dropAction="none":(e.stopPropagation(),e.dropAction=e.proposedAction)}_evtDrop(e){if(e.preventDefault(),this.overlay.hide(0),e.proposedAction==="none"){e.dropAction="none";return}let{clientX:i,clientY:n}=e,{zone:r,target:s}=qi.findDropTarget(this,i,n,this._edges);if(this._tabsConstrained&&e.source!==this||r==="invalid"){e.dropAction="none";return}let a=e.mimeData.getData("application/vnd.lumino.widget-factory");if(typeof a!="function"){e.dropAction="none";return}let l=a();if(!(l instanceof me)){e.dropAction="none";return}if(l.contains(this)){e.dropAction="none";return}let c=s?qi.getDropRef(s.tabBar):null;switch(r){case"root-all":this.addWidget(l);break;case"root-top":this.addWidget(l,{mode:"split-top"});break;case"root-left":this.addWidget(l,{mode:"split-left"});break;case"root-right":this.addWidget(l,{mode:"split-right"});break;case"root-bottom":this.addWidget(l,{mode:"split-bottom"});break;case"widget-all":this.addWidget(l,{mode:"tab-after",ref:c});break;case"widget-top":this.addWidget(l,{mode:"split-top",ref:c});break;case"widget-left":this.addWidget(l,{mode:"split-left",ref:c});break;case"widget-right":this.addWidget(l,{mode:"split-right",ref:c});break;case"widget-bottom":this.addWidget(l,{mode:"split-bottom",ref:c});break;case"widget-tab":this.addWidget(l,{mode:"tab-after",ref:c});break;default:throw"unreachable"}e.dropAction=e.proposedAction,e.stopPropagation(),this.activateWidget(l)}_evtKeyDown(e){e.preventDefault(),e.stopPropagation(),e.keyCode===27&&(this._releaseMouse(),Ae.postMessage(this,qi.LayoutModified))}_evtPointerDown(e){if(e.button!==0)return;let i=this.layout,n=e.target,r=ZS(i.handles(),u=>u.contains(n));if(!r)return;e.preventDefault(),e.stopPropagation(),this._document.addEventListener("keydown",this,!0),this._document.addEventListener("pointerup",this,!0),this._document.addEventListener("pointermove",this,!0),this._document.addEventListener("contextmenu",this,!0);let 
s=r.getBoundingClientRect(),o=e.clientX-s.left,a=e.clientY-s.top,l=window.getComputedStyle(r),c=an.overrideCursor(l.cursor,this._document);this._pressData={handle:r,deltaX:o,deltaY:a,override:c}}_evtPointerMove(e){if(!this._pressData)return;e.preventDefault(),e.stopPropagation();let i=this.node.getBoundingClientRect(),n=e.clientX-i.left-this._pressData.deltaX,r=e.clientY-i.top-this._pressData.deltaY;this.layout.moveHandle(this._pressData.handle,n,r)}_evtPointerUp(e){e.button===0&&(e.preventDefault(),e.stopPropagation(),this._releaseMouse(),Ae.postMessage(this,qi.LayoutModified))}_releaseMouse(){this._pressData&&(this._pressData.override.dispose(),this._pressData=null,this._document.removeEventListener("keydown",this,!0),this._document.removeEventListener("pointerup",this,!0),this._document.removeEventListener("pointermove",this,!0),this._document.removeEventListener("contextmenu",this,!0))}_showOverlay(e,i){let{zone:n,target:r}=qi.findDropTarget(this,e,i,this._edges);if(n==="invalid")return this.overlay.hide(100),n;let s,o,a,l,c=ti.boxSizing(this.node),u=this.node.getBoundingClientRect();switch(n){case"root-all":s=c.paddingTop,o=c.paddingLeft,a=c.paddingRight,l=c.paddingBottom;break;case"root-top":s=c.paddingTop,o=c.paddingLeft,a=c.paddingRight,l=u.height*qi.GOLDEN_RATIO;break;case"root-left":s=c.paddingTop,o=c.paddingLeft,a=u.width*qi.GOLDEN_RATIO,l=c.paddingBottom;break;case"root-right":s=c.paddingTop,o=u.width*qi.GOLDEN_RATIO,a=c.paddingRight,l=c.paddingBottom;break;case"root-bottom":s=u.height*qi.GOLDEN_RATIO,o=c.paddingLeft,a=c.paddingRight,l=c.paddingBottom;break;case"widget-all":s=r.top,o=r.left,a=r.right,l=r.bottom;break;case"widget-top":s=r.top,o=r.left,a=r.right,l=r.bottom+r.height/2;break;case"widget-left":s=r.top,o=r.left,a=r.right+r.width/2,l=r.bottom;break;case"widget-right":s=r.top,o=r.left+r.width/2,a=r.right,l=r.bottom;break;case"widget-bottom":s=r.top+r.height/2,o=r.left,a=r.right,l=r.bottom;break;case"widget-tab":{let d=r.tabBar.node.getBoundingClientRect().height;s=r.top,o=r.left,a=r.right,l=r.bottom+r.height-d;break}default:throw"unreachable"}return this.overlay.show({top:s,left:o,right:a,bottom:l}),n}_createTabBar(){let e=this._renderer.createTabBar(this._document);return qi.isGeneratedTabBarProperty.set(e,!0),this._mode==="single-document"&&e.hide(),e.tabsMovable=this._tabsMovable,e.allowDeselect=!1,e.addButtonEnabled=this._addButtonEnabled,e.removeBehavior="select-previous-tab",e.insertBehavior="select-tab-if-needed",e.tabMoved.connect(this._onTabMoved,this),e.currentChanged.connect(this._onCurrentChanged,this),e.tabCloseRequested.connect(this._onTabCloseRequested,this),e.tabDetachRequested.connect(this._onTabDetachRequested,this),e.tabActivateRequested.connect(this._onTabActivateRequested,this),e.addRequested.connect(this._onTabAddRequested,this),e}_createHandle(){return this._renderer.createHandle()}_onTabMoved(){Ae.postMessage(this,qi.LayoutModified)}_onCurrentChanged(e,i){let{previousTitle:n,currentTitle:r}=i;n&&n.owner.hide(),r&&r.owner.show(),(yu.IS_EDGE||yu.IS_IE)&&Ae.flush(),Ae.postMessage(this,qi.LayoutModified)}_onTabAddRequested(e){this._addRequested.emit(e)}_onTabActivateRequested(e,i){i.title.owner.activate()}_onTabCloseRequested(e,i){i.title.owner.close()}_onTabDetachRequested(e,i){if(this._drag)return;e.releaseMouse();let{title:n,tab:r,clientX:s,clientY:o,offset:a}=i,l=new sl.MimeData,c=()=>n.owner;l.setData("application/vnd.lumino.widget-factory",c);let u=r.cloneNode(!0);a&&(u.style.top=`-${a.y}px`,u.style.left=`-${a.x}px`),this._drag=new 
an({document:this._document,mimeData:l,dragImage:u,proposedAction:"move",supportedActions:"move",source:this}),r.classList.add("lm-mod-hidden");let d=()=>{this._drag=null,r.classList.remove("lm-mod-hidden")};this._drag.start(s,o).then(d)}};(function(t){class e{constructor(){this._timer=-1,this._hidden=!0,this.node=document.createElement("div"),this.node.classList.add("lm-DockPanel-overlay"),this.node.classList.add("lm-mod-hidden"),this.node.style.position="absolute",this.node.style.contain="strict"}show(r){let s=this.node.style;s.top=`${r.top}px`,s.left=`${r.left}px`,s.right=`${r.right}px`,s.bottom=`${r.bottom}px`,clearTimeout(this._timer),this._timer=-1,this._hidden&&(this._hidden=!1,this.node.classList.remove("lm-mod-hidden"))}hide(r){if(!this._hidden){if(r<=0){clearTimeout(this._timer),this._timer=-1,this._hidden=!0,this.node.classList.add("lm-mod-hidden");return}this._timer===-1&&(this._timer=window.setTimeout(()=>{this._timer=-1,this._hidden=!0,this.node.classList.add("lm-mod-hidden")},r))}}}t.Overlay=e;class i{createTabBar(r){let s=new gc({document:r});return s.addClass("lm-DockPanel-tabBar"),s}createHandle(){let r=document.createElement("div");return r.className="lm-DockPanel-handle",r}}t.Renderer=i,t.defaultRenderer=new i})(Mu||(Mu={}));(function(t){t.GOLDEN_RATIO=.618,t.DEFAULT_EDGES={top:12,right:40,bottom:40,left:40},t.LayoutModified=new pc("layout-modified"),t.isGeneratedTabBarProperty=new pt({name:"isGeneratedTabBar",create:()=>!1});function e(r){if(r.isEmpty)return{main:null};let s=Array.from(r.widgets()),o=r.selectedWidgets().next().value,a=o?s.indexOf(o):-1;return{main:{type:"tab-area",widgets:s,currentIndex:a}}}t.createSingleDocumentConfig=e;function i(r,s,o,a){if(!ti.hitTest(r.node,s,o))return{zone:"invalid",target:null};let l=r.layout;if(l.isEmpty)return{zone:"root-all",target:null};if(r.mode==="multiple-document"){let T=r.node.getBoundingClientRect(),O=s-T.left+1,A=o-T.top+1,b=T.right-s,M=T.bottom-o;switch(Math.min(A,b,M,O)){case A:if(Ap&&f>p&&d>_&&h>_)return{zone:"widget-all",target:c};u/=p,d/=_,f/=p,h/=_;let y=Math.min(u,d,f,h),S;switch(y){case u:S="widget-left";break;case d:S="widget-top";break;case f:S="widget-right";break;case h:S="widget-bottom";break;default:throw"unreachable"}return{zone:S,target:c}}t.findDropTarget=i;function n(r){return r.titles.length===0?null:r.currentTitle?r.currentTitle.owner:r.titles[r.titles.length-1].owner}t.getDropRef=n})(qi||(qi={}));rl=class extends sa{constructor(e={}){super(e),this._dirty=!1,this._rowSpacing=4,this._columnSpacing=4,this._items=[],this._rowStarts=[],this._columnStarts=[],this._rowSizers=[new na],this._columnSizers=[new na],this._box=null,e.rowCount!==void 0&&xn.reallocSizers(this._rowSizers,e.rowCount),e.columnCount!==void 0&&xn.reallocSizers(this._columnSizers,e.columnCount),e.rowSpacing!==void 0&&(this._rowSpacing=xn.clampValue(e.rowSpacing)),e.columnSpacing!==void 0&&(this._columnSpacing=xn.clampValue(e.columnSpacing))}dispose(){for(let e of this._items){let i=e.widget;e.dispose(),i.dispose()}this._box=null,this._items.length=0,this._rowStarts.length=0,this._rowSizers.length=0,this._columnStarts.length=0,this._columnSizers.length=0,super.dispose()}get rowCount(){return this._rowSizers.length}set rowCount(e){e!==this.rowCount&&(xn.reallocSizers(this._rowSizers,e),this.parent&&this.parent.fit())}get columnCount(){return this._columnSizers.length}set columnCount(e){e!==this.columnCount&&(xn.reallocSizers(this._columnSizers,e),this.parent&&this.parent.fit())}get rowSpacing(){return this._rowSpacing}set 
rowSpacing(e){e=xn.clampValue(e),this._rowSpacing!==e&&(this._rowSpacing=e,this.parent&&this.parent.fit())}get columnSpacing(){return this._columnSpacing}set columnSpacing(e){e=xn.clampValue(e),this._columnSpacing!==e&&(this._columnSpacing=e,this.parent&&this.parent.fit())}rowStretch(e){let i=this._rowSizers[e];return i?i.stretch:-1}setRowStretch(e,i){let n=this._rowSizers[e];n&&(i=xn.clampValue(i),n.stretch!==i&&(n.stretch=i,this.parent&&this.parent.update()))}columnStretch(e){let i=this._columnSizers[e];return i?i.stretch:-1}setColumnStretch(e,i){let n=this._columnSizers[e];n&&(i=xn.clampValue(i),n.stretch!==i&&(n.stretch=i,this.parent&&this.parent.update()))}*[Symbol.iterator](){for(let e of this._items)yield e.widget}addWidget(e){Be.findFirstIndex(this._items,n=>n.widget===e)===-1&&(this._items.push(new Cu(e)),this.parent&&this.attachWidget(e))}removeWidget(e){let i=Be.findFirstIndex(this._items,r=>r.widget===e);if(i===-1)return;let n=Be.removeAt(this._items,i);this.parent&&this.detachWidget(e),n.dispose()}init(){super.init();for(let e of this)this.attachWidget(e)}attachWidget(e){this.parent.isAttached&&Ae.sendMessage(e,me.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&Ae.sendMessage(e,me.Msg.AfterAttach),this.parent.fit()}detachWidget(e){this.parent.isAttached&&Ae.sendMessage(e,me.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&Ae.sendMessage(e,me.Msg.AfterDetach),this.parent.fit()}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_fit(){for(let l=0,c=this.rowCount;l!l.isHidden);for(let l=0,c=e.length;l({row:0,column:0,rowSpan:1,columnSpan:1}),changed:a});function e(l){let c=Math.max(0,Math.floor(l.row||0)),u=Math.max(0,Math.floor(l.column||0)),d=Math.max(1,Math.floor(l.rowSpan||0)),f=Math.max(1,Math.floor(l.columnSpan||0));return{row:c,column:u,rowSpan:d,columnSpan:f}}t.normalizeConfig=e;function i(l){return Math.max(0,Math.floor(l))}t.clampValue=i;function n(l,c){let u=t.cellConfigProperty.get(l.widget),d=t.cellConfigProperty.get(c.widget);return u.rowSpan-d.rowSpan}t.rowSpanCmp=n;function r(l,c){let u=t.cellConfigProperty.get(l.widget),d=t.cellConfigProperty.get(c.widget);return u.columnSpan-d.columnSpan}t.columnSpanCmp=r;function s(l,c){for(c=Math.max(1,Math.floor(c));l.lengthc&&(l.length=c)}t.reallocSizers=s;function o(l,c,u,d){if(u=d)return;let h=(d-f)/(u-c+1);for(let m=c;m<=u;++m)l[m].minSize+=h}t.distributeMin=o;function a(l){l.parent&&l.parent.layout instanceof rl&&l.parent.fit()}})(xn||(xn={}));Cf=class extends me{constructor(e={}){super({node:Gb.createNode()}),this._activeIndex=-1,this._tabFocusIndex=0,this._menus=[],this._childMenu=null,this._overflowMenu=null,this._menuItemSizes=[],this._overflowIndex=-1,this.addClass("lm-MenuBar"),this.setFlag(me.Flag.DisallowLayout),this.renderer=e.renderer||Cf.defaultRenderer,this._forceItemsPosition=e.forceItemsPosition||{forceX:!0,forceY:!0},this._overflowMenuOptions=e.overflowMenuOptions||{isVisible:!0}}dispose(){this._closeChildMenu(),this._menus.length=0,super.dispose()}get childMenu(){return this._childMenu}get overflowIndex(){return this._overflowIndex}get overflowMenu(){return this._overflowMenu}get contentNode(){return 
this.node.getElementsByClassName("lm-MenuBar-content")[0]}get activeMenu(){return this._menus[this._activeIndex]||null}set activeMenu(e){this.activeIndex=e?this._menus.indexOf(e):-1}get activeIndex(){return this._activeIndex}set activeIndex(e){(e<0||e>=this._menus.length)&&(e=-1),e>-1&&this._menus[e].items.length===0&&(e=-1),this._activeIndex!==e&&(this._activeIndex=e,this.update())}get menus(){return this._menus}openActiveMenu(){this._activeIndex!==-1&&(this._openChildMenu(),this._childMenu&&(this._childMenu.activeIndex=-1,this._childMenu.activateNextItem()))}addMenu(e,i=!0){this.insertMenu(this._menus.length,e,i)}insertMenu(e,i,n=!0){this._closeChildMenu();let r=this._menus.indexOf(i),s=Math.max(0,Math.min(e,this._menus.length));if(r===-1){Be.insert(this._menus,s,i),i.addClass("lm-MenuBar-menu"),i.aboutToClose.connect(this._onMenuAboutToClose,this),i.menuRequested.connect(this._onMenuMenuRequested,this),i.title.changed.connect(this._onTitleChanged,this),n&&this.update();return}s===this._menus.length&&s--,r!==s&&(Be.move(this._menus,r,s),n&&this.update())}removeMenu(e,i=!0){this.removeMenuAt(this._menus.indexOf(e),i)}removeMenuAt(e,i=!0){this._closeChildMenu();let n=Be.removeAt(this._menus,e);n&&(n.aboutToClose.disconnect(this._onMenuAboutToClose,this),n.menuRequested.disconnect(this._onMenuMenuRequested,this),n.title.changed.disconnect(this._onTitleChanged,this),n.removeClass("lm-MenuBar-menu"),i&&this.update())}clearMenus(){if(this._menus.length!==0){this._closeChildMenu();for(let e of this._menus)e.aboutToClose.disconnect(this._onMenuAboutToClose,this),e.menuRequested.disconnect(this._onMenuMenuRequested,this),e.title.changed.disconnect(this._onTitleChanged,this),e.removeClass("lm-MenuBar-menu");this._menus.length=0,this.update()}}handleEvent(e){switch(e.type){case"keydown":this._evtKeyDown(e);break;case"mousedown":this._evtMouseDown(e);break;case"mousemove":this._evtMouseMove(e);break;case"focusout":this._evtFocusOut(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("keydown",this),this.node.addEventListener("mousedown",this),this.node.addEventListener("mousemove",this),this.node.addEventListener("focusout",this),this.node.addEventListener("contextmenu",this)}onAfterDetach(e){this.node.removeEventListener("keydown",this),this.node.removeEventListener("mousedown",this),this.node.removeEventListener("mousemove",this),this.node.removeEventListener("focusout",this),this.node.removeEventListener("contextmenu",this),this._closeChildMenu()}onActivateRequest(e){this.isAttached&&this._focusItemAt(0)}onResize(e){this.update(),super.onResize(e)}onUpdateRequest(e){var i;let n=this._menus,r=this.renderer,s=this._activeIndex,o=this._tabFocusIndex>=0&&this._tabFocusIndex-1?this._overflowIndex:n.length,l=0,c=!1;a=this._overflowMenu!==null?a-1:a;let u=new Array(a);for(let d=0;d{this._tabFocusIndex=d,this.activeIndex=d}}),l+=this._menuItemSizes[d],n[d].title.label===this._overflowMenuOptions.title&&(c=!0,a--);if(this._overflowMenuOptions.isVisible){if(this._overflowIndex>-1&&!c){if(this._overflowMenu===null){let d=(i=this._overflowMenuOptions.title)!==null&&i!==void 0?i:"...";this._overflowMenu=new ra({commands:new er}),this._overflowMenu.title.label=d,this._overflowMenu.title.mnemonic=0,this.addMenu(this._overflowMenu,!1)}for(let d=n.length-2;d>=a;d--){let 
e=this._dragData;e&&(this._dragData=null,this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),e.dragAborted=!0,e.dragActive&&(Us.resetTabPositions(this.contentNode.children,this._orientation),e.override.dispose(),e.tab.classList.remove("lm-mod-dragging"),this.removeClass("lm-mod-dragging")))}_adjustCurrentForInsert(e,i){let n=this.currentTitle,r=this._currentIndex,s=this.insertBehavior;if(s==="select-tab"||s==="select-tab-if-needed"&&r===-1){this._currentIndex=e,this._previousTitle=n,this._currentChanged.emit({previousIndex:r,previousTitle:n,currentIndex:e,currentTitle:i});return}r>=e&&this._currentIndex++}_adjustCurrentForMove(e,i){this._currentIndex===e?this._currentIndex=i:this._currentIndex=i?this._currentIndex++:this._currentIndex>e&&this._currentIndex<=i&&this._currentIndex--}_adjustCurrentForRemove(e,i){let n=this._currentIndex,r=this.removeBehavior;if(n!==e){n>e&&this._currentIndex--;return}if(this._titles.length===0){this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null});return}if(r==="select-tab-after"){this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-tab-before"){this._currentIndex=Math.max(0,e-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-previous-tab"){this._previousTitle?(this._currentIndex=this._titles.indexOf(this._previousTitle),this._previousTitle=null):this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}_onTitleChanged(e){this.update()}};(function(t){class e{constructor(){this.closeIconSelector=".lm-TabBar-tabCloseIcon",this._tabID=0,this._tabKeys=new WeakMap,this._uuid=++e._nInstance}renderTab(n){let r=n.title.caption,s=this.createTabKey(n),o=s,a=this.createTabStyle(n),l=this.createTabClass(n),c=this.createTabDataset(n),u=this.createTabARIA(n);return n.title.closable?Ce.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n),this.renderCloseIcon(n)):Ce.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n))}renderIcon(n){let{title:r}=n,s=this.createIconClass(n);return Ce.div({className:s},r.icon,r.iconLabel)}renderLabel(n){return Ce.div({className:"lm-TabBar-tabLabel"},n.title.label)}renderCloseIcon(n){return Ce.div({className:"lm-TabBar-tabCloseIcon"})}createTabKey(n){let r=this._tabKeys.get(n.title);return r===void 0&&(r=`tab-key-${this._uuid}-${this._tabID++}`,this._tabKeys.set(n.title,r)),r}createTabStyle(n){return{zIndex:`${n.zIndex}`}}createTabClass(n){let r="lm-TabBar-tab";return n.title.className&&(r+=` ${n.title.className}`),n.title.closable&&(r+=" lm-mod-closable"),n.current&&(r+=" lm-mod-current"),r}createTabDataset(n){return n.title.dataset}createTabARIA(n){var r;return{role:"tab","aria-selected":n.current.toString(),tabindex:`${(r=n.tabIndex)!==null&&r!==void 0?r:"-1"}`}}createIconClass(n){let r="lm-TabBar-tabIcon",s=n.title.iconClass;return s?`${r} 
${s}`:r}}e._nInstance=0,t.Renderer=e,t.defaultRenderer=new e,t.addButtonSelector=".lm-TabBar-addButton"})(vc||(vc={}));(function(t){t.DRAG_THRESHOLD=5,t.DETACH_THRESHOLD=20;function e(){let u=document.createElement("div"),d=document.createElement("ul");d.setAttribute("role","tablist"),d.className="lm-TabBar-content",u.appendChild(d);let f=document.createElement("div");return f.className="lm-TabBar-addButton lm-mod-hidden",f.setAttribute("tabindex","-1"),f.setAttribute("role","button"),u.appendChild(f),u}t.createNode=e;function i(u){return u instanceof rg?u:new rg(u)}t.asTitle=i;function n(u){let d=window.getComputedStyle(u);return 1e3*(parseFloat(d.transitionDuration)||0)}t.parseTransitionDuration=n;function r(u,d){let f=new Array(u.length);for(let h=0,m=u.length;h=t.DRAG_THRESHOLD||h>=t.DRAG_THRESHOLD}t.dragExceeded=s;function o(u,d){let f=u.contentRect;return d.clientX=f.right+t.DETACH_THRESHOLD||d.clientY=f.bottom+t.DETACH_THRESHOLD}t.detachExceeded=o;function a(u,d,f,h){let m,p,_,y;h==="horizontal"?(m=d.pressX,p=f.clientX-d.contentRect.left,_=f.clientX,y=d.contentRect.width):(m=d.pressY,p=f.clientY-d.contentRect.top,_=f.clientY,y=d.contentRect.height);let S=d.index,T=p-d.tabPressPos,O=T+d.tabSize;for(let A=0,b=u.length;A>1);if(Ad.index&&O>x)M=`${-d.tabSize-C.margin}px`,S=Math.max(S,A);else if(A===d.index){let w=_-m,E=y-(d.tabPos+d.tabSize);M=`${Math.max(-d.tabPos,Math.min(w,E))}px`}else M="";h==="horizontal"?u[A].style.left=M:u[A].style.top=M}d.targetIndex=S}t.layoutTabs=a;function l(u,d){let f;d==="horizontal"?f=u.contentRect.width:f=u.contentRect.height;let h;if(u.targetIndex===u.index)h=0;else if(u.targetIndex>u.index){let _=u.tabLayout[u.targetIndex];h=_.pos+_.size-u.tabSize-u.tabPos}else h=u.tabLayout[u.targetIndex].pos-u.tabPos;let m=f-(u.tabPos+u.tabSize),p=Math.max(-u.tabPos,Math.min(h,m));d==="horizontal"?u.tab.style.left=`${p}px`:u.tab.style.top=`${p}px`}t.finalizeTabPosition=l;function c(u,d){for(let f of u)d==="horizontal"?f.style.left="":f.style.top=""}t.resetTabPositions=c})(Us||(Us={}));_C=class extends la{constructor(e){super(),this._spacing=4,this._dirty=!1,this._root=null,this._box=null,this._items=new Map,this.renderer=e.renderer,e.spacing!==void 0&&(this._spacing=og.clampDimension(e.spacing)),this._document=e.document||document,this._hiddenMode=e.hiddenMode!==void 0?e.hiddenMode:pe.HiddenMode.Display}dispose(){let e=this[Symbol.iterator]();this._items.forEach(i=>{i.dispose()}),this._box=null,this._root=null,this._items.clear();for(let i of e)i.dispose();super.dispose()}get hiddenMode(){return this._hiddenMode}set hiddenMode(e){if(this._hiddenMode!==e){this._hiddenMode=e;for(let i of this.tabBars())if(i.titles.length>1)for(let n of i.titles)n.owner.hiddenMode=this._hiddenMode}}get spacing(){return this._spacing}set spacing(e){e=og.clampDimension(e),this._spacing!==e&&(this._spacing=e,this.parent&&this.parent.fit())}get isEmpty(){return this._root===null}[Symbol.iterator](){return this._root?this._root.iterAllWidgets():If()}widgets(){return this._root?this._root.iterUserWidgets():If()}selectedWidgets(){return this._root?this._root.iterSelectedWidgets():If()}tabBars(){return this._root?this._root.iterTabBars():If()}handles(){return this._root?this._root.iterHandles():If()}moveHandle(e,i,n){let r=e.classList.contains("lm-mod-hidden");if(!this._root||r)return;let s=this._root.findSplitNode(e);if(!s)return;let 
o;s.node.orientation==="horizontal"?o=i-e.offsetLeft:o=n-e.offsetTop,o!==0&&(s.node.holdSizes(),as.adjust(s.node.sizers,s.index,o),this.parent&&this.parent.update())}saveLayout(){return this._root?(this._root.holdAllSizes(),{main:this._root.createConfig()}):{main:null}}restoreLayout(e){let i=new Set,n;e.main?n=tn.normalizeAreaConfig(e.main,i):n=null;let r=this.widgets(),s=this.tabBars(),o=this.handles();this._root=null;for(let a of r)i.has(a)||(a.parent=null);for(let a of s)a.dispose();for(let a of o)a.parentNode&&a.parentNode.removeChild(a);for(let a of i)a.parent=this.parent;n?this._root=tn.realizeAreaConfig(n,{createTabBar:a=>this._createTabBar(),createHandle:()=>this._createHandle()},this._document):this._root=null,this.parent&&(i.forEach(a=>{this.attachWidget(a)}),this.parent.fit())}addWidget(e,i={}){let n=i.ref||null,r=i.mode||"tab-after",s=null;if(this._root&&n&&(s=this._root.findTabNode(n)),n&&!s)throw new Error("Reference widget is not in the layout.");switch(e.parent=this.parent,r){case"tab-after":this._insertTab(e,n,s,!0);break;case"tab-before":this._insertTab(e,n,s,!1);break;case"split-top":this._insertSplit(e,n,s,"vertical",!1);break;case"split-left":this._insertSplit(e,n,s,"horizontal",!1);break;case"split-right":this._insertSplit(e,n,s,"horizontal",!0);break;case"split-bottom":this._insertSplit(e,n,s,"vertical",!0);break;case"merge-top":this._insertSplit(e,n,s,"vertical",!1,!0);break;case"merge-left":this._insertSplit(e,n,s,"horizontal",!1,!0);break;case"merge-right":this._insertSplit(e,n,s,"horizontal",!0,!0);break;case"merge-bottom":this._insertSplit(e,n,s,"vertical",!0,!0);break}this.parent&&(this.attachWidget(e),this.parent.fit())}removeWidget(e){this._removeWidget(e),this.parent&&(this.detachWidget(e),this.parent.fit())}hitTestTabAreas(e,i){if(!this._root||!this.parent||!this.parent.isVisible)return null;this._box||(this._box=ii.boxSizing(this.parent.node));let n=this.parent.node.getBoundingClientRect(),r=e-n.left-this._box.borderLeft,s=i-n.top-this._box.borderTop,o=this._root.hitTestTabNodes(r,s);if(!o)return null;let{tabBar:a,top:l,left:c,width:u,height:d}=o,f=this._box.borderLeft+this._box.borderRight,h=this._box.borderTop+this._box.borderBottom,m=n.width-f-(c+u),p=n.height-h-(l+d);return{tabBar:a,x:r,y:s,top:l,left:c,right:m,bottom:p,width:u,height:d}}init(){super.init();for(let e of this)this.attachWidget(e);for(let e of this.handles())this.parent.node.appendChild(e);this.parent.fit()}attachWidget(e){this.parent.node!==e.node.parentNode&&(this._items.set(e,new ku(e)),this.parent.isAttached&&Oe.sendMessage(e,pe.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&Oe.sendMessage(e,pe.Msg.AfterAttach))}detachWidget(e){if(this.parent.node!==e.node.parentNode)return;this.parent.isAttached&&Oe.sendMessage(e,pe.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&Oe.sendMessage(e,pe.Msg.AfterDetach);let i=this._items.get(e);i&&(this._items.delete(e),i.dispose())}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_removeWidget(e){if(!this._root)return;let 
i=this._root.findTabNode(e);if(!i)return;if(tn.removeAria(e),i.tabBar.titles.length>1){if(i.tabBar.removeTab(e.title),this._hiddenMode===pe.HiddenMode.Scale&&i.tabBar.titles.length==1){let f=i.tabBar.titles[0].owner;f.hiddenMode=pe.HiddenMode.Display}return}if(i.tabBar.dispose(),this._root===i){this._root=null;return}this._root.holdAllSizes();let n=i.parent;i.parent=null;let r=He.removeFirstOf(n.children,i),s=He.removeAt(n.handles,r);if(He.removeAt(n.sizers,r),s.parentNode&&s.parentNode.removeChild(s),n.children.length>1){n.syncHandles();return}let o=n.parent;n.parent=null;let a=n.children[0],l=n.handles[0];if(n.children.length=0,n.handles.length=0,n.sizers.length=0,l.parentNode&&l.parentNode.removeChild(l),this._root===n){a.parent=null,this._root=a;return}let c=o,u=c.children.indexOf(n);if(a instanceof tn.TabLayoutNode){a.parent=c,c.children[u]=a;return}let d=He.removeAt(c.handles,u);He.removeAt(c.children,u),He.removeAt(c.sizers,u),d.parentNode&&d.parentNode.removeChild(d);for(let f=0,h=a.children.length;f=this._left+this._width||m=this._top+this._height?null:this}createConfig(){let h=this.tabBar.titles.map(p=>p.owner),m=this.tabBar.currentIndex;return{type:"tab-area",widgets:h,currentIndex:m}}holdAllSizes(){}fit(h,m){let p=0,_=0,y=1/0,S=1/0,T=m.get(this.tabBar),O=this.tabBar.currentTitle,A=O?m.get(O.owner):void 0,[b,M]=this.sizers;return T&&T.fit(),A&&A.fit(),T&&!T.isHidden?(p=Math.max(p,T.minWidth),_+=T.minHeight,b.minSize=T.minHeight,b.maxSize=T.maxHeight):(b.minSize=0,b.maxSize=0),A&&!A.isHidden?(p=Math.max(p,A.minWidth),_+=A.minHeight,M.minSize=A.minHeight,M.maxSize=1/0):(M.minSize=0,M.maxSize=1/0),{minWidth:p,minHeight:_,maxWidth:y,maxHeight:S}}update(h,m,p,_,y,S){this._top=m,this._left=h,this._width=p,this._height=_;let T=S.get(this.tabBar),O=this.tabBar.currentTitle,A=O?S.get(O.owner):void 0;if(as.calc(this.sizers,_),T&&!T.isHidden){let b=this.sizers[0].size;T.update(h,m,p,b),m+=b}if(A&&!A.isHidden){let b=this.sizers[1].size;A.update(h,m,p,b)}}}t.TabLayoutNode=r;class s{constructor(h){this.parent=null,this.normalized=!1,this.children=[],this.sizers=[],this.handles=[],this.orientation=h}*iterAllWidgets(){for(let h of this.children)yield*h.iterAllWidgets()}*iterUserWidgets(){for(let h of this.children)yield*h.iterUserWidgets()}*iterSelectedWidgets(){for(let h of this.children)yield*h.iterSelectedWidgets()}*iterTabBars(){for(let h of this.children)yield*h.iterTabBars()}*iterHandles(){yield*this.handles;for(let h of this.children)yield*h.iterHandles()}findTabNode(h){for(let m=0,p=this.children.length;m_.createConfig());return{type:"split-area",orientation:h,children:p,sizes:m}}syncHandles(){this.handles.forEach((h,m)=>{h.setAttribute("data-orientation",this.orientation),m===this.handles.length-1?h.classList.add("lm-mod-hidden"):h.classList.remove("lm-mod-hidden")})}holdSizes(){for(let h of this.sizers)h.sizeHint=h.size}holdAllSizes(){for(let h of this.children)h.holdAllSizes();this.holdSizes()}normalizeSizes(){let h=this.sizers.length;if(h===0)return;this.holdSizes();let m=this.sizers.reduce((p,_)=>p+_.sizeHint,0);if(m===0)for(let p of this.sizers)p.size=p.sizeHint=1/h;else for(let p of this.sizers)p.size=p.sizeHint/=m;this.normalized=!0}createNormalizedSizes(){let h=this.sizers.length;if(h===0)return[];let m=this.sizers.map(_=>_.size),p=m.reduce((_,y)=>_+y,0);if(p===0)for(let _=m.length-1;_>-1;_--)m[_]=1/h;else for(let _=m.length-1;_>-1;_--)m[_]/=p;return m}fit(h,m){let p=this.orientation==="horizontal",_=Math.max(0,this.children.length-1)*h,y=p?_:0,S=p?0:_,T=1/0,O=1/0;for(let 
A=0,b=this.children.length;A=m.length)&&(p=0),{type:"tab-area",widgets:m,currentIndex:p}}function c(f,h){let m=f.orientation,p=[],_=[];for(let y=0,S=f.children.length;y{let S=n(_,h,m),T=e(f.sizes[y]),O=h.createHandle();p.children.push(S),p.handles.push(O),p.sizers.push(T),S.parent=p}),p.syncHandles(),p.normalizeSizes(),p}})(tn||(tn={}));Lu=class extends pe{constructor(e={}){super(),this._drag=null,this._tabsMovable=!0,this._tabsConstrained=!1,this._addButtonEnabled=!1,this._pressData=null,this._layoutModified=new Te(this),this._addRequested=new Te(this),this.addClass("lm-DockPanel"),this._document=e.document||document,this._mode=e.mode||"multiple-document",this._renderer=e.renderer||Lu.defaultRenderer,this._edges=e.edges||Ui.DEFAULT_EDGES,e.tabsMovable!==void 0&&(this._tabsMovable=e.tabsMovable),e.tabsConstrained!==void 0&&(this._tabsConstrained=e.tabsConstrained),e.addButtonEnabled!==void 0&&(this._addButtonEnabled=e.addButtonEnabled),this.dataset.mode=this._mode;let i={createTabBar:()=>this._createTabBar(),createHandle:()=>this._createHandle()};this.layout=new _C({document:this._document,renderer:i,spacing:e.spacing,hiddenMode:e.hiddenMode}),this.overlay=e.overlay||new Lu.Overlay,this.node.appendChild(this.overlay.node)}dispose(){this._releaseMouse(),this.overlay.hide(0),this._drag&&this._drag.dispose(),super.dispose()}get hiddenMode(){return this.layout.hiddenMode}set hiddenMode(e){this.layout.hiddenMode=e}get layoutModified(){return this._layoutModified}get addRequested(){return this._addRequested}get renderer(){return this.layout.renderer}get spacing(){return this.layout.spacing}set spacing(e){this.layout.spacing=e}get mode(){return this._mode}set mode(e){if(this._mode===e)return;this._mode=e,this.dataset.mode=e;let i=this.layout;switch(e){case"multiple-document":for(let n of i.tabBars())n.show();break;case"single-document":i.restoreLayout(Ui.createSingleDocumentConfig(this));break;default:throw"unreachable"}Oe.postMessage(this,Ui.LayoutModified)}get tabsMovable(){return this._tabsMovable}set tabsMovable(e){this._tabsMovable=e;for(let i of this.tabBars())i.tabsMovable=e}get tabsConstrained(){return this._tabsConstrained}set tabsConstrained(e){this._tabsConstrained=e}get addButtonEnabled(){return this._addButtonEnabled}set addButtonEnabled(e){this._addButtonEnabled=e;for(let i of this.tabBars())i.addButtonEnabled=e}get isEmpty(){return this.layout.isEmpty}*widgets(){yield*this.layout.widgets()}*selectedWidgets(){yield*this.layout.selectedWidgets()}*tabBars(){yield*this.layout.tabBars()}*handles(){yield*this.layout.handles()}selectWidget(e){let i=uC(this.tabBars(),n=>n.titles.indexOf(e.title)!==-1);if(!i)throw new Error("Widget is not contained in the dock panel.");i.currentTitle=e.title}activateWidget(e){this.selectWidget(e),e.activate()}saveLayout(){return this.layout.saveLayout()}restoreLayout(e){this._mode="multiple-document",this.layout.restoreLayout(e),(Iu.IS_EDGE||Iu.IS_IE)&&Oe.flush(),Oe.postMessage(this,Ui.LayoutModified)}addWidget(e,i={}){this._mode==="single-document"?this.layout.addWidget(e):this.layout.addWidget(e,i),Oe.postMessage(this,Ui.LayoutModified)}processMessage(e){e.type==="layout-modified"?this._layoutModified.emit(void 
0):super.processMessage(e)}handleEvent(e){switch(e.type){case"lm-dragenter":this._evtDragEnter(e);break;case"lm-dragleave":this._evtDragLeave(e);break;case"lm-dragover":this._evtDragOver(e);break;case"lm-drop":this._evtDrop(e);break;case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"keydown":this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("lm-dragenter",this),this.node.addEventListener("lm-dragleave",this),this.node.addEventListener("lm-dragover",this),this.node.addEventListener("lm-drop",this),this.node.addEventListener("pointerdown",this)}onAfterDetach(e){this.node.removeEventListener("lm-dragenter",this),this.node.removeEventListener("lm-dragleave",this),this.node.removeEventListener("lm-dragover",this),this.node.removeEventListener("lm-drop",this),this.node.removeEventListener("pointerdown",this),this._releaseMouse()}onChildAdded(e){Ui.isGeneratedTabBarProperty.get(e.child)||e.child.addClass("lm-DockPanel-widget")}onChildRemoved(e){Ui.isGeneratedTabBarProperty.get(e.child)||(e.child.removeClass("lm-DockPanel-widget"),Oe.postMessage(this,Ui.LayoutModified))}_evtDragEnter(e){e.mimeData.hasData("application/vnd.lumino.widget-factory")&&(e.preventDefault(),e.stopPropagation())}_evtDragLeave(e){e.preventDefault(),!(this._tabsConstrained&&e.source!==this)&&(e.stopPropagation(),this.overlay.hide(1))}_evtDragOver(e){e.preventDefault(),this._tabsConstrained&&e.source!==this||this._showOverlay(e.clientX,e.clientY)==="invalid"?e.dropAction="none":(e.stopPropagation(),e.dropAction=e.proposedAction)}_evtDrop(e){if(e.preventDefault(),this.overlay.hide(0),e.proposedAction==="none"){e.dropAction="none";return}let{clientX:i,clientY:n}=e,{zone:r,target:s}=Ui.findDropTarget(this,i,n,this._edges);if(this._tabsConstrained&&e.source!==this||r==="invalid"){e.dropAction="none";return}let a=e.mimeData.getData("application/vnd.lumino.widget-factory");if(typeof a!="function"){e.dropAction="none";return}let l=a();if(!(l instanceof pe)){e.dropAction="none";return}if(l.contains(this)){e.dropAction="none";return}let c=s?Ui.getDropRef(s.tabBar):null;switch(r){case"root-all":this.addWidget(l);break;case"root-top":this.addWidget(l,{mode:"split-top"});break;case"root-left":this.addWidget(l,{mode:"split-left"});break;case"root-right":this.addWidget(l,{mode:"split-right"});break;case"root-bottom":this.addWidget(l,{mode:"split-bottom"});break;case"widget-all":this.addWidget(l,{mode:"tab-after",ref:c});break;case"widget-top":this.addWidget(l,{mode:"split-top",ref:c});break;case"widget-left":this.addWidget(l,{mode:"split-left",ref:c});break;case"widget-right":this.addWidget(l,{mode:"split-right",ref:c});break;case"widget-bottom":this.addWidget(l,{mode:"split-bottom",ref:c});break;case"widget-tab":this.addWidget(l,{mode:"tab-after",ref:c});break;default:throw"unreachable"}e.dropAction=e.proposedAction,e.stopPropagation(),this.activateWidget(l)}_evtKeyDown(e){e.preventDefault(),e.stopPropagation(),e.keyCode===27&&(this._releaseMouse(),Oe.postMessage(this,Ui.LayoutModified))}_evtPointerDown(e){if(e.button!==0)return;let i=this.layout,n=e.target,r=uC(i.handles(),u=>u.contains(n));if(!r)return;e.preventDefault(),e.stopPropagation(),this._document.addEventListener("keydown",this,!0),this._document.addEventListener("pointerup",this,!0),this._document.addEventListener("pointermove",this,!0),this._document.addEventListener("contextmenu",this,!0);let 
s=r.getBoundingClientRect(),o=e.clientX-s.left,a=e.clientY-s.top,l=window.getComputedStyle(r),c=an.overrideCursor(l.cursor,this._document);this._pressData={handle:r,deltaX:o,deltaY:a,override:c}}_evtPointerMove(e){if(!this._pressData)return;e.preventDefault(),e.stopPropagation();let i=this.node.getBoundingClientRect(),n=e.clientX-i.left-this._pressData.deltaX,r=e.clientY-i.top-this._pressData.deltaY;this.layout.moveHandle(this._pressData.handle,n,r)}_evtPointerUp(e){e.button===0&&(e.preventDefault(),e.stopPropagation(),this._releaseMouse(),Oe.postMessage(this,Ui.LayoutModified))}_releaseMouse(){this._pressData&&(this._pressData.override.dispose(),this._pressData=null,this._document.removeEventListener("keydown",this,!0),this._document.removeEventListener("pointerup",this,!0),this._document.removeEventListener("pointermove",this,!0),this._document.removeEventListener("contextmenu",this,!0))}_showOverlay(e,i){let{zone:n,target:r}=Ui.findDropTarget(this,e,i,this._edges);if(n==="invalid")return this.overlay.hide(100),n;let s,o,a,l,c=ii.boxSizing(this.node),u=this.node.getBoundingClientRect();switch(n){case"root-all":s=c.paddingTop,o=c.paddingLeft,a=c.paddingRight,l=c.paddingBottom;break;case"root-top":s=c.paddingTop,o=c.paddingLeft,a=c.paddingRight,l=u.height*Ui.GOLDEN_RATIO;break;case"root-left":s=c.paddingTop,o=c.paddingLeft,a=u.width*Ui.GOLDEN_RATIO,l=c.paddingBottom;break;case"root-right":s=c.paddingTop,o=u.width*Ui.GOLDEN_RATIO,a=c.paddingRight,l=c.paddingBottom;break;case"root-bottom":s=u.height*Ui.GOLDEN_RATIO,o=c.paddingLeft,a=c.paddingRight,l=c.paddingBottom;break;case"widget-all":s=r.top,o=r.left,a=r.right,l=r.bottom;break;case"widget-top":s=r.top,o=r.left,a=r.right,l=r.bottom+r.height/2;break;case"widget-left":s=r.top,o=r.left,a=r.right+r.width/2,l=r.bottom;break;case"widget-right":s=r.top,o=r.left+r.width/2,a=r.right,l=r.bottom;break;case"widget-bottom":s=r.top+r.height/2,o=r.left,a=r.right,l=r.bottom;break;case"widget-tab":{let d=r.tabBar.node.getBoundingClientRect().height;s=r.top,o=r.left,a=r.right,l=r.bottom+r.height-d;break}default:throw"unreachable"}return this.overlay.show({top:s,left:o,right:a,bottom:l}),n}_createTabBar(){let e=this._renderer.createTabBar(this._document);return Ui.isGeneratedTabBarProperty.set(e,!0),this._mode==="single-document"&&e.hide(),e.tabsMovable=this._tabsMovable,e.allowDeselect=!1,e.addButtonEnabled=this._addButtonEnabled,e.removeBehavior="select-previous-tab",e.insertBehavior="select-tab-if-needed",e.tabMoved.connect(this._onTabMoved,this),e.currentChanged.connect(this._onCurrentChanged,this),e.tabCloseRequested.connect(this._onTabCloseRequested,this),e.tabDetachRequested.connect(this._onTabDetachRequested,this),e.tabActivateRequested.connect(this._onTabActivateRequested,this),e.addRequested.connect(this._onTabAddRequested,this),e}_createHandle(){return this._renderer.createHandle()}_onTabMoved(){Oe.postMessage(this,Ui.LayoutModified)}_onCurrentChanged(e,i){let{previousTitle:n,currentTitle:r}=i;n&&n.owner.hide(),r&&r.owner.show(),(Iu.IS_EDGE||Iu.IS_IE)&&Oe.flush(),Oe.postMessage(this,Ui.LayoutModified)}_onTabAddRequested(e){this._addRequested.emit(e)}_onTabActivateRequested(e,i){i.title.owner.activate()}_onTabCloseRequested(e,i){i.title.owner.close()}_onTabDetachRequested(e,i){if(this._drag)return;e.releaseMouse();let{title:n,tab:r,clientX:s,clientY:o,offset:a}=i,l=new ul.MimeData,c=()=>n.owner;l.setData("application/vnd.lumino.widget-factory",c);let u=r.cloneNode(!0);a&&(u.style.top=`-${a.y}px`,u.style.left=`-${a.x}px`),this._drag=new 
an({document:this._document,mimeData:l,dragImage:u,proposedAction:"move",supportedActions:"move",source:this}),r.classList.add("lm-mod-hidden");let d=()=>{this._drag=null,r.classList.remove("lm-mod-hidden")};this._drag.start(s,o).then(d)}};(function(t){class e{constructor(){this._timer=-1,this._hidden=!0,this.node=document.createElement("div"),this.node.classList.add("lm-DockPanel-overlay"),this.node.classList.add("lm-mod-hidden"),this.node.style.position="absolute",this.node.style.contain="strict"}show(r){let s=this.node.style;s.top=`${r.top}px`,s.left=`${r.left}px`,s.right=`${r.right}px`,s.bottom=`${r.bottom}px`,clearTimeout(this._timer),this._timer=-1,this._hidden&&(this._hidden=!1,this.node.classList.remove("lm-mod-hidden"))}hide(r){if(!this._hidden){if(r<=0){clearTimeout(this._timer),this._timer=-1,this._hidden=!0,this.node.classList.add("lm-mod-hidden");return}this._timer===-1&&(this._timer=window.setTimeout(()=>{this._timer=-1,this._hidden=!0,this.node.classList.add("lm-mod-hidden")},r))}}}t.Overlay=e;class i{createTabBar(r){let s=new vc({document:r});return s.addClass("lm-DockPanel-tabBar"),s}createHandle(){let r=document.createElement("div");return r.className="lm-DockPanel-handle",r}}t.Renderer=i,t.defaultRenderer=new i})(Lu||(Lu={}));(function(t){t.GOLDEN_RATIO=.618,t.DEFAULT_EDGES={top:12,right:40,bottom:40,left:40},t.LayoutModified=new _c("layout-modified"),t.isGeneratedTabBarProperty=new pt({name:"isGeneratedTabBar",create:()=>!1});function e(r){if(r.isEmpty)return{main:null};let s=Array.from(r.widgets()),o=r.selectedWidgets().next().value,a=o?s.indexOf(o):-1;return{main:{type:"tab-area",widgets:s,currentIndex:a}}}t.createSingleDocumentConfig=e;function i(r,s,o,a){if(!ii.hitTest(r.node,s,o))return{zone:"invalid",target:null};let l=r.layout;if(l.isEmpty)return{zone:"root-all",target:null};if(r.mode==="multiple-document"){let T=r.node.getBoundingClientRect(),O=s-T.left+1,A=o-T.top+1,b=T.right-s,M=T.bottom-o;switch(Math.min(A,b,M,O)){case A:if(Ap&&f>p&&d>_&&h>_)return{zone:"widget-all",target:c};u/=p,d/=_,f/=p,h/=_;let y=Math.min(u,d,f,h),S;switch(y){case u:S="widget-left";break;case d:S="widget-top";break;case f:S="widget-right";break;case h:S="widget-bottom";break;default:throw"unreachable"}return{zone:S,target:c}}t.findDropTarget=i;function n(r){return r.titles.length===0?null:r.currentTitle?r.currentTitle.owner:r.titles[r.titles.length-1].owner}t.getDropRef=n})(Ui||(Ui={}));cl=class extends la{constructor(e={}){super(e),this._dirty=!1,this._rowSpacing=4,this._columnSpacing=4,this._items=[],this._rowStarts=[],this._columnStarts=[],this._rowSizers=[new oa],this._columnSizers=[new oa],this._box=null,e.rowCount!==void 0&&yn.reallocSizers(this._rowSizers,e.rowCount),e.columnCount!==void 0&&yn.reallocSizers(this._columnSizers,e.columnCount),e.rowSpacing!==void 0&&(this._rowSpacing=yn.clampValue(e.rowSpacing)),e.columnSpacing!==void 0&&(this._columnSpacing=yn.clampValue(e.columnSpacing))}dispose(){for(let e of this._items){let i=e.widget;e.dispose(),i.dispose()}this._box=null,this._items.length=0,this._rowStarts.length=0,this._rowSizers.length=0,this._columnStarts.length=0,this._columnSizers.length=0,super.dispose()}get rowCount(){return this._rowSizers.length}set rowCount(e){e!==this.rowCount&&(yn.reallocSizers(this._rowSizers,e),this.parent&&this.parent.fit())}get columnCount(){return this._columnSizers.length}set columnCount(e){e!==this.columnCount&&(yn.reallocSizers(this._columnSizers,e),this.parent&&this.parent.fit())}get rowSpacing(){return this._rowSpacing}set 
rowSpacing(e){e=yn.clampValue(e),this._rowSpacing!==e&&(this._rowSpacing=e,this.parent&&this.parent.fit())}get columnSpacing(){return this._columnSpacing}set columnSpacing(e){e=yn.clampValue(e),this._columnSpacing!==e&&(this._columnSpacing=e,this.parent&&this.parent.fit())}rowStretch(e){let i=this._rowSizers[e];return i?i.stretch:-1}setRowStretch(e,i){let n=this._rowSizers[e];n&&(i=yn.clampValue(i),n.stretch!==i&&(n.stretch=i,this.parent&&this.parent.update()))}columnStretch(e){let i=this._columnSizers[e];return i?i.stretch:-1}setColumnStretch(e,i){let n=this._columnSizers[e];n&&(i=yn.clampValue(i),n.stretch!==i&&(n.stretch=i,this.parent&&this.parent.update()))}*[Symbol.iterator](){for(let e of this._items)yield e.widget}addWidget(e){He.findFirstIndex(this._items,n=>n.widget===e)===-1&&(this._items.push(new ku(e)),this.parent&&this.attachWidget(e))}removeWidget(e){let i=He.findFirstIndex(this._items,r=>r.widget===e);if(i===-1)return;let n=He.removeAt(this._items,i);this.parent&&this.detachWidget(e),n.dispose()}init(){super.init();for(let e of this)this.attachWidget(e)}attachWidget(e){this.parent.isAttached&&Oe.sendMessage(e,pe.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&Oe.sendMessage(e,pe.Msg.AfterAttach),this.parent.fit()}detachWidget(e){this.parent.isAttached&&Oe.sendMessage(e,pe.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&Oe.sendMessage(e,pe.Msg.AfterDetach),this.parent.fit()}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_fit(){for(let l=0,c=this.rowCount;l!l.isHidden);for(let l=0,c=e.length;l({row:0,column:0,rowSpan:1,columnSpan:1}),changed:a});function e(l){let c=Math.max(0,Math.floor(l.row||0)),u=Math.max(0,Math.floor(l.column||0)),d=Math.max(1,Math.floor(l.rowSpan||0)),f=Math.max(1,Math.floor(l.columnSpan||0));return{row:c,column:u,rowSpan:d,columnSpan:f}}t.normalizeConfig=e;function i(l){return Math.max(0,Math.floor(l))}t.clampValue=i;function n(l,c){let u=t.cellConfigProperty.get(l.widget),d=t.cellConfigProperty.get(c.widget);return u.rowSpan-d.rowSpan}t.rowSpanCmp=n;function r(l,c){let u=t.cellConfigProperty.get(l.widget),d=t.cellConfigProperty.get(c.widget);return u.columnSpan-d.columnSpan}t.columnSpanCmp=r;function s(l,c){for(c=Math.max(1,Math.floor(c));l.lengthc&&(l.length=c)}t.reallocSizers=s;function o(l,c,u,d){if(u=d)return;let h=(d-f)/(u-c+1);for(let m=c;m<=u;++m)l[m].minSize+=h}t.distributeMin=o;function a(l){l.parent&&l.parent.layout instanceof cl&&l.parent.fit()}})(yn||(yn={}));Lf=class extends pe{constructor(e={}){super({node:Qb.createNode()}),this._activeIndex=-1,this._tabFocusIndex=0,this._menus=[],this._childMenu=null,this._overflowMenu=null,this._menuItemSizes=[],this._overflowIndex=-1,this.addClass("lm-MenuBar"),this.setFlag(pe.Flag.DisallowLayout),this.renderer=e.renderer||Lf.defaultRenderer,this._forceItemsPosition=e.forceItemsPosition||{forceX:!0,forceY:!0},this._overflowMenuOptions=e.overflowMenuOptions||{isVisible:!0}}dispose(){this._closeChildMenu(),this._menus.length=0,super.dispose()}get childMenu(){return this._childMenu}get overflowIndex(){return this._overflowIndex}get overflowMenu(){return this._overflowMenu}get contentNode(){return 
this.node.getElementsByClassName("lm-MenuBar-content")[0]}get activeMenu(){return this._menus[this._activeIndex]||null}set activeMenu(e){this.activeIndex=e?this._menus.indexOf(e):-1}get activeIndex(){return this._activeIndex}set activeIndex(e){(e<0||e>=this._menus.length)&&(e=-1),e>-1&&this._menus[e].items.length===0&&(e=-1),this._activeIndex!==e&&(this._activeIndex=e,this.update())}get menus(){return this._menus}openActiveMenu(){this._activeIndex!==-1&&(this._openChildMenu(),this._childMenu&&(this._childMenu.activeIndex=-1,this._childMenu.activateNextItem()))}addMenu(e,i=!0){this.insertMenu(this._menus.length,e,i)}insertMenu(e,i,n=!0){this._closeChildMenu();let r=this._menus.indexOf(i),s=Math.max(0,Math.min(e,this._menus.length));if(r===-1){He.insert(this._menus,s,i),i.addClass("lm-MenuBar-menu"),i.aboutToClose.connect(this._onMenuAboutToClose,this),i.menuRequested.connect(this._onMenuMenuRequested,this),i.title.changed.connect(this._onTitleChanged,this),n&&this.update();return}s===this._menus.length&&s--,r!==s&&(He.move(this._menus,r,s),n&&this.update())}removeMenu(e,i=!0){this.removeMenuAt(this._menus.indexOf(e),i)}removeMenuAt(e,i=!0){this._closeChildMenu();let n=He.removeAt(this._menus,e);n&&(n.aboutToClose.disconnect(this._onMenuAboutToClose,this),n.menuRequested.disconnect(this._onMenuMenuRequested,this),n.title.changed.disconnect(this._onTitleChanged,this),n.removeClass("lm-MenuBar-menu"),i&&this.update())}clearMenus(){if(this._menus.length!==0){this._closeChildMenu();for(let e of this._menus)e.aboutToClose.disconnect(this._onMenuAboutToClose,this),e.menuRequested.disconnect(this._onMenuMenuRequested,this),e.title.changed.disconnect(this._onTitleChanged,this),e.removeClass("lm-MenuBar-menu");this._menus.length=0,this.update()}}handleEvent(e){switch(e.type){case"keydown":this._evtKeyDown(e);break;case"mousedown":this._evtMouseDown(e);break;case"mousemove":this._evtMouseMove(e);break;case"focusout":this._evtFocusOut(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("keydown",this),this.node.addEventListener("mousedown",this),this.node.addEventListener("mousemove",this),this.node.addEventListener("focusout",this),this.node.addEventListener("contextmenu",this)}onAfterDetach(e){this.node.removeEventListener("keydown",this),this.node.removeEventListener("mousedown",this),this.node.removeEventListener("mousemove",this),this.node.removeEventListener("focusout",this),this.node.removeEventListener("contextmenu",this),this._closeChildMenu()}onActivateRequest(e){this.isAttached&&this._focusItemAt(0)}onResize(e){this.update(),super.onResize(e)}onUpdateRequest(e){var i;let n=this._menus,r=this.renderer,s=this._activeIndex,o=this._tabFocusIndex>=0&&this._tabFocusIndex-1?this._overflowIndex:n.length,l=0,c=!1;a=this._overflowMenu!==null?a-1:a;let u=new Array(a);for(let d=0;d{this._tabFocusIndex=d,this.activeIndex=d}}),l+=this._menuItemSizes[d],n[d].title.label===this._overflowMenuOptions.title&&(c=!0,a--);if(this._overflowMenuOptions.isVisible){if(this._overflowIndex>-1&&!c){if(this._overflowMenu===null){let d=(i=this._overflowMenuOptions.title)!==null&&i!==void 0?i:"...";this._overflowMenu=new aa({commands:new er}),this._overflowMenu.title.label=d,this._overflowMenu.title.mnemonic=0,this.addMenu(this._overflowMenu,!1)}for(let d=n.length-2;d>=a;d--){let 
f=this.menus[d];f.title.mnemonic=0,this._overflowMenu.insertItem(0,{type:"submenu",submenu:f}),this.removeMenu(f,!1)}u[a]=r.renderItem({title:this._overflowMenu.title,active:a===s&&n[a].items.length!==0,tabbable:a===o,disabled:n[a].items.length===0,onfocus:()=>{this._tabFocusIndex=a,this.activeIndex=a}}),a++}else if(this._overflowMenu!==null){let d=this._overflowMenu.items,f=this.node.offsetWidth,h=this._overflowMenu.items.length;for(let m=0;mthis._menuItemSizes[p]){let _=d[0].submenu;this._overflowMenu.removeItemAt(0),this.insertMenu(a,_,!1),u[a]=r.renderItem({title:_.title,active:!1,tabbable:a===o,disabled:n[a].items.length===0,onfocus:()=>{this._tabFocusIndex=a,this.activeIndex=a}}),a++}}this._overflowMenu.items.length===0&&(this.removeMenu(this._overflowMenu,!1),u.pop(),this._overflowMenu=null,this._overflowIndex=-1)}}Qt.render(u,this.contentNode),this._updateOverflowIndex()}_updateOverflowIndex(){if(!this._overflowMenuOptions.isVisible)return;let e=this.contentNode.childNodes,i=this.node.offsetWidth,n=0,r=-1,s=e.length;if(this._menuItemSizes.length==0)for(let o=0;oi&&r===-1&&(r=o)}else for(let o=0;oi){r=o;break}this._overflowIndex=r}_evtKeyDown(e){let i=e.keyCode;if(i===9){this.activeIndex=-1;return}if(e.preventDefault(),e.stopPropagation(),i===13||i===32||i===38||i===40){if(this.activeIndex=this._tabFocusIndex,this.activeIndex!==this._tabFocusIndex)return;this.openActiveMenu();return}if(i===27){this._closeChildMenu(),this._focusItemAt(this.activeIndex);return}if(i===37||i===39){let o=i===37?-1:1,a=this._tabFocusIndex+o,l=this._menus.length;for(let c=0;cii.hitTest(n,e.clientX,e.clientY));if(i===-1){this._closeChildMenu();return}if(e.button===0)if(this._childMenu)this._closeChildMenu(),this.activeIndex=i;else{e.preventDefault();let n=this._positionForMenu(i);aa.saveWindowData(),this.activeIndex=i,this._openChildMenu(n)}}_evtMouseMove(e){let i=He.findFirstIndex(this.contentNode.children,r=>ii.hitTest(r,e.clientX,e.clientY));if(i===this._activeIndex||i===-1&&this._childMenu)return;let n=i>=0&&this._childMenu?this._positionForMenu(i):null;aa.saveWindowData(),this.activeIndex=i,n&&this._openChildMenu(n)}_positionForMenu(e){let i=this.contentNode.children[e],{left:n,bottom:r}=i.getBoundingClientRect();return{top:r,left:n}}_evtFocusOut(e){!this._childMenu&&!this.node.contains(e.relatedTarget)&&(this.activeIndex=-1)}_focusItemAt(e){let i=this.contentNode.childNodes[e];i&&i.focus()}_openChildMenu(e={}){let i=this.activeMenu;if(!i){this._closeChildMenu();return}let n=this._childMenu;if(n===i)return;this._childMenu=i,n?n.close():document.addEventListener("mousedown",this,!0),this._tabFocusIndex=this.activeIndex,Oe.sendMessage(this,pe.Msg.UpdateRequest);let{left:r,top:s}=e;(typeof r>"u"||typeof s>"u")&&({left:r,top:s}=this._positionForMenu(this._activeIndex)),n||this.addClass("lm-mod-active"),i.items.length>0&&i.open(r,s,this._forceItemsPosition)}_closeChildMenu(){if(!this._childMenu)return;this.removeClass("lm-mod-active"),document.removeEventListener("mousedown",this,!0);let e=this._childMenu;this._childMenu=null,e.close(),this.activeIndex=-1}_onMenuAboutToClose(e){e===this._childMenu&&(this.removeClass("lm-mod-active"),document.removeEventListener("mousedown",this,!0),this._childMenu=null,this.activeIndex=-1)}_onMenuMenuRequested(e,i){if(e!==this._childMenu)return;let n=this._activeIndex,r=this._menus.length;switch(i){case"next":this.activeIndex=n===r-1?0:n+1;break;case"previous":this.activeIndex=n===0?r-1:n-1;break}this.openActiveMenu()}_onTitleChanged(){this.update()}};(function(t){class 
e{renderItem(n){let r=this.createItemClass(n),s=this.createItemDataset(n),o=this.createItemARIA(n);return Ce.li({className:r,dataset:s,...n.disabled?{}:{tabindex:n.tabbable?"0":"-1"},onfocus:n.onfocus,...o},this.renderIcon(n),this.renderLabel(n))}renderIcon(n){let r=this.createIconClass(n);return Ce.div({className:r},n.title.icon,n.title.iconLabel)}renderLabel(n){let r=this.formatLabel(n);return Ce.div({className:"lm-MenuBar-itemLabel"},r)}createItemClass(n){let r="lm-MenuBar-item";return n.title.className&&(r+=` ${n.title.className}`),n.active&&!n.disabled&&(r+=" lm-mod-active"),r}createItemDataset(n){return n.title.dataset}createItemARIA(n){return{role:"menuitem","aria-haspopup":"true","aria-disabled":n.disabled?"true":"false"}}createIconClass(n){let r="lm-MenuBar-itemIcon",s=n.title.iconClass;return s?`${r} ${s}`:r}formatLabel(n){let{label:r,mnemonic:s}=n.title;if(s<0||s>=r.length)return r;let o=r.slice(0,s),a=r.slice(s+1),l=r[s],c=Ce.span({className:"lm-MenuBar-itemMnemonic"},l);return[o,c,a]}}t.Renderer=e,t.defaultRenderer=new e})(Lf||(Lf={}));(function(t){function e(){let n=document.createElement("div"),r=document.createElement("ul");return r.className="lm-MenuBar-content",n.appendChild(r),r.setAttribute("role","menubar"),n}t.createNode=e;function i(n,r,s){let o=-1,a=-1,l=!1,c=r.toUpperCase();for(let u=0,d=n.length;u=0&&m1&&this.widgets.forEach(i=>{i.hiddenMode=this._hiddenMode}))}dispose(){for(let e of this._items)e.dispose();this._box=null,this._items.length=0,super.dispose()}attachWidget(e,i){this._hiddenMode===pe.HiddenMode.Scale&&this._items.length>0?(this._items.length===1&&(this.widgets[0].hiddenMode=pe.HiddenMode.Scale),i.hiddenMode=pe.HiddenMode.Scale):i.hiddenMode=pe.HiddenMode.Display,He.insert(this._items,e,new ku(i)),this.parent.isAttached&&Oe.sendMessage(i,pe.Msg.BeforeAttach),this.parent.node.appendChild(i.node),this.parent.isAttached&&Oe.sendMessage(i,pe.Msg.AfterAttach),this.parent.fit()}moveWidget(e,i,n){He.move(this._items,e,i),this.parent.update()}detachWidget(e,i){let n=He.removeAt(this._items,e);this.parent.isAttached&&Oe.sendMessage(i,pe.Msg.BeforeDetach),this.parent.node.removeChild(i.node),this.parent.isAttached&&Oe.sendMessage(i,pe.Msg.AfterDetach),n.widget.node.style.zIndex="",this._hiddenMode===pe.HiddenMode.Scale&&(i.hiddenMode=pe.HiddenMode.Display,this._items.length===1&&(this._items[0].widget.hiddenMode=pe.HiddenMode.Display)),n.dispose(),this.parent.fit()}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_fit(){let e=0,i=0;for(let s=0,o=this._items.length;s{var UV=1/0,VV="[object Symbol]",Y2=/[&<>"'`]/g,GV=RegExp(Y2.source),YV={"&":"&","<":"<",">":">",'"':""","'":"'","`":"`"},KV=typeof globalThis=="object"&&globalThis&&globalThis.Object===Object&&globalThis,XV=typeof self=="object"&&self&&self.Object===Object&&self,JV=KV||XV||Function("return this")();function ZV(t){return function(e){return t?.[e]}}var QV=ZV(YV),eG=Object.prototype,tG=eG.toString,U2=JV.Symbol,V2=U2?U2.prototype:void 0,G2=V2?V2.toString:void 0;function iG(t){if(typeof t=="string")return t;if(rG(t))return G2?G2.call(t):"";var e=t+"";return e=="0"&&1/t==-UV?"-0":e}function nG(t){return!!t&&typeof t=="object"}function rG(t){return typeof 
t=="symbol"||nG(t)&&tG.call(t)==VV}function sG(t){return t==null?"":iG(t)}function oG(t){return t=sG(t),t&&GV.test(t)?t.replace(Y2,QV):t}K2.exports=oG});function Z2(t){let e=[],i=null,n=null,r=null,s=0,o;t.includes("`")||t.includes("~~~")?(t=t.replace(/~/g,"~T").replace(/^(?`{3,}|(~T){3,})[^`\n]*\n([\s\S]*?)^\k`*$/gm,c=>c.replace(/\$/g,"~D")).replace(/(^|[^\\])(`+)([^\n]*?[^`\n])\2(?!`)/gm,c=>c.replace(/\$/g,"~D")),o=c=>c.replace(/~([TD])/g,(u,d)=>d==="T"?"~":J2)):o=c=>c;let l=t.replace(/\r\n?/g,` `).split(aG);for(let c=1,u=l.length;c{let s=e[r];return s.substr(0,3)==="\\\\("&&s.substr(s.length-3)==="\\\\)"?s="\\("+s.substring(3,s.length-3)+"\\)":s.substr(0,3)==="\\\\["&&s.substr(s.length-3)==="\\\\]"&&(s="\\["+s.substring(3,s.length-3)+"\\]"),s};return t.replace(/@@(\d+)@@/g,i)}function bC(t,e,i,n,r){let s=r.slice(t,e+1).join("").replace(/&/g,"&").replace(//g,">");for(navigator&&navigator.appName==="Microsoft Internet Explorer"&&(s=s.replace(/(%[^\n]*)\n/g,`$1
-`));e>t;)r[e]="",e--;return r[t]="@@"+n.length+"@@",i&&(s=i(s)),n.push(s),r}var J2,aG,xC=$(()=>{J2="$",aG=/(\$\$?|\\(?:begin|end)\{[a-z]*\*?\}|\\[{}$]|[{}]|(?:\n\s*)+|@@\d+@@|\\\\(?:\(|\)|\[|\]))/i});function SC(t){let{host:e,source:i,trusted:n,sanitizer:r,resolver:s,linkHandler:o,shouldTypeset:a,latexTypesetter:l,translator:c}=t;c=c||fo;let u=c?.load("jupyterlab"),d=i;if(!i)return e.textContent="",Promise.resolve(void 0);if(n||(d=`${i}`,i=r.sanitize(i)),e.innerHTML=i,e.getElementsByTagName("script").length>0)if(n)ca.evalInnerHTMLScriptTags(e);else{let h=document.createElement("div"),m=document.createElement("pre");m.textContent=u.__("This HTML output contains inline scripts. Are you sure that you want to run arbitrary Javascript within your JupyterLab session?");let p=document.createElement("button");p.textContent=u.__("Run"),p.onclick=_=>{e.innerHTML=d,ca.evalInnerHTMLScriptTags(e),e.firstChild&&e.removeChild(e.firstChild)},h.appendChild(m),h.appendChild(p),e.insertBefore(h,e.firstChild)}ca.handleDefaults(e,s);let f;return s?f=ca.handleUrls(e,s,o):f=Promise.resolve(void 0),f.then(()=>{a&&l&&l.typeset(e)})}function nz(t){let{host:e,mimeType:i,source:n,width:r,height:s,needsBackground:o,unconfined:a}=t;e.textContent="";let l=document.createElement("img");return l.src=`data:${i};base64,${n}`,typeof s=="number"&&(l.height=s),typeof r=="number"&&(l.width=r),o==="light"?l.classList.add("jp-needs-light-background"):o==="dark"&&l.classList.add("jp-needs-dark-background"),a===!0&&l.classList.add("jp-mod-unconfined"),e.appendChild(l),Promise.resolve(void 0)}function rz(t){let{host:e,source:i,shouldTypeset:n,latexTypesetter:r}=t;return e.textContent=i,n&&r&&r.typeset(e),Promise.resolve(void 0)}async function lg(t){let{host:e,source:i,markdownParser:n,...r}=t;if(!i){e.textContent="";return}let s="";if(n){let o=Z2(i);s=await n.render(o.text),s=Q2(s,o.math)}else s=`
${i}
c=this._handles[e].style;i?(n+=this.widgetOffset,l.update(n,r,a,s),n+=a,c.top=`${r}px`,c.left=`${n}px`,c.width=`${this._spacing}px`,c.height=`${s}px`):(r+=this.widgetOffset,l.update(n,r,o,a),r+=a,c.top=`${r}px`,c.left=`${n}px`,c.width=`${o}px`,c.height=`${this._spacing}px`)}_fit(){let e=0,i=-1;for(let l=0,c=this._items.length;l0&&(d.sizeHint=d.size),u.isHidden){d.minSize=0,d.maxSize=0;continue}u.fit(),d.stretch=So.getStretch(u.widget),n?(d.minSize=u.minWidth,d.maxSize=u.maxWidth,r+=u.minWidth,s=Math.max(s,u.minHeight)):(d.minSize=u.minHeight,d.maxSize=u.maxHeight,s+=u.minHeight,r=Math.max(r,u.minWidth))}let o=this._box=ii.boxSizing(this.parent.node);r+=o.horizontalSum,s+=o.verticalSum;let a=this.parent.node.style;a.minWidth=`${r}px`,a.minHeight=`${s}px`,this._dirty=!0,this.parent.parent&&ze.sendMessage(this.parent.parent,fe.Msg.FitRequest),this._dirty&&ze.sendMessage(this.parent,fe.Msg.UpdateRequest)}_update(e,i){this._dirty=!1;let n=0;for(let d=0,f=this._items.length;d0){let d;if(u?d=Math.max(0,o-this._fixed):d=Math.max(0,a-this._fixed),this._hasNormedSizes){for(let h of this._sizers)h.sizeHint*=d;this._hasNormedSizes=!1}let f=cs.calc(this._sizers,d);if(f>0)switch(this._alignment){case"start":break;case"center":l=0,c=f/2;break;case"end":l=0,c=f;break;case"justify":l=f/n,c=0;break;default:throw"unreachable"}}for(let d=0,f=this._items.length;d0,coerce:(o,a)=>Math.max(0,Math.floor(a)),changed:s});function e(o){let a=new da;return a.sizeHint=Math.floor(o),a}t.createSizer=e;function i(o){let a=o.createHandle();return a.style.position="absolute",a.style.contain="style",a}t.createHandle=i;function n(o){return o.reduce((a,l)=>a+l.size,0)/o.length||0}t.averageSize=n;function r(o){let a=o.length;if(a===0)return[];let l=o.reduce((c,u)=>c+Math.abs(u),0);return l===0?o.map(c=>1/a):o.map(c=>c/l)}t.normalize=r;function s(o){o.parent&&o.parent.layout instanceof So&&o.parent.fit()}})(ml||(ml={}));DC=class extends So{constructor(e){super({...e,orientation:e.orientation||"vertical"}),this._titles=[],this.titleSpace=e.titleSpace||22}get titleSpace(){return this.widgetOffset}set titleSpace(e){e=hg.clampDimension(e),this.widgetOffset!==e&&(this.widgetOffset=e,this.parent&&this.parent.fit())}get titles(){return this._titles}dispose(){this.isDisposed||(this._titles.length=0,super.dispose())}updateTitle(e,i){let n=this._titles[e],r=n.classList.contains("lm-mod-expanded"),s=p0.createTitle(this.renderer,i.title,r);this._titles[e]=s,this.parent.node.replaceChild(s,n)}insertWidget(e,i){i.id||(i.id=`id-${gl.UUID.uuid4()}`),super.insertWidget(e,i)}attachWidget(e,i){let n=p0.createTitle(this.renderer,i.title);je.insert(this._titles,e,n),this.parent.node.appendChild(n),i.node.setAttribute("role","region"),i.node.setAttribute("aria-labelledby",n.id),super.attachWidget(e,i)}moveWidget(e,i,n){je.move(this._titles,e,i),super.moveWidget(e,i,n)}detachWidget(e,i){let n=je.removeAt(this._titles,e);this.parent.node.removeChild(n),super.detachWidget(e,i)}updateItemPosition(e,i,n,r,s,o,a){let l=this._titles[e].style;l.top=`${r}px`,l.left=`${n}px`,l.height=`${this.widgetOffset}px`,i?l.width=`${s}px`:l.width=`${o}px`,super.updateItemPosition(e,i,n,r,s,o,a)}};(function(t){function e(i,n,r=!0){let s=i.createSectionTitle(n);return s.style.position="absolute",s.style.contain="strict",s.setAttribute("aria-label",`${n.label} Section`),s.setAttribute("aria-expanded",r?"true":"false"),s.setAttribute("aria-controls",n.owner.id),r&&s.classList.add("lm-mod-expanded"),s}t.createTitle=e})(p0||(p0={}));ju=class extends 
fe{constructor(e={}){super(),this.addClass("lm-Panel"),this.layout=OC.createLayout(e)}get widgets(){return this.layout.widgets}addWidget(e){this.layout.addWidget(e)}insertWidget(e,i){this.layout.insertWidget(e,i)}};(function(t){function e(i){return i.layout||new yc}t.createLayout=e})(OC||(OC={}));Fu=class extends ju{constructor(e={}){super({layout:zC.createLayout(e)}),this._handleMoved=new Te(this),this._pressData=null,this.addClass("lm-SplitPanel")}dispose(){this._releaseMouse(),super.dispose()}get orientation(){return this.layout.orientation}set orientation(e){this.layout.orientation=e}get alignment(){return this.layout.alignment}set alignment(e){this.layout.alignment=e}get spacing(){return this.layout.spacing}set spacing(e){this.layout.spacing=e}get renderer(){return this.layout.renderer}get handleMoved(){return this._handleMoved}get handles(){return this.layout.handles}relativeSizes(){return this.layout.relativeSizes()}setRelativeSizes(e,i=!0){this.layout.setRelativeSizes(e,i)}handleEvent(e){switch(e.type){case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"keydown":this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("pointerdown",this)}onAfterDetach(e){this.node.removeEventListener("pointerdown",this),this._releaseMouse()}onChildAdded(e){e.child.addClass("lm-SplitPanel-child"),this._releaseMouse()}onChildRemoved(e){e.child.removeClass("lm-SplitPanel-child"),this._releaseMouse()}_evtKeyDown(e){this._pressData&&(e.preventDefault(),e.stopPropagation()),e.keyCode===27&&this._releaseMouse()}_evtPointerDown(e){if(e.button!==0)return;let i=this.layout,n=je.findFirstIndex(i.handles,c=>c.contains(e.target));if(n===-1)return;e.preventDefault(),e.stopPropagation(),document.addEventListener("pointerup",this,!0),document.addEventListener("pointermove",this,!0),document.addEventListener("keydown",this,!0),document.addEventListener("contextmenu",this,!0);let r,s=i.handles[n],o=s.getBoundingClientRect();i.orientation==="horizontal"?r=e.clientX-o.left:r=e.clientY-o.top;let a=window.getComputedStyle(s),l=an.overrideCursor(a.cursor);this._pressData={index:n,delta:r,override:l}}_evtPointerMove(e){e.preventDefault(),e.stopPropagation();let i,n=this.layout,r=this.node.getBoundingClientRect();n.orientation==="horizontal"?i=e.clientX-r.left-this._pressData.delta:i=e.clientY-r.top-this._pressData.delta,n.moveHandle(this._pressData.index,i)}_evtPointerUp(e){e.button===0&&(e.preventDefault(),e.stopPropagation(),this._releaseMouse())}_releaseMouse(){this._pressData&&(this._pressData.override.dispose(),this._pressData=null,this._handleMoved.emit(),document.removeEventListener("keydown",this,!0),document.removeEventListener("pointerup",this,!0),document.removeEventListener("pointermove",this,!0),document.removeEventListener("contextmenu",this,!0))}};(function(t){class e{createHandle(){let s=document.createElement("div");return s.className="lm-SplitPanel-handle",s}}t.Renderer=e,t.defaultRenderer=new e;function i(r){return So.getStretch(r)}t.getStretch=i;function n(r,s){So.setStretch(r,s)}t.setStretch=n})(Fu||(Fu={}));(function(t){function e(i){return i.layout||new So({renderer:i.renderer||Fu.defaultRenderer,orientation:i.orientation,alignment:i.alignment,spacing:i.spacing})}t.createLayout=e})(zC||(zC={}));fg=class extends Fu{constructor(e={}){super({...e,layout:PC.createLayout(e)}),this._widgetSizesCache=new WeakMap,this._expansionToggled=new 
Te(this),this.addClass("lm-AccordionPanel")}get renderer(){return this.layout.renderer}get titleSpace(){return this.layout.titleSpace}set titleSpace(e){this.layout.titleSpace=e}get titles(){return this.layout.titles}get expansionToggled(){return this._expansionToggled}addWidget(e){super.addWidget(e),e.title.changed.connect(this._onTitleChanged,this)}collapse(e){let i=this.layout.widgets[e];i&&!i.isHidden&&this._toggleExpansion(e)}expand(e){let i=this.layout.widgets[e];i&&i.isHidden&&this._toggleExpansion(e)}insertWidget(e,i){super.insertWidget(e,i),i.title.changed.connect(this._onTitleChanged,this)}handleEvent(e){switch(super.handleEvent(e),e.type){case"click":this._evtClick(e);break;case"keydown":this._eventKeyDown(e);break}}onBeforeAttach(e){this.node.addEventListener("click",this),this.node.addEventListener("keydown",this),super.onBeforeAttach(e)}onAfterDetach(e){super.onAfterDetach(e),this.node.removeEventListener("click",this),this.node.removeEventListener("keydown",this)}_onTitleChanged(e){let i=je.findFirstIndex(this.widgets,n=>n.contains(e.owner));i>=0&&(this.layout.updateTitle(i,e.owner),this.update())}_computeWidgetSize(e){let i=this.layout,n=i.widgets[e];if(!n)return;let r=n.isHidden,s=i.absoluteSizes(),o=(r?-1:1)*this.spacing,a=s.reduce((c,u)=>c+u),l=[...s];if(r){let c=this._widgetSizesCache.get(n);if(!c)return;l[e]+=c;let u=l.map(d=>d-c>0).lastIndexOf(!0);u===-1?l.forEach((d,f)=>{f!==e&&(l[f]-=s[f]/a*(c-o))}):l[u]-=c-o}else{let c=s[e];this._widgetSizesCache.set(n,c),l[e]=0;let u=l.map(d=>d>0).lastIndexOf(!0);if(u===-1)return;l[u]=s[u]+c+o}return l.map(c=>c/(a+o))}_evtClick(e){let i=e.target;if(i){let n=je.findFirstIndex(this.titles,r=>r.contains(i));n>=0&&(e.preventDefault(),e.stopPropagation(),this._toggleExpansion(n))}}_eventKeyDown(e){if(e.defaultPrevented)return;let i=e.target,n=!1;if(i){let r=je.findFirstIndex(this.titles,s=>s.contains(i));if(r>=0){let s=e.keyCode.toString();if(e.key.match(/Space|Enter/)||s.match(/13|32/))i.click(),n=!0;else if(this.orientation==="horizontal"?e.key.match(/ArrowLeft|ArrowRight/)||s.match(/37|39/):e.key.match(/ArrowUp|ArrowDown/)||s.match(/38|40/)){let o=e.key.match(/ArrowLeft|ArrowUp/)||s.match(/37|38/)?-1:1,a=this.titles.length,l=(r+a+o)%a;this.titles[l].focus(),n=!0}else e.key==="End"||s==="35"?(this.titles[this.titles.length-1].focus(),n=!0):(e.key==="Home"||s==="36")&&(this.titles[0].focus(),n=!0)}n&&e.preventDefault()}}_toggleExpansion(e){let i=this.titles[e],n=this.layout.widgets[e],r=this._computeWidgetSize(e);r&&this.setRelativeSizes(r,!1),n.isHidden?(i.classList.add("lm-mod-expanded"),i.setAttribute("aria-expanded","true"),n.show()):(i.classList.remove("lm-mod-expanded"),i.setAttribute("aria-expanded","false"),n.hide()),this._expansionToggled.emit(e)}};(function(t){class e extends Fu.Renderer{constructor(){super(),this.titleClassName="lm-AccordionPanel-title",this._titleID=0,this._titleKeys=new WeakMap,this._uuid=++e._nInstance}createCollapseIcon(n){return document.createElement("span")}createSectionTitle(n){let r=document.createElement("h3");r.setAttribute("tabindex","0"),r.id=this.createTitleKey(n),r.className=this.titleClassName;for(let a in n.dataset)r.dataset[a]=n.dataset[a];let s=r.appendChild(this.createCollapseIcon(n));s.className="lm-AccordionPanel-titleCollapser";let o=r.appendChild(document.createElement("span"));return o.className="lm-AccordionPanel-titleLabel",o.textContent=n.label,o.title=n.caption||n.label,r}createTitleKey(n){let r=this._titleKeys.get(n);return r===void 
0&&(r=`title-key-${this._uuid}-${this._titleID++}`,this._titleKeys.set(n,r)),r}}e._nInstance=0,t.Renderer=e,t.defaultRenderer=new e})(fg||(fg={}));(function(t){function e(i){return i.layout||new DC({renderer:i.renderer||fg.defaultRenderer,orientation:i.orientation,alignment:i.alignment,spacing:i.spacing,titleSpace:i.titleSpace})}t.createLayout=e})(PC||(PC={}));us=class extends yc{constructor(e={}){super(),this._fixed=0,this._spacing=4,this._dirty=!1,this._sizers=[],this._items=[],this._box=null,this._alignment="start",this._direction="top-to-bottom",e.direction!==void 0&&(this._direction=e.direction),e.alignment!==void 0&&(this._alignment=e.alignment),e.spacing!==void 0&&(this._spacing=hg.clampDimension(e.spacing))}dispose(){for(let e of this._items)e.dispose();this._box=null,this._items.length=0,this._sizers.length=0,super.dispose()}get direction(){return this._direction}set direction(e){this._direction!==e&&(this._direction=e,this.parent&&(this.parent.dataset.direction=e,this.parent.fit()))}get alignment(){return this._alignment}set alignment(e){this._alignment!==e&&(this._alignment=e,this.parent&&(this.parent.dataset.alignment=e,this.parent.update()))}get spacing(){return this._spacing}set spacing(e){e=hg.clampDimension(e),this._spacing!==e&&(this._spacing=e,this.parent&&this.parent.fit())}init(){this.parent.dataset.direction=this.direction,this.parent.dataset.alignment=this.alignment,super.init()}attachWidget(e,i){je.insert(this._items,e,new Hu(i)),je.insert(this._sizers,e,new da),this.parent.isAttached&&ze.sendMessage(i,fe.Msg.BeforeAttach),this.parent.node.appendChild(i.node),this.parent.isAttached&&ze.sendMessage(i,fe.Msg.AfterAttach),this.parent.fit()}moveWidget(e,i,n){je.move(this._items,e,i),je.move(this._sizers,e,i),this.parent.update()}detachWidget(e,i){let n=je.removeAt(this._items,e);je.removeAt(this._sizers,e),this.parent.isAttached&&ze.sendMessage(i,fe.Msg.BeforeDetach),this.parent.node.removeChild(i.node),this.parent.isAttached&&ze.sendMessage(i,fe.Msg.AfterDetach),n.dispose(),this.parent.fit()}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_fit(){let e=0;for(let a=0,l=this._items.length;a0)switch(this._alignment){case"start":break;case"center":c=0,u=l/2;break;case"end":c=0,u=l;break;case"justify":c=l/n,u=0;break;default:throw"unreachable"}for(let d=0,f=this._items.length;d0,coerce:(r,s)=>Math.max(0,Math.floor(s)),changed:n}),t.sizeBasisProperty=new pt({name:"sizeBasis",create:()=>0,coerce:(r,s)=>Math.max(0,Math.floor(s)),changed:n});function e(r){return r==="left-to-right"||r==="right-to-left"}t.isHorizontal=e;function i(r){return Math.max(0,Math.floor(r))}t.clampSpacing=i;function n(r){r.parent&&r.parent.layout instanceof us&&r.parent.fit()}})(Bu||(Bu={}));g0=class extends ju{constructor(e={}){super({layout:BC.createLayout(e)}),this.addClass("lm-BoxPanel")}get direction(){return this.layout.direction}set direction(e){this.layout.direction=e}get alignment(){return this.layout.alignment}set alignment(e){this.layout.alignment=e}get spacing(){return this.layout.spacing}set spacing(e){this.layout.spacing=e}onChildAdded(e){e.child.addClass("lm-BoxPanel-child")}onChildRemoved(e){e.child.removeClass("lm-BoxPanel-child")}};(function(t){function e(s){return 
us.getStretch(s)}t.getStretch=e;function i(s,o){us.setStretch(s,o)}t.setStretch=i;function n(s){return us.getSizeBasis(s)}t.getSizeBasis=n;function r(s,o){us.setSizeBasis(s,o)}t.setSizeBasis=r})(g0||(g0={}));(function(t){function e(i){return i.layout||new us(i)}t.createLayout=e})(BC||(BC={}));Hf=class extends fe{constructor(e){super({node:fl.createNode()}),this._activeIndex=-1,this._items=[],this._results=null,this.addClass("lm-CommandPalette"),this.setFlag(fe.Flag.DisallowLayout),this.commands=e.commands,this.renderer=e.renderer||Hf.defaultRenderer,this.commands.commandChanged.connect(this._onGenericChange,this),this.commands.keyBindingChanged.connect(this._onGenericChange,this)}dispose(){this._items.length=0,this._results=null,super.dispose()}get searchNode(){return this.node.getElementsByClassName("lm-CommandPalette-search")[0]}get inputNode(){return this.node.getElementsByClassName("lm-CommandPalette-input")[0]}get contentNode(){return this.node.getElementsByClassName("lm-CommandPalette-content")[0]}get items(){return this._items}addItem(e){let i=fl.createItem(this.commands,e);return this._items.push(i),this.refresh(),i}addItems(e){let i=e.map(n=>fl.createItem(this.commands,n));return i.forEach(n=>this._items.push(n)),this.refresh(),i}removeItem(e){this.removeItemAt(this._items.indexOf(e))}removeItemAt(e){je.removeAt(this._items,e)&&this.refresh()}clearItems(){this._items.length!==0&&(this._items.length=0,this.refresh())}refresh(){if(this._results=null,this.inputNode.value!==""){let e=this.node.getElementsByClassName("lm-close-icon")[0];e.style.display="inherit"}else{let e=this.node.getElementsByClassName("lm-close-icon")[0];e.style.display="none"}this.update()}handleEvent(e){switch(e.type){case"click":this._evtClick(e);break;case"keydown":this._evtKeyDown(e);break;case"input":this.refresh();break;case"focus":case"blur":this._toggleFocused();break}}onBeforeAttach(e){this.node.addEventListener("click",this),this.node.addEventListener("keydown",this),this.node.addEventListener("input",this),this.node.addEventListener("focus",this,!0),this.node.addEventListener("blur",this,!0)}onAfterDetach(e){this.node.removeEventListener("click",this),this.node.removeEventListener("keydown",this),this.node.removeEventListener("input",this),this.node.removeEventListener("focus",this,!0),this.node.removeEventListener("blur",this,!0)}onAfterShow(e){this.update(),super.onAfterShow(e)}onActivateRequest(e){if(this.isAttached){let i=this.inputNode;i.focus(),i.select()}}onUpdateRequest(e){if(this.isHidden)return;let i=this.inputNode.value,n=this.contentNode,r=this._results;if(r||(r=this._results=fl.search(this._items,i),this._activeIndex=i?je.findFirstIndex(r,fl.canActivate):-1),!i&&r.length===0){Zt.render(null,n);return}if(i&&r.length===0){let l=this.renderer.renderEmptyMessage({query:i});Zt.render(l,n);return}let s=this.renderer,o=this._activeIndex,a=new Array(r.length);for(let l=0,c=r.length;l=r.length)n.scrollTop=0;else{let l=n.children[o];ii.scrollIntoViewIfNeeded(n,l)}}_evtClick(e){if(e.button!==0)return;if(e.target.classList.contains("lm-close-icon")){this.inputNode.value="",this.refresh();return}let i=je.findFirstIndex(this.contentNode.children,n=>n.contains(e.target));i!==-1&&(e.preventDefault(),e.stopPropagation(),this._execute(i))}_evtKeyDown(e){if(!(e.altKey||e.ctrlKey||e.metaKey||e.shiftKey))switch(e.keyCode){case 13:e.preventDefault(),e.stopPropagation(),this._execute(this._activeIndex);break;case 38:e.preventDefault(),e.stopPropagation(),this._activatePreviousItem();break;case 
40:e.preventDefault(),e.stopPropagation(),this._activateNextItem();break}}_activateNextItem(){if(!this._results||this._results.length===0)return;let e=this._activeIndex,i=this._results.length,n=eC-x),b=S.slice(0,A),M=S.slice(A);for(let C=0,x=M.length;Cp.command===h&&gl.JSONExt.deepEqual(p.args,m))||null}}})(fl||(fl={}));ha=class extends fe{constructor(e){super({node:xr.createNode()}),this._childIndex=-1,this._activeIndex=-1,this._openTimerID=0,this._closeTimerID=0,this._items=[],this._childMenu=null,this._parentMenu=null,this._aboutToClose=new Te(this),this._menuRequested=new Te(this),this.addClass("lm-Menu"),this.setFlag(fe.Flag.DisallowLayout),this.commands=e.commands,this.renderer=e.renderer||ha.defaultRenderer}dispose(){this.close(),this._items.length=0,super.dispose()}get aboutToClose(){return this._aboutToClose}get menuRequested(){return this._menuRequested}get parentMenu(){return this._parentMenu}get childMenu(){return this._childMenu}get rootMenu(){let e=this;for(;e._parentMenu;)e=e._parentMenu;return e}get leafMenu(){let e=this;for(;e._childMenu;)e=e._childMenu;return e}get contentNode(){return this.node.getElementsByClassName("lm-Menu-content")[0]}get activeItem(){return this._items[this._activeIndex]||null}set activeItem(e){this.activeIndex=e?this._items.indexOf(e):-1}get activeIndex(){return this._activeIndex}set activeIndex(e){(e<0||e>=this._items.length)&&(e=-1),e!==-1&&!xr.canActivate(this._items[e])&&(e=-1),this._activeIndex!==e&&(this._activeIndex=e,this._activeIndex>=0&&this.contentNode.childNodes[this._activeIndex]&&this.contentNode.childNodes[this._activeIndex].focus(),this.update())}get items(){return this._items}activateNextItem(){let e=this._items.length,i=this._activeIndex,n=i{this.activeIndex=a}})}Zt.render(o,this.contentNode)}onCloseRequest(e){this._cancelOpenTimer(),this._cancelCloseTimer(),this.activeIndex=-1;let i=this._childMenu;i&&(this._childIndex=-1,this._childMenu=null,i._parentMenu=null,i.close());let n=this._parentMenu;n&&(this._parentMenu=null,n._childIndex=-1,n._childMenu=null,n.activate()),this.isAttached&&this._aboutToClose.emit(void 0),super.onCloseRequest(e)}_evtKeyDown(e){e.preventDefault(),e.stopPropagation();let i=e.keyCode;if(i===13){this.triggerActiveItem();return}if(i===27){this.close();return}if(i===37){this._parentMenu?this.close():this._menuRequested.emit("previous");return}if(i===38){this.activatePreviousItem();return}if(i===39){let o=this.activeItem;o&&o.type==="submenu"?this.triggerActiveItem():this.rootMenu._menuRequested.emit("next");return}if(i===40){this.activateNextItem();return}let n=ks().keyForKeydownEvent(e);if(!n)return;let r=this._activeIndex+1,s=xr.findMnemonic(this._items,n,r);s.index!==-1&&!s.multiple?(this.activeIndex=s.index,this.triggerActiveItem()):s.index!==-1?this.activeIndex=s.index:s.auto!==-1&&(this.activeIndex=s.auto)}_evtMouseUp(e){e.button===0&&(e.preventDefault(),e.stopPropagation(),this.triggerActiveItem())}_evtMouseMove(e){let i=je.findFirstIndex(this.contentNode.children,r=>ii.hitTest(r,e.clientX,e.clientY));if(i===this._activeIndex)return;if(this.activeIndex=i,i=this.activeIndex,i===this._childIndex){this._cancelOpenTimer(),this._cancelCloseTimer();return}this._childIndex!==-1&&this._startCloseTimer(),this._cancelOpenTimer();let n=this.activeItem;!n||n.type!=="submenu"||!n.submenu||this._startOpenTimer()}_evtMouseEnter(e){for(let 
i=this._parentMenu;i;i=i._parentMenu)i._cancelOpenTimer(),i._cancelCloseTimer(),i.activeIndex=i._childIndex}_evtMouseLeave(e){if(this._cancelOpenTimer(),!this._childMenu){this.activeIndex=-1;return}let{clientX:i,clientY:n}=e;if(ii.hitTest(this._childMenu.node,i,n)){this._cancelCloseTimer();return}this.activeIndex=-1,this._startCloseTimer()}_evtMouseDown(e){this._parentMenu||(xr.hitTestMenus(this,e.clientX,e.clientY)?(e.preventDefault(),e.stopPropagation()):this.close())}_openChildMenu(e=!1){let i=this.activeItem;if(!i||i.type!=="submenu"||!i.submenu){this._closeChildMenu();return}let n=i.submenu;if(n===this._childMenu)return;ha.saveWindowData(),this._closeChildMenu(),this._childMenu=n,this._childIndex=this._activeIndex,n._parentMenu=this,ze.sendMessage(this,fe.Msg.UpdateRequest);let r=this.contentNode.children[this._activeIndex];xr.openSubmenu(n,r),e&&(n.activeIndex=-1,n.activateNextItem()),n.activate()}_closeChildMenu(){this._childMenu&&this._childMenu.close()}_startOpenTimer(){this._openTimerID===0&&(this._openTimerID=window.setTimeout(()=>{this._openTimerID=0,this._openChildMenu()},xr.TIMER_DELAY))}_startCloseTimer(){this._closeTimerID===0&&(this._closeTimerID=window.setTimeout(()=>{this._closeTimerID=0,this._closeChildMenu()},xr.TIMER_DELAY))}_cancelOpenTimer(){this._openTimerID!==0&&(clearTimeout(this._openTimerID),this._openTimerID=0)}_cancelCloseTimer(){this._closeTimerID!==0&&(clearTimeout(this._closeTimerID),this._closeTimerID=0)}static saveWindowData(){xr.saveWindowData()}};(function(t){class e{renderItem(n){let r=this.createItemClass(n),s=this.createItemDataset(n),o=this.createItemARIA(n);return Ce.li({className:r,dataset:s,tabindex:"0",onfocus:n.onfocus,...o},this.renderIcon(n),this.renderLabel(n),this.renderShortcut(n),this.renderSubmenu(n))}renderIcon(n){let r=this.createIconClass(n);return Ce.div({className:r},n.item.icon,n.item.iconLabel)}renderLabel(n){let r=this.formatLabel(n);return Ce.div({className:"lm-Menu-itemLabel"},r)}renderShortcut(n){let r=this.formatShortcut(n);return Ce.div({className:"lm-Menu-itemShortcut"},r)}renderSubmenu(n){return Ce.div({className:"lm-Menu-itemSubmenuIcon"})}createItemClass(n){let r="lm-Menu-item";n.item.isEnabled||(r+=" lm-mod-disabled"),n.item.isToggled&&(r+=" lm-mod-toggled"),n.item.isVisible||(r+=" lm-mod-hidden"),n.active&&(r+=" lm-mod-active"),n.collapsed&&(r+=" lm-mod-collapsed");let s=n.item.className;return s&&(r+=` ${s}`),r}createItemDataset(n){let r,{type:s,command:o,dataset:a}=n.item;return s==="command"?r={...a,type:s,command:o}:r={...a,type:s},r}createIconClass(n){let r="lm-Menu-itemIcon",s=n.item.iconClass;return s?`${r} ${s}`:r}createItemARIA(n){let r={};switch(n.item.type){case"separator":r.role="presentation";break;case"submenu":r["aria-haspopup"]="true",n.item.isEnabled||(r["aria-disabled"]="true");break;default:n.item.isEnabled||(r["aria-disabled"]="true"),r.role="menuitem"}return r}formatLabel(n){let{label:r,mnemonic:s}=n.item;if(s<0||s>=r.length)return r;let o=r.slice(0,s),a=r.slice(s+1),l=r[s],c=Ce.span({className:"lm-Menu-itemMnemonic"},l);return[o,c,a]}formatShortcut(n){let r=n.item.keyBinding;return r?er.formatKeystroke(r.keys):null}}t.Renderer=e,t.defaultRenderer=new e})(ha||(ha={}));(function(t){t.TIMER_DELAY=300,t.SUBMENU_OVERLAP=3;let e=null,i=0;function n(){return i>0?(i--,e):u()}function r(){e=u(),i++}t.saveWindowData=r;function s(){let p=document.createElement("div"),_=document.createElement("ul");return 
_.className="lm-Menu-content",p.appendChild(_),_.setAttribute("role","menu"),p.tabIndex=0,p}t.createNode=s;function o(p){return p.type!=="separator"&&p.isEnabled&&p.isVisible}t.canActivate=o;function a(p,_){return new m(p.commands,_)}t.createItem=a;function l(p,_,y){for(let S=p;S;S=S.childMenu)if(ii.hitTest(S.node,_,y))return!0;return!1}t.hitTestMenus=l;function c(p){let _=new Array(p.length);je.fill(_,!1);let y=0,S=p.length;for(;y=0;--T){let A=p[T];if(A.isVisible){if(A.type!=="separator")break;_[T]=!0}}let O=!1;for(;++yM+x&&(_=M+x-Z),!T&&y+X>C+w&&(y>C+w?y=C+w-X:y=y-X),B.transform=`translate(${Math.max(0,_)}px, ${Math.max(0,y)}px`,B.opacity="1"}t.openRootMenu=d;function f(p,_){let y=n(),S=y.pageXOffset,T=y.pageYOffset,O=y.clientWidth,A=y.clientHeight;ze.sendMessage(p,fe.Msg.UpdateRequest);let b=A,M=p.node,C=M.style;C.opacity="0",C.maxHeight=`${b}px`,fe.attach(p,document.body);let{width:x,height:w}=M.getBoundingClientRect(),E=ii.boxSizing(p.node),N=_.getBoundingClientRect(),B=N.right-t.SUBMENU_OVERLAP;B+x>S+O&&(B=N.left+t.SUBMENU_OVERLAP-x);let Z=N.top-E.borderTop-E.paddingTop;Z+w>T+A&&(Z=N.bottom+E.borderBottom+E.paddingBottom-w),C.transform=`translate(${Math.max(0,B)}px, ${Math.max(0,Z)}px`,C.opacity="1"}t.openSubmenu=f;function h(p,_,y){let S=-1,T=-1,O=!1,A=_.toUpperCase();for(let b=0,M=p.length;b=0&&ES.command===_&&gl.JSONExt.deepEqual(S.args,y))||null}return null}}})(xr||(xr={}));(function(t){function e(o,a){let l=n(o.selector),c=o.rank!==void 0?o.rank:1/0;return{...o,selector:l,rank:c,id:a}}t.createItem=e;function i(o,a,l,c){let u=a.target;if(!u)return null;let d=a.currentTarget;if(!d||!d.contains(u)&&(u=document.elementFromPoint(a.clientX,a.clientY),!u||!d.contains(u)))return null;let f=[],h=o.slice();for(;u!==null;){let m=[];for(let p=0,_=h.length;p<_;++p){let y=h[p];y&&Pu.matches(u,y.selector)&&(m.push(y),h[p]=null)}if(m.length!==0&&(l&&m.sort(c?s:r),f.push(...m)),u===d)break;u=u.parentElement}return l||f.sort(c?s:r),f}t.matchItems=i;function n(o){if(o.indexOf(",")!==-1)throw new Error(`Selector cannot contain commas: ${o}`);if(!Pu.isValid(o))throw new Error(`Invalid selector: ${o}`);return o}function r(o,a){let l=o.rank,c=a.rank;return l!==c?l=this._titles.length)&&(e=-1),this._currentIndex===e)return;let i=this._currentIndex,n=this._titles[i]||null,r=e,s=this._titles[r]||null;this._currentIndex=r,this._previousTitle=n,this.update(),this._currentChanged.emit({previousIndex:i,previousTitle:n,currentIndex:r,currentTitle:s})}get name(){return this._name}set name(e){this._name=e,e?this.contentNode.setAttribute("aria-label",e):this.contentNode.removeAttribute("aria-label")}get orientation(){return this._orientation}set orientation(e){this._orientation!==e&&(this._releaseMouse(),this._orientation=e,this.dataset.orientation=e,this.contentNode.setAttribute("aria-orientation",e))}get addButtonEnabled(){return this._addButtonEnabled}set addButtonEnabled(e){this._addButtonEnabled!==e&&(this._addButtonEnabled=e,e?this.addButtonNode.classList.remove("lm-mod-hidden"):this.addButtonNode.classList.add("lm-mod-hidden"))}get titles(){return this._titles}get contentNode(){return this.node.getElementsByClassName("lm-TabBar-content")[0]}get addButtonNode(){return this.node.getElementsByClassName("lm-TabBar-addButton")[0]}addTab(e){return this.insertTab(this._titles.length,e)}insertTab(e,i){this._releaseMouse();let n=Vs.asTitle(i),r=this._titles.indexOf(n),s=Math.max(0,Math.min(e,this._titles.length));return 
r===-1?(je.insert(this._titles,s,n),n.changed.connect(this._onTitleChanged,this),this.update(),this._adjustCurrentForInsert(s,n),n):(s===this._titles.length&&s--,r===s||(je.move(this._titles,r,s),this.update(),this._adjustCurrentForMove(r,s)),n)}removeTab(e){this.removeTabAt(this._titles.indexOf(e))}removeTabAt(e){this._releaseMouse();let i=je.removeAt(this._titles,e);i&&(i.changed.disconnect(this._onTitleChanged,this),i===this._previousTitle&&(this._previousTitle=null),this.update(),this._adjustCurrentForRemove(e,i))}clearTabs(){if(this._titles.length===0)return;this._releaseMouse();for(let n of this._titles)n.changed.disconnect(this._onTitleChanged,this);let e=this.currentIndex,i=this.currentTitle;this._currentIndex=-1,this._previousTitle=null,this._titles.length=0,this.update(),e!==-1&&this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}releaseMouse(){this._releaseMouse()}handleEvent(e){switch(e.type){case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"dblclick":this._evtDblClick(e);break;case"keydown":e.eventPhase===Event.CAPTURING_PHASE?this._evtKeyDownCapturing(e):this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("pointerdown",this),this.node.addEventListener("dblclick",this),this.node.addEventListener("keydown",this)}onAfterDetach(e){this.node.removeEventListener("pointerdown",this),this.node.removeEventListener("dblclick",this),this.node.removeEventListener("keydown",this),this._releaseMouse()}onUpdateRequest(e){var i;let n=this._titles,r=this.renderer,s=this.currentTitle,o=new Array(n.length),a=(i=this._getCurrentTabindex())!==null&&i!==void 0?i:this._currentIndex>-1?this._currentIndex:0;for(let l=0,c=n.length;lii.hitTest(o,e.clientX,e.clientY));if(n===-1)return;let r=this.titles[n],s=i[n].querySelector(".lm-TabBar-tabLabel");if(s&&s.contains(e.target)){let o=r.label||"",a=s.innerHTML;s.innerHTML="";let l=document.createElement("input");l.classList.add("lm-TabBar-tabInput"),l.value=o,s.appendChild(l);let c=()=>{l.removeEventListener("blur",c),s.innerHTML=a,this.node.addEventListener("keydown",this)};l.addEventListener("dblclick",u=>u.stopPropagation()),l.addEventListener("blur",c),l.addEventListener("keydown",u=>{u.key==="Enter"?(l.value!==""&&(r.label=r.caption=l.value),c()):u.key==="Escape"&&c()}),this.node.removeEventListener("keydown",this),l.select(),l.focus(),s.children.length>0&&s.children[0].focus()}}_evtKeyDownCapturing(e){e.eventPhase===Event.CAPTURING_PHASE&&(e.preventDefault(),e.stopPropagation(),e.key==="Escape"&&this._releaseMouse())}_evtKeyDown(e){var i,n,r;if(!(e.key==="Tab"||e.eventPhase===Event.CAPTURING_PHASE)){if(e.key==="Enter"||e.key==="Spacebar"||e.key===" "){let s=document.activeElement;if(this.addButtonEnabled&&this.addButtonNode.contains(s))e.preventDefault(),e.stopPropagation(),this._addRequested.emit();else{let o=je.findFirstIndex(this.contentNode.children,a=>a.contains(s));o>=0&&(e.preventDefault(),e.stopPropagation(),this.currentIndex=o)}}else if(hG.includes(e.key)){let s=[...this.contentNode.children];if(this.addButtonEnabled&&s.push(this.addButtonNode),s.length<=1)return;e.preventDefault(),e.stopPropagation();let o=s.indexOf(document.activeElement);o===-1&&(o=this._currentIndex);let a;e.key==="ArrowRight"&&this._orientation==="horizontal"||e.key==="ArrowDown"&&this._orientation==="vertical"?a=(i=s[o+1])!==null&&i!==void 
0?i:s[0]:e.key==="ArrowLeft"&&this._orientation==="horizontal"||e.key==="ArrowUp"&&this._orientation==="vertical"?a=(n=s[o-1])!==null&&n!==void 0?n:s[s.length-1]:e.key==="Home"?a=s[0]:e.key==="End"&&(a=s[s.length-1]),a&&((r=s[o])===null||r===void 0||r.setAttribute("tabindex","-1"),a?.setAttribute("tabindex","0"),a.focus())}}}_evtPointerDown(e){if(e.button!==0&&e.button!==1||this._dragData||e.target.classList.contains("lm-TabBar-tabInput"))return;let i=this.addButtonEnabled&&this.addButtonNode.contains(e.target),n=this.contentNode.children,r=je.findFirstIndex(n,o=>ii.hitTest(o,e.clientX,e.clientY));if(r===-1&&!i||(e.preventDefault(),e.stopPropagation(),this._dragData={tab:n[r],index:r,pressX:e.clientX,pressY:e.clientY,tabPos:-1,tabSize:-1,tabPressPos:-1,targetIndex:-1,tabLayout:null,contentRect:null,override:null,dragActive:!1,dragAborted:!1,detachRequested:!1},this.document.addEventListener("pointerup",this,!0),e.button===1||i))return;let s=n[r].querySelector(this.renderer.closeIconSelector);s&&s.contains(e.target)||(this.tabsMovable&&(this.document.addEventListener("pointermove",this,!0),this.document.addEventListener("keydown",this,!0),this.document.addEventListener("contextmenu",this,!0)),this.allowDeselect&&this.currentIndex===r?this.currentIndex=-1:this.currentIndex=r,this.currentIndex!==-1&&this._tabActivateRequested.emit({index:this.currentIndex,title:this.currentTitle}))}_evtPointerMove(e){let i=this._dragData;if(!i)return;e.preventDefault(),e.stopPropagation();let n=this.contentNode.children;if(!(!i.dragActive&&!Vs.dragExceeded(i,e))){if(!i.dragActive){let r=i.tab.getBoundingClientRect();this._orientation==="horizontal"?(i.tabPos=i.tab.offsetLeft,i.tabSize=r.width,i.tabPressPos=i.pressX-r.left):(i.tabPos=i.tab.offsetTop,i.tabSize=r.height,i.tabPressPos=i.pressY-r.top),i.tabPressOffset={x:i.pressX-r.left,y:i.pressY-r.top},i.tabLayout=Vs.snapTabLayout(n,this._orientation),i.contentRect=this.contentNode.getBoundingClientRect(),i.override=an.overrideCursor("default"),i.tab.classList.add("lm-mod-dragging"),this.addClass("lm-mod-dragging"),i.dragActive=!0}if(!i.detachRequested&&Vs.detachExceeded(i,e)){i.detachRequested=!0;let r=i.index,s=e.clientX,o=e.clientY,a=n[r],l=this._titles[r];if(this._tabDetachRequested.emit({index:r,title:l,tab:a,clientX:s,clientY:o,offset:i.tabPressOffset}),i.dragAborted)return}Vs.layoutTabs(n,i,e,this._orientation)}}_evtPointerUp(e){if(e.button!==0&&e.button!==1)return;let i=this._dragData;if(!i)return;if(e.preventDefault(),e.stopPropagation(),this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),!i.dragActive){if(this._dragData=null,this.addButtonEnabled&&this.addButtonNode.contains(e.target)){this._addRequested.emit(void 0);return}let s=this.contentNode.children,o=je.findFirstIndex(s,c=>ii.hitTest(c,e.clientX,e.clientY));if(o!==i.index)return;let a=this._titles[o];if(!a.closable)return;if(e.button===1){this._tabCloseRequested.emit({index:o,title:a});return}let l=s[o].querySelector(this.renderer.closeIconSelector);if(l&&l.contains(e.target)){this._tabCloseRequested.emit({index:o,title:a});return}return}if(e.button!==0)return;Vs.finalizeTabPosition(i,this._orientation),i.tab.classList.remove("lm-mod-dragging");let 
n=Vs.parseTransitionDuration(i.tab);setTimeout(()=>{if(i.dragAborted)return;this._dragData=null,Vs.resetTabPositions(this.contentNode.children,this._orientation),i.override.dispose(),this.removeClass("lm-mod-dragging");let r=i.index,s=i.targetIndex;s===-1||r===s||(je.move(this._titles,r,s),this._adjustCurrentForMove(r,s),this._tabMoved.emit({fromIndex:r,toIndex:s,title:this._titles[s]}),ze.sendMessage(this,fe.Msg.UpdateRequest))},n)}_releaseMouse(){let e=this._dragData;e&&(this._dragData=null,this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),e.dragAborted=!0,e.dragActive&&(Vs.resetTabPositions(this.contentNode.children,this._orientation),e.override.dispose(),e.tab.classList.remove("lm-mod-dragging"),this.removeClass("lm-mod-dragging")))}_adjustCurrentForInsert(e,i){let n=this.currentTitle,r=this._currentIndex,s=this.insertBehavior;if(s==="select-tab"||s==="select-tab-if-needed"&&r===-1){this._currentIndex=e,this._previousTitle=n,this._currentChanged.emit({previousIndex:r,previousTitle:n,currentIndex:e,currentTitle:i});return}r>=e&&this._currentIndex++}_adjustCurrentForMove(e,i){this._currentIndex===e?this._currentIndex=i:this._currentIndex=i?this._currentIndex++:this._currentIndex>e&&this._currentIndex<=i&&this._currentIndex--}_adjustCurrentForRemove(e,i){let n=this._currentIndex,r=this.removeBehavior;if(n!==e){n>e&&this._currentIndex--;return}if(this._titles.length===0){this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null});return}if(r==="select-tab-after"){this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-tab-before"){this._currentIndex=Math.max(0,e-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-previous-tab"){this._previousTitle?(this._currentIndex=this._titles.indexOf(this._previousTitle),this._previousTitle=null):this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}_onTitleChanged(e){this.update()}};(function(t){class e{constructor(){this.closeIconSelector=".lm-TabBar-tabCloseIcon",this._tabID=0,this._tabKeys=new WeakMap,this._uuid=++e._nInstance}renderTab(n){let r=n.title.caption,s=this.createTabKey(n),o=s,a=this.createTabStyle(n),l=this.createTabClass(n),c=this.createTabDataset(n),u=this.createTabARIA(n);return n.title.closable?Ce.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n),this.renderCloseIcon(n)):Ce.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n))}renderIcon(n){let{title:r}=n,s=this.createIconClass(n);return Ce.div({className:s},r.icon,r.iconLabel)}renderLabel(n){return Ce.div({className:"lm-TabBar-tabLabel"},n.title.label)}renderCloseIcon(n){return Ce.div({className:"lm-TabBar-tabCloseIcon"})}createTabKey(n){let r=this._tabKeys.get(n.title);return r===void 
0&&(r=`tab-key-${this._uuid}-${this._tabID++}`,this._tabKeys.set(n.title,r)),r}createTabStyle(n){return{zIndex:`${n.zIndex}`}}createTabClass(n){let r="lm-TabBar-tab";return n.title.className&&(r+=` ${n.title.className}`),n.title.closable&&(r+=" lm-mod-closable"),n.current&&(r+=" lm-mod-current"),r}createTabDataset(n){return n.title.dataset}createTabARIA(n){var r;return{role:"tab","aria-selected":n.current.toString(),tabindex:`${(r=n.tabIndex)!==null&&r!==void 0?r:"-1"}`}}createIconClass(n){let r="lm-TabBar-tabIcon",s=n.title.iconClass;return s?`${r} ${s}`:r}}e._nInstance=0,t.Renderer=e,t.defaultRenderer=new e,t.addButtonSelector=".lm-TabBar-addButton"})(wc||(wc={}));(function(t){t.DRAG_THRESHOLD=5,t.DETACH_THRESHOLD=20;function e(){let u=document.createElement("div"),d=document.createElement("ul");d.setAttribute("role","tablist"),d.className="lm-TabBar-content",u.appendChild(d);let f=document.createElement("div");return f.className="lm-TabBar-addButton lm-mod-hidden",f.setAttribute("tabindex","-1"),f.setAttribute("role","button"),u.appendChild(f),u}t.createNode=e;function i(u){return u instanceof ug?u:new ug(u)}t.asTitle=i;function n(u){let d=window.getComputedStyle(u);return 1e3*(parseFloat(d.transitionDuration)||0)}t.parseTransitionDuration=n;function r(u,d){let f=new Array(u.length);for(let h=0,m=u.length;h=t.DRAG_THRESHOLD||h>=t.DRAG_THRESHOLD}t.dragExceeded=s;function o(u,d){let f=u.contentRect;return d.clientX=f.right+t.DETACH_THRESHOLD||d.clientY=f.bottom+t.DETACH_THRESHOLD}t.detachExceeded=o;function a(u,d,f,h){let m,p,_,y;h==="horizontal"?(m=d.pressX,p=f.clientX-d.contentRect.left,_=f.clientX,y=d.contentRect.width):(m=d.pressY,p=f.clientY-d.contentRect.top,_=f.clientY,y=d.contentRect.height);let S=d.index,T=p-d.tabPressPos,O=T+d.tabSize;for(let A=0,b=u.length;A>1);if(Ad.index&&O>x)M=`${-d.tabSize-C.margin}px`,S=Math.max(S,A);else if(A===d.index){let w=_-m,E=y-(d.tabPos+d.tabSize);M=`${Math.max(-d.tabPos,Math.min(w,E))}px`}else M="";h==="horizontal"?u[A].style.left=M:u[A].style.top=M}d.targetIndex=S}t.layoutTabs=a;function l(u,d){let f;d==="horizontal"?f=u.contentRect.width:f=u.contentRect.height;let h;if(u.targetIndex===u.index)h=0;else if(u.targetIndex>u.index){let _=u.tabLayout[u.targetIndex];h=_.pos+_.size-u.tabSize-u.tabPos}else h=u.tabLayout[u.targetIndex].pos-u.tabPos;let m=f-(u.tabPos+u.tabSize),p=Math.max(-u.tabPos,Math.min(h,m));d==="horizontal"?u.tab.style.left=`${p}px`:u.tab.style.top=`${p}px`}t.finalizeTabPosition=l;function c(u,d){for(let f of u)d==="horizontal"?f.style.left="":f.style.top=""}t.resetTabPositions=c})(Vs||(Vs={}));HC=class extends fa{constructor(e){super(),this._spacing=4,this._dirty=!1,this._root=null,this._box=null,this._items=new Map,this.renderer=e.renderer,e.spacing!==void 0&&(this._spacing=hg.clampDimension(e.spacing)),this._document=e.document||document,this._hiddenMode=e.hiddenMode!==void 0?e.hiddenMode:fe.HiddenMode.Display}dispose(){let e=this[Symbol.iterator]();this._items.forEach(i=>{i.dispose()}),this._box=null,this._root=null,this._items.clear();for(let i of e)i.dispose();super.dispose()}get hiddenMode(){return this._hiddenMode}set hiddenMode(e){if(this._hiddenMode!==e){this._hiddenMode=e;for(let i of this.tabBars())if(i.titles.length>1)for(let n of i.titles)n.owner.hiddenMode=this._hiddenMode}}get spacing(){return this._spacing}set spacing(e){e=hg.clampDimension(e),this._spacing!==e&&(this._spacing=e,this.parent&&this.parent.fit())}get isEmpty(){return this._root===null}[Symbol.iterator](){return 
this._root?this._root.iterAllWidgets():Of()}widgets(){return this._root?this._root.iterUserWidgets():Of()}selectedWidgets(){return this._root?this._root.iterSelectedWidgets():Of()}tabBars(){return this._root?this._root.iterTabBars():Of()}handles(){return this._root?this._root.iterHandles():Of()}moveHandle(e,i,n){let r=e.classList.contains("lm-mod-hidden");if(!this._root||r)return;let s=this._root.findSplitNode(e);if(!s)return;let o;s.node.orientation==="horizontal"?o=i-e.offsetLeft:o=n-e.offsetTop,o!==0&&(s.node.holdSizes(),cs.adjust(s.node.sizers,s.index,o),this.parent&&this.parent.update())}saveLayout(){return this._root?(this._root.holdAllSizes(),{main:this._root.createConfig()}):{main:null}}restoreLayout(e){let i=new Set,n;e.main?n=nn.normalizeAreaConfig(e.main,i):n=null;let r=this.widgets(),s=this.tabBars(),o=this.handles();this._root=null;for(let a of r)i.has(a)||(a.parent=null);for(let a of s)a.dispose();for(let a of o)a.parentNode&&a.parentNode.removeChild(a);for(let a of i)a.parent=this.parent;n?this._root=nn.realizeAreaConfig(n,{createTabBar:a=>this._createTabBar(),createHandle:()=>this._createHandle()},this._document):this._root=null,this.parent&&(i.forEach(a=>{this.attachWidget(a)}),this.parent.fit())}addWidget(e,i={}){let n=i.ref||null,r=i.mode||"tab-after",s=null;if(this._root&&n&&(s=this._root.findTabNode(n)),n&&!s)throw new Error("Reference widget is not in the layout.");switch(e.parent=this.parent,r){case"tab-after":this._insertTab(e,n,s,!0);break;case"tab-before":this._insertTab(e,n,s,!1);break;case"split-top":this._insertSplit(e,n,s,"vertical",!1);break;case"split-left":this._insertSplit(e,n,s,"horizontal",!1);break;case"split-right":this._insertSplit(e,n,s,"horizontal",!0);break;case"split-bottom":this._insertSplit(e,n,s,"vertical",!0);break;case"merge-top":this._insertSplit(e,n,s,"vertical",!1,!0);break;case"merge-left":this._insertSplit(e,n,s,"horizontal",!1,!0);break;case"merge-right":this._insertSplit(e,n,s,"horizontal",!0,!0);break;case"merge-bottom":this._insertSplit(e,n,s,"vertical",!0,!0);break}this.parent&&(this.attachWidget(e),this.parent.fit())}removeWidget(e){this._removeWidget(e),this.parent&&(this.detachWidget(e),this.parent.fit())}hitTestTabAreas(e,i){if(!this._root||!this.parent||!this.parent.isVisible)return null;this._box||(this._box=ii.boxSizing(this.parent.node));let n=this.parent.node.getBoundingClientRect(),r=e-n.left-this._box.borderLeft,s=i-n.top-this._box.borderTop,o=this._root.hitTestTabNodes(r,s);if(!o)return null;let{tabBar:a,top:l,left:c,width:u,height:d}=o,f=this._box.borderLeft+this._box.borderRight,h=this._box.borderTop+this._box.borderBottom,m=n.width-f-(c+u),p=n.height-h-(l+d);return{tabBar:a,x:r,y:s,top:l,left:c,right:m,bottom:p,width:u,height:d}}init(){super.init();for(let e of this)this.attachWidget(e);for(let e of this.handles())this.parent.node.appendChild(e);this.parent.fit()}attachWidget(e){this.parent.node!==e.node.parentNode&&(this._items.set(e,new Hu(e)),this.parent.isAttached&&ze.sendMessage(e,fe.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&ze.sendMessage(e,fe.Msg.AfterAttach))}detachWidget(e){if(this.parent.node!==e.node.parentNode)return;this.parent.isAttached&&ze.sendMessage(e,fe.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&ze.sendMessage(e,fe.Msg.AfterDetach);let 
e.key==="End"||s==="35"?(this.titles[this.titles.length-1].focus(),n=!0):(e.key==="Home"||s==="36")&&(this.titles[0].focus(),n=!0)}n&&e.preventDefault()}}_toggleExpansion(e){let i=this.titles[e],n=this.layout.widgets[e],r=this._computeWidgetSize(e);r&&this.setRelativeSizes(r,!1),n.isHidden?(i.classList.add("lm-mod-expanded"),i.setAttribute("aria-expanded","true"),n.show()):(i.classList.remove("lm-mod-expanded"),i.setAttribute("aria-expanded","false"),n.hide()),this._expansionToggled.emit(e)}};(function(t){class e extends Fu.Renderer{constructor(){super(),this.titleClassName="lm-AccordionPanel-title",this._titleID=0,this._titleKeys=new WeakMap,this._uuid=++e._nInstance}createCollapseIcon(n){return document.createElement("span")}createSectionTitle(n){let r=document.createElement("h3");r.setAttribute("tabindex","0"),r.id=this.createTitleKey(n),r.className=this.titleClassName;for(let a in n.dataset)r.dataset[a]=n.dataset[a];let s=r.appendChild(this.createCollapseIcon(n));s.className="lm-AccordionPanel-titleCollapser";let o=r.appendChild(document.createElement("span"));return o.className="lm-AccordionPanel-titleLabel",o.textContent=n.label,o.title=n.caption||n.label,r}createTitleKey(n){let r=this._titleKeys.get(n);return r===void 0&&(r=`title-key-${this._uuid}-${this._titleID++}`,this._titleKeys.set(n,r)),r}}e._nInstance=0,t.Renderer=e,t.defaultRenderer=new e})(fg||(fg={}));(function(t){function e(i){return i.layout||new DC({renderer:i.renderer||fg.defaultRenderer,orientation:i.orientation,alignment:i.alignment,spacing:i.spacing,titleSpace:i.titleSpace})}t.createLayout=e})(PC||(PC={}));us=class extends yc{constructor(e={}){super(),this._fixed=0,this._spacing=4,this._dirty=!1,this._sizers=[],this._items=[],this._box=null,this._alignment="start",this._direction="top-to-bottom",e.direction!==void 0&&(this._direction=e.direction),e.alignment!==void 0&&(this._alignment=e.alignment),e.spacing!==void 0&&(this._spacing=hg.clampDimension(e.spacing))}dispose(){for(let e of this._items)e.dispose();this._box=null,this._items.length=0,this._sizers.length=0,super.dispose()}get direction(){return this._direction}set direction(e){this._direction!==e&&(this._direction=e,this.parent&&(this.parent.dataset.direction=e,this.parent.fit()))}get alignment(){return this._alignment}set alignment(e){this._alignment!==e&&(this._alignment=e,this.parent&&(this.parent.dataset.alignment=e,this.parent.update()))}get spacing(){return this._spacing}set spacing(e){e=hg.clampDimension(e),this._spacing!==e&&(this._spacing=e,this.parent&&this.parent.fit())}init(){this.parent.dataset.direction=this.direction,this.parent.dataset.alignment=this.alignment,super.init()}attachWidget(e,i){je.insert(this._items,e,new Hu(i)),je.insert(this._sizers,e,new da),this.parent.isAttached&&ze.sendMessage(i,fe.Msg.BeforeAttach),this.parent.node.appendChild(i.node),this.parent.isAttached&&ze.sendMessage(i,fe.Msg.AfterAttach),this.parent.fit()}moveWidget(e,i,n){je.move(this._items,e,i),je.move(this._sizers,e,i),this.parent.update()}detachWidget(e,i){let 
n=je.removeAt(this._items,e);je.removeAt(this._sizers,e),this.parent.isAttached&&ze.sendMessage(i,fe.Msg.BeforeDetach),this.parent.node.removeChild(i.node),this.parent.isAttached&&ze.sendMessage(i,fe.Msg.AfterDetach),n.dispose(),this.parent.fit()}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_fit(){let e=0;for(let a=0,l=this._items.length;a0)switch(this._alignment){case"start":break;case"center":c=0,u=l/2;break;case"end":c=0,u=l;break;case"justify":c=l/n,u=0;break;default:throw"unreachable"}for(let d=0,f=this._items.length;d0,coerce:(r,s)=>Math.max(0,Math.floor(s)),changed:n}),t.sizeBasisProperty=new pt({name:"sizeBasis",create:()=>0,coerce:(r,s)=>Math.max(0,Math.floor(s)),changed:n});function e(r){return r==="left-to-right"||r==="right-to-left"}t.isHorizontal=e;function i(r){return Math.max(0,Math.floor(r))}t.clampSpacing=i;function n(r){r.parent&&r.parent.layout instanceof us&&r.parent.fit()}})(Bu||(Bu={}));g0=class extends ju{constructor(e={}){super({layout:BC.createLayout(e)}),this.addClass("lm-BoxPanel")}get direction(){return this.layout.direction}set direction(e){this.layout.direction=e}get alignment(){return this.layout.alignment}set alignment(e){this.layout.alignment=e}get spacing(){return this.layout.spacing}set spacing(e){this.layout.spacing=e}onChildAdded(e){e.child.addClass("lm-BoxPanel-child")}onChildRemoved(e){e.child.removeClass("lm-BoxPanel-child")}};(function(t){function e(s){return us.getStretch(s)}t.getStretch=e;function i(s,o){us.setStretch(s,o)}t.setStretch=i;function n(s){return us.getSizeBasis(s)}t.getSizeBasis=n;function r(s,o){us.setSizeBasis(s,o)}t.setSizeBasis=r})(g0||(g0={}));(function(t){function e(i){return i.layout||new us(i)}t.createLayout=e})(BC||(BC={}));jf=class extends fe{constructor(e){super({node:fl.createNode()}),this._activeIndex=-1,this._items=[],this._results=null,this.addClass("lm-CommandPalette"),this.setFlag(fe.Flag.DisallowLayout),this.commands=e.commands,this.renderer=e.renderer||jf.defaultRenderer,this.commands.commandChanged.connect(this._onGenericChange,this),this.commands.keyBindingChanged.connect(this._onGenericChange,this)}dispose(){this._items.length=0,this._results=null,super.dispose()}get searchNode(){return this.node.getElementsByClassName("lm-CommandPalette-search")[0]}get inputNode(){return this.node.getElementsByClassName("lm-CommandPalette-input")[0]}get contentNode(){return this.node.getElementsByClassName("lm-CommandPalette-content")[0]}get items(){return this._items}addItem(e){let i=fl.createItem(this.commands,e);return this._items.push(i),this.refresh(),i}addItems(e){let i=e.map(n=>fl.createItem(this.commands,n));return i.forEach(n=>this._items.push(n)),this.refresh(),i}removeItem(e){this.removeItemAt(this._items.indexOf(e))}removeItemAt(e){je.removeAt(this._items,e)&&this.refresh()}clearItems(){this._items.length!==0&&(this._items.length=0,this.refresh())}refresh(){if(this._results=null,this.inputNode.value!==""){let e=this.node.getElementsByClassName("lm-close-icon")[0];e.style.display="inherit"}else{let 
e=this.node.getElementsByClassName("lm-close-icon")[0];e.style.display="none"}this.update()}handleEvent(e){switch(e.type){case"click":this._evtClick(e);break;case"keydown":this._evtKeyDown(e);break;case"input":this.refresh();break;case"focus":case"blur":this._toggleFocused();break}}onBeforeAttach(e){this.node.addEventListener("click",this),this.node.addEventListener("keydown",this),this.node.addEventListener("input",this),this.node.addEventListener("focus",this,!0),this.node.addEventListener("blur",this,!0)}onAfterDetach(e){this.node.removeEventListener("click",this),this.node.removeEventListener("keydown",this),this.node.removeEventListener("input",this),this.node.removeEventListener("focus",this,!0),this.node.removeEventListener("blur",this,!0)}onAfterShow(e){this.update(),super.onAfterShow(e)}onActivateRequest(e){if(this.isAttached){let i=this.inputNode;i.focus(),i.select()}}onUpdateRequest(e){if(this.isHidden)return;let i=this.inputNode.value,n=this.contentNode,r=this._results;if(r||(r=this._results=fl.search(this._items,i),this._activeIndex=i?je.findFirstIndex(r,fl.canActivate):-1),!i&&r.length===0){Qt.render(null,n);return}if(i&&r.length===0){let l=this.renderer.renderEmptyMessage({query:i});Qt.render(l,n);return}let s=this.renderer,o=this._activeIndex,a=new Array(r.length);for(let l=0,c=r.length;l=r.length)n.scrollTop=0;else{let l=n.children[o];ni.scrollIntoViewIfNeeded(n,l)}}_evtClick(e){if(e.button!==0)return;if(e.target.classList.contains("lm-close-icon")){this.inputNode.value="",this.refresh();return}let i=je.findFirstIndex(this.contentNode.children,n=>n.contains(e.target));i!==-1&&(e.preventDefault(),e.stopPropagation(),this._execute(i))}_evtKeyDown(e){if(!(e.altKey||e.ctrlKey||e.metaKey||e.shiftKey))switch(e.keyCode){case 13:e.preventDefault(),e.stopPropagation(),this._execute(this._activeIndex);break;case 38:e.preventDefault(),e.stopPropagation(),this._activatePreviousItem();break;case 40:e.preventDefault(),e.stopPropagation(),this._activateNextItem();break}}_activateNextItem(){if(!this._results||this._results.length===0)return;let e=this._activeIndex,i=this._results.length,n=eC-x),b=S.slice(0,A),M=S.slice(A);for(let C=0,x=M.length;Cp.command===h&&gl.JSONExt.deepEqual(p.args,m))||null}}})(fl||(fl={}));ha=class extends fe{constructor(e){super({node:xr.createNode()}),this._childIndex=-1,this._activeIndex=-1,this._openTimerID=0,this._closeTimerID=0,this._items=[],this._childMenu=null,this._parentMenu=null,this._aboutToClose=new Te(this),this._menuRequested=new Te(this),this.addClass("lm-Menu"),this.setFlag(fe.Flag.DisallowLayout),this.commands=e.commands,this.renderer=e.renderer||ha.defaultRenderer}dispose(){this.close(),this._items.length=0,super.dispose()}get aboutToClose(){return this._aboutToClose}get menuRequested(){return this._menuRequested}get parentMenu(){return this._parentMenu}get childMenu(){return this._childMenu}get rootMenu(){let e=this;for(;e._parentMenu;)e=e._parentMenu;return e}get leafMenu(){let e=this;for(;e._childMenu;)e=e._childMenu;return e}get contentNode(){return this.node.getElementsByClassName("lm-Menu-content")[0]}get activeItem(){return this._items[this._activeIndex]||null}set activeItem(e){this.activeIndex=e?this._items.indexOf(e):-1}get activeIndex(){return this._activeIndex}set 
activeIndex(e){(e<0||e>=this._items.length)&&(e=-1),e!==-1&&!xr.canActivate(this._items[e])&&(e=-1),this._activeIndex!==e&&(this._activeIndex=e,this._activeIndex>=0&&this.contentNode.childNodes[this._activeIndex]&&this.contentNode.childNodes[this._activeIndex].focus(),this.update())}get items(){return this._items}activateNextItem(){let e=this._items.length,i=this._activeIndex,n=i{this.activeIndex=a}})}Qt.render(o,this.contentNode)}onCloseRequest(e){this._cancelOpenTimer(),this._cancelCloseTimer(),this.activeIndex=-1;let i=this._childMenu;i&&(this._childIndex=-1,this._childMenu=null,i._parentMenu=null,i.close());let n=this._parentMenu;n&&(this._parentMenu=null,n._childIndex=-1,n._childMenu=null,n.activate()),this.isAttached&&this._aboutToClose.emit(void 0),super.onCloseRequest(e)}_evtKeyDown(e){e.preventDefault(),e.stopPropagation();let i=e.keyCode;if(i===13){this.triggerActiveItem();return}if(i===27){this.close();return}if(i===37){this._parentMenu?this.close():this._menuRequested.emit("previous");return}if(i===38){this.activatePreviousItem();return}if(i===39){let o=this.activeItem;o&&o.type==="submenu"?this.triggerActiveItem():this.rootMenu._menuRequested.emit("next");return}if(i===40){this.activateNextItem();return}let n=ks().keyForKeydownEvent(e);if(!n)return;let r=this._activeIndex+1,s=xr.findMnemonic(this._items,n,r);s.index!==-1&&!s.multiple?(this.activeIndex=s.index,this.triggerActiveItem()):s.index!==-1?this.activeIndex=s.index:s.auto!==-1&&(this.activeIndex=s.auto)}_evtMouseUp(e){e.button===0&&(e.preventDefault(),e.stopPropagation(),this.triggerActiveItem())}_evtMouseMove(e){let i=je.findFirstIndex(this.contentNode.children,r=>ni.hitTest(r,e.clientX,e.clientY));if(i===this._activeIndex)return;if(this.activeIndex=i,i=this.activeIndex,i===this._childIndex){this._cancelOpenTimer(),this._cancelCloseTimer();return}this._childIndex!==-1&&this._startCloseTimer(),this._cancelOpenTimer();let n=this.activeItem;!n||n.type!=="submenu"||!n.submenu||this._startOpenTimer()}_evtMouseEnter(e){for(let i=this._parentMenu;i;i=i._parentMenu)i._cancelOpenTimer(),i._cancelCloseTimer(),i.activeIndex=i._childIndex}_evtMouseLeave(e){if(this._cancelOpenTimer(),!this._childMenu){this.activeIndex=-1;return}let{clientX:i,clientY:n}=e;if(ni.hitTest(this._childMenu.node,i,n)){this._cancelCloseTimer();return}this.activeIndex=-1,this._startCloseTimer()}_evtMouseDown(e){this._parentMenu||(xr.hitTestMenus(this,e.clientX,e.clientY)?(e.preventDefault(),e.stopPropagation()):this.close())}_openChildMenu(e=!1){let i=this.activeItem;if(!i||i.type!=="submenu"||!i.submenu){this._closeChildMenu();return}let n=i.submenu;if(n===this._childMenu)return;ha.saveWindowData(),this._closeChildMenu(),this._childMenu=n,this._childIndex=this._activeIndex,n._parentMenu=this,ze.sendMessage(this,fe.Msg.UpdateRequest);let r=this.contentNode.children[this._activeIndex];xr.openSubmenu(n,r),e&&(n.activeIndex=-1,n.activateNextItem()),n.activate()}_closeChildMenu(){this._childMenu&&this._childMenu.close()}_startOpenTimer(){this._openTimerID===0&&(this._openTimerID=window.setTimeout(()=>{this._openTimerID=0,this._openChildMenu()},xr.TIMER_DELAY))}_startCloseTimer(){this._closeTimerID===0&&(this._closeTimerID=window.setTimeout(()=>{this._closeTimerID=0,this._closeChildMenu()},xr.TIMER_DELAY))}_cancelOpenTimer(){this._openTimerID!==0&&(clearTimeout(this._openTimerID),this._openTimerID=0)}_cancelCloseTimer(){this._closeTimerID!==0&&(clearTimeout(this._closeTimerID),this._closeTimerID=0)}static saveWindowData(){xr.saveWindowData()}};(function(t){class 
e{renderItem(n){let r=this.createItemClass(n),s=this.createItemDataset(n),o=this.createItemARIA(n);return Ce.li({className:r,dataset:s,tabindex:"0",onfocus:n.onfocus,...o},this.renderIcon(n),this.renderLabel(n),this.renderShortcut(n),this.renderSubmenu(n))}renderIcon(n){let r=this.createIconClass(n);return Ce.div({className:r},n.item.icon,n.item.iconLabel)}renderLabel(n){let r=this.formatLabel(n);return Ce.div({className:"lm-Menu-itemLabel"},r)}renderShortcut(n){let r=this.formatShortcut(n);return Ce.div({className:"lm-Menu-itemShortcut"},r)}renderSubmenu(n){return Ce.div({className:"lm-Menu-itemSubmenuIcon"})}createItemClass(n){let r="lm-Menu-item";n.item.isEnabled||(r+=" lm-mod-disabled"),n.item.isToggled&&(r+=" lm-mod-toggled"),n.item.isVisible||(r+=" lm-mod-hidden"),n.active&&(r+=" lm-mod-active"),n.collapsed&&(r+=" lm-mod-collapsed");let s=n.item.className;return s&&(r+=` ${s}`),r}createItemDataset(n){let r,{type:s,command:o,dataset:a}=n.item;return s==="command"?r={...a,type:s,command:o}:r={...a,type:s},r}createIconClass(n){let r="lm-Menu-itemIcon",s=n.item.iconClass;return s?`${r} ${s}`:r}createItemARIA(n){let r={};switch(n.item.type){case"separator":r.role="presentation";break;case"submenu":r["aria-haspopup"]="true",n.item.isEnabled||(r["aria-disabled"]="true");break;default:n.item.isEnabled||(r["aria-disabled"]="true"),r.role="menuitem"}return r}formatLabel(n){let{label:r,mnemonic:s}=n.item;if(s<0||s>=r.length)return r;let o=r.slice(0,s),a=r.slice(s+1),l=r[s],c=Ce.span({className:"lm-Menu-itemMnemonic"},l);return[o,c,a]}formatShortcut(n){let r=n.item.keyBinding;return r?er.formatKeystroke(r.keys):null}}t.Renderer=e,t.defaultRenderer=new e})(ha||(ha={}));(function(t){t.TIMER_DELAY=300,t.SUBMENU_OVERLAP=3;let e=null,i=0;function n(){return i>0?(i--,e):u()}function r(){e=u(),i++}t.saveWindowData=r;function s(){let p=document.createElement("div"),_=document.createElement("ul");return _.className="lm-Menu-content",p.appendChild(_),_.setAttribute("role","menu"),p.tabIndex=0,p}t.createNode=s;function o(p){return p.type!=="separator"&&p.isEnabled&&p.isVisible}t.canActivate=o;function a(p,_){return new m(p.commands,_)}t.createItem=a;function l(p,_,y){for(let S=p;S;S=S.childMenu)if(ni.hitTest(S.node,_,y))return!0;return!1}t.hitTestMenus=l;function c(p){let _=new Array(p.length);je.fill(_,!1);let y=0,S=p.length;for(;y=0;--T){let A=p[T];if(A.isVisible){if(A.type!=="separator")break;_[T]=!0}}let O=!1;for(;++yM+x&&(_=M+x-Z),!T&&y+X>C+w&&(y>C+w?y=C+w-X:y=y-X),B.transform=`translate(${Math.max(0,_)}px, ${Math.max(0,y)}px`,B.opacity="1"}t.openRootMenu=d;function f(p,_){let y=n(),S=y.pageXOffset,T=y.pageYOffset,O=y.clientWidth,A=y.clientHeight;ze.sendMessage(p,fe.Msg.UpdateRequest);let b=A,M=p.node,C=M.style;C.opacity="0",C.maxHeight=`${b}px`,fe.attach(p,document.body);let{width:x,height:w}=M.getBoundingClientRect(),E=ni.boxSizing(p.node),N=_.getBoundingClientRect(),B=N.right-t.SUBMENU_OVERLAP;B+x>S+O&&(B=N.left+t.SUBMENU_OVERLAP-x);let Z=N.top-E.borderTop-E.paddingTop;Z+w>T+A&&(Z=N.bottom+E.borderBottom+E.paddingBottom-w),C.transform=`translate(${Math.max(0,B)}px, ${Math.max(0,Z)}px`,C.opacity="1"}t.openSubmenu=f;function h(p,_,y){let S=-1,T=-1,O=!1,A=_.toUpperCase();for(let b=0,M=p.length;b=0&&ES.command===_&&gl.JSONExt.deepEqual(S.args,y))||null}return null}}})(xr||(xr={}));(function(t){function e(o,a){let l=n(o.selector),c=o.rank!==void 0?o.rank:1/0;return{...o,selector:l,rank:c,id:a}}t.createItem=e;function i(o,a,l,c){let u=a.target;if(!u)return null;let 
d=a.currentTarget;if(!d||!d.contains(u)&&(u=document.elementFromPoint(a.clientX,a.clientY),!u||!d.contains(u)))return null;let f=[],h=o.slice();for(;u!==null;){let m=[];for(let p=0,_=h.length;p<_;++p){let y=h[p];y&&Pu.matches(u,y.selector)&&(m.push(y),h[p]=null)}if(m.length!==0&&(l&&m.sort(c?s:r),f.push(...m)),u===d)break;u=u.parentElement}return l||f.sort(c?s:r),f}t.matchItems=i;function n(o){if(o.indexOf(",")!==-1)throw new Error(`Selector cannot contain commas: ${o}`);if(!Pu.isValid(o))throw new Error(`Invalid selector: ${o}`);return o}function r(o,a){let l=o.rank,c=a.rank;return l!==c?l=this._titles.length)&&(e=-1),this._currentIndex===e)return;let i=this._currentIndex,n=this._titles[i]||null,r=e,s=this._titles[r]||null;this._currentIndex=r,this._previousTitle=n,this.update(),this._currentChanged.emit({previousIndex:i,previousTitle:n,currentIndex:r,currentTitle:s})}get name(){return this._name}set name(e){this._name=e,e?this.contentNode.setAttribute("aria-label",e):this.contentNode.removeAttribute("aria-label")}get orientation(){return this._orientation}set orientation(e){this._orientation!==e&&(this._releaseMouse(),this._orientation=e,this.dataset.orientation=e,this.contentNode.setAttribute("aria-orientation",e))}get addButtonEnabled(){return this._addButtonEnabled}set addButtonEnabled(e){this._addButtonEnabled!==e&&(this._addButtonEnabled=e,e?this.addButtonNode.classList.remove("lm-mod-hidden"):this.addButtonNode.classList.add("lm-mod-hidden"))}get titles(){return this._titles}get contentNode(){return this.node.getElementsByClassName("lm-TabBar-content")[0]}get addButtonNode(){return this.node.getElementsByClassName("lm-TabBar-addButton")[0]}addTab(e){return this.insertTab(this._titles.length,e)}insertTab(e,i){this._releaseMouse();let n=Vs.asTitle(i),r=this._titles.indexOf(n),s=Math.max(0,Math.min(e,this._titles.length));return r===-1?(je.insert(this._titles,s,n),n.changed.connect(this._onTitleChanged,this),this.update(),this._adjustCurrentForInsert(s,n),n):(s===this._titles.length&&s--,r===s||(je.move(this._titles,r,s),this.update(),this._adjustCurrentForMove(r,s)),n)}removeTab(e){this.removeTabAt(this._titles.indexOf(e))}removeTabAt(e){this._releaseMouse();let i=je.removeAt(this._titles,e);i&&(i.changed.disconnect(this._onTitleChanged,this),i===this._previousTitle&&(this._previousTitle=null),this.update(),this._adjustCurrentForRemove(e,i))}clearTabs(){if(this._titles.length===0)return;this._releaseMouse();for(let n of this._titles)n.changed.disconnect(this._onTitleChanged,this);let e=this.currentIndex,i=this.currentTitle;this._currentIndex=-1,this._previousTitle=null,this._titles.length=0,this.update(),e!==-1&&this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}releaseMouse(){this._releaseMouse()}handleEvent(e){switch(e.type){case"pointerdown":this._evtPointerDown(e);break;case"pointermove":this._evtPointerMove(e);break;case"pointerup":this._evtPointerUp(e);break;case"dblclick":this._evtDblClick(e);break;case"keydown":e.eventPhase===Event.CAPTURING_PHASE?this._evtKeyDownCapturing(e):this._evtKeyDown(e);break;case"contextmenu":e.preventDefault(),e.stopPropagation();break}}onBeforeAttach(e){this.node.addEventListener("pointerdown",this),this.node.addEventListener("dblclick",this),this.node.addEventListener("keydown",this)}onAfterDetach(e){this.node.removeEventListener("pointerdown",this),this.node.removeEventListener("dblclick",this),this.node.removeEventListener("keydown",this),this._releaseMouse()}onUpdateRequest(e){var i;let 
n=this._titles,r=this.renderer,s=this.currentTitle,o=new Array(n.length),a=(i=this._getCurrentTabindex())!==null&&i!==void 0?i:this._currentIndex>-1?this._currentIndex:0;for(let l=0,c=n.length;lni.hitTest(o,e.clientX,e.clientY));if(n===-1)return;let r=this.titles[n],s=i[n].querySelector(".lm-TabBar-tabLabel");if(s&&s.contains(e.target)){let o=r.label||"",a=s.innerHTML;s.innerHTML="";let l=document.createElement("input");l.classList.add("lm-TabBar-tabInput"),l.value=o,s.appendChild(l);let c=()=>{l.removeEventListener("blur",c),s.innerHTML=a,this.node.addEventListener("keydown",this)};l.addEventListener("dblclick",u=>u.stopPropagation()),l.addEventListener("blur",c),l.addEventListener("keydown",u=>{u.key==="Enter"?(l.value!==""&&(r.label=r.caption=l.value),c()):u.key==="Escape"&&c()}),this.node.removeEventListener("keydown",this),l.select(),l.focus(),s.children.length>0&&s.children[0].focus()}}_evtKeyDownCapturing(e){e.eventPhase===Event.CAPTURING_PHASE&&(e.preventDefault(),e.stopPropagation(),e.key==="Escape"&&this._releaseMouse())}_evtKeyDown(e){var i,n,r;if(!(e.key==="Tab"||e.eventPhase===Event.CAPTURING_PHASE)){if(e.key==="Enter"||e.key==="Spacebar"||e.key===" "){let s=document.activeElement;if(this.addButtonEnabled&&this.addButtonNode.contains(s))e.preventDefault(),e.stopPropagation(),this._addRequested.emit();else{let o=je.findFirstIndex(this.contentNode.children,a=>a.contains(s));o>=0&&(e.preventDefault(),e.stopPropagation(),this.currentIndex=o)}}else if(hG.includes(e.key)){let s=[...this.contentNode.children];if(this.addButtonEnabled&&s.push(this.addButtonNode),s.length<=1)return;e.preventDefault(),e.stopPropagation();let o=s.indexOf(document.activeElement);o===-1&&(o=this._currentIndex);let a;e.key==="ArrowRight"&&this._orientation==="horizontal"||e.key==="ArrowDown"&&this._orientation==="vertical"?a=(i=s[o+1])!==null&&i!==void 0?i:s[0]:e.key==="ArrowLeft"&&this._orientation==="horizontal"||e.key==="ArrowUp"&&this._orientation==="vertical"?a=(n=s[o-1])!==null&&n!==void 0?n:s[s.length-1]:e.key==="Home"?a=s[0]:e.key==="End"&&(a=s[s.length-1]),a&&((r=s[o])===null||r===void 0||r.setAttribute("tabindex","-1"),a?.setAttribute("tabindex","0"),a.focus())}}}_evtPointerDown(e){if(e.button!==0&&e.button!==1||this._dragData||e.target.classList.contains("lm-TabBar-tabInput"))return;let i=this.addButtonEnabled&&this.addButtonNode.contains(e.target),n=this.contentNode.children,r=je.findFirstIndex(n,o=>ni.hitTest(o,e.clientX,e.clientY));if(r===-1&&!i||(e.preventDefault(),e.stopPropagation(),this._dragData={tab:n[r],index:r,pressX:e.clientX,pressY:e.clientY,tabPos:-1,tabSize:-1,tabPressPos:-1,targetIndex:-1,tabLayout:null,contentRect:null,override:null,dragActive:!1,dragAborted:!1,detachRequested:!1},this.document.addEventListener("pointerup",this,!0),e.button===1||i))return;let s=n[r].querySelector(this.renderer.closeIconSelector);s&&s.contains(e.target)||(this.tabsMovable&&(this.document.addEventListener("pointermove",this,!0),this.document.addEventListener("keydown",this,!0),this.document.addEventListener("contextmenu",this,!0)),this.allowDeselect&&this.currentIndex===r?this.currentIndex=-1:this.currentIndex=r,this.currentIndex!==-1&&this._tabActivateRequested.emit({index:this.currentIndex,title:this.currentTitle}))}_evtPointerMove(e){let i=this._dragData;if(!i)return;e.preventDefault(),e.stopPropagation();let n=this.contentNode.children;if(!(!i.dragActive&&!Vs.dragExceeded(i,e))){if(!i.dragActive){let 
r=i.tab.getBoundingClientRect();this._orientation==="horizontal"?(i.tabPos=i.tab.offsetLeft,i.tabSize=r.width,i.tabPressPos=i.pressX-r.left):(i.tabPos=i.tab.offsetTop,i.tabSize=r.height,i.tabPressPos=i.pressY-r.top),i.tabPressOffset={x:i.pressX-r.left,y:i.pressY-r.top},i.tabLayout=Vs.snapTabLayout(n,this._orientation),i.contentRect=this.contentNode.getBoundingClientRect(),i.override=an.overrideCursor("default"),i.tab.classList.add("lm-mod-dragging"),this.addClass("lm-mod-dragging"),i.dragActive=!0}if(!i.detachRequested&&Vs.detachExceeded(i,e)){i.detachRequested=!0;let r=i.index,s=e.clientX,o=e.clientY,a=n[r],l=this._titles[r];if(this._tabDetachRequested.emit({index:r,title:l,tab:a,clientX:s,clientY:o,offset:i.tabPressOffset}),i.dragAborted)return}Vs.layoutTabs(n,i,e,this._orientation)}}_evtPointerUp(e){if(e.button!==0&&e.button!==1)return;let i=this._dragData;if(!i)return;if(e.preventDefault(),e.stopPropagation(),this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),!i.dragActive){if(this._dragData=null,this.addButtonEnabled&&this.addButtonNode.contains(e.target)){this._addRequested.emit(void 0);return}let s=this.contentNode.children,o=je.findFirstIndex(s,c=>ni.hitTest(c,e.clientX,e.clientY));if(o!==i.index)return;let a=this._titles[o];if(!a.closable)return;if(e.button===1){this._tabCloseRequested.emit({index:o,title:a});return}let l=s[o].querySelector(this.renderer.closeIconSelector);if(l&&l.contains(e.target)){this._tabCloseRequested.emit({index:o,title:a});return}return}if(e.button!==0)return;Vs.finalizeTabPosition(i,this._orientation),i.tab.classList.remove("lm-mod-dragging");let n=Vs.parseTransitionDuration(i.tab);setTimeout(()=>{if(i.dragAborted)return;this._dragData=null,Vs.resetTabPositions(this.contentNode.children,this._orientation),i.override.dispose(),this.removeClass("lm-mod-dragging");let r=i.index,s=i.targetIndex;s===-1||r===s||(je.move(this._titles,r,s),this._adjustCurrentForMove(r,s),this._tabMoved.emit({fromIndex:r,toIndex:s,title:this._titles[s]}),ze.sendMessage(this,fe.Msg.UpdateRequest))},n)}_releaseMouse(){let e=this._dragData;e&&(this._dragData=null,this.document.removeEventListener("pointermove",this,!0),this.document.removeEventListener("pointerup",this,!0),this.document.removeEventListener("keydown",this,!0),this.document.removeEventListener("contextmenu",this,!0),e.dragAborted=!0,e.dragActive&&(Vs.resetTabPositions(this.contentNode.children,this._orientation),e.override.dispose(),e.tab.classList.remove("lm-mod-dragging"),this.removeClass("lm-mod-dragging")))}_adjustCurrentForInsert(e,i){let n=this.currentTitle,r=this._currentIndex,s=this.insertBehavior;if(s==="select-tab"||s==="select-tab-if-needed"&&r===-1){this._currentIndex=e,this._previousTitle=n,this._currentChanged.emit({previousIndex:r,previousTitle:n,currentIndex:e,currentTitle:i});return}r>=e&&this._currentIndex++}_adjustCurrentForMove(e,i){this._currentIndex===e?this._currentIndex=i:this._currentIndex=i?this._currentIndex++:this._currentIndex>e&&this._currentIndex<=i&&this._currentIndex--}_adjustCurrentForRemove(e,i){let 
n=this._currentIndex,r=this.removeBehavior;if(n!==e){n>e&&this._currentIndex--;return}if(this._titles.length===0){this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null});return}if(r==="select-tab-after"){this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-tab-before"){this._currentIndex=Math.max(0,e-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}if(r==="select-previous-tab"){this._previousTitle?(this._currentIndex=this._titles.indexOf(this._previousTitle),this._previousTitle=null):this._currentIndex=Math.min(e,this._titles.length-1),this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:this._currentIndex,currentTitle:this.currentTitle});return}this._currentIndex=-1,this._currentChanged.emit({previousIndex:e,previousTitle:i,currentIndex:-1,currentTitle:null})}_onTitleChanged(e){this.update()}};(function(t){class e{constructor(){this.closeIconSelector=".lm-TabBar-tabCloseIcon",this._tabID=0,this._tabKeys=new WeakMap,this._uuid=++e._nInstance}renderTab(n){let r=n.title.caption,s=this.createTabKey(n),o=s,a=this.createTabStyle(n),l=this.createTabClass(n),c=this.createTabDataset(n),u=this.createTabARIA(n);return n.title.closable?Ce.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n),this.renderCloseIcon(n)):Ce.li({id:o,key:s,className:l,title:r,style:a,dataset:c,...u},this.renderIcon(n),this.renderLabel(n))}renderIcon(n){let{title:r}=n,s=this.createIconClass(n);return Ce.div({className:s},r.icon,r.iconLabel)}renderLabel(n){return Ce.div({className:"lm-TabBar-tabLabel"},n.title.label)}renderCloseIcon(n){return Ce.div({className:"lm-TabBar-tabCloseIcon"})}createTabKey(n){let r=this._tabKeys.get(n.title);return r===void 0&&(r=`tab-key-${this._uuid}-${this._tabID++}`,this._tabKeys.set(n.title,r)),r}createTabStyle(n){return{zIndex:`${n.zIndex}`}}createTabClass(n){let r="lm-TabBar-tab";return n.title.className&&(r+=` ${n.title.className}`),n.title.closable&&(r+=" lm-mod-closable"),n.current&&(r+=" lm-mod-current"),r}createTabDataset(n){return n.title.dataset}createTabARIA(n){var r;return{role:"tab","aria-selected":n.current.toString(),tabindex:`${(r=n.tabIndex)!==null&&r!==void 0?r:"-1"}`}}createIconClass(n){let r="lm-TabBar-tabIcon",s=n.title.iconClass;return s?`${r} ${s}`:r}}e._nInstance=0,t.Renderer=e,t.defaultRenderer=new e,t.addButtonSelector=".lm-TabBar-addButton"})(wc||(wc={}));(function(t){t.DRAG_THRESHOLD=5,t.DETACH_THRESHOLD=20;function e(){let u=document.createElement("div"),d=document.createElement("ul");d.setAttribute("role","tablist"),d.className="lm-TabBar-content",u.appendChild(d);let f=document.createElement("div");return f.className="lm-TabBar-addButton lm-mod-hidden",f.setAttribute("tabindex","-1"),f.setAttribute("role","button"),u.appendChild(f),u}t.createNode=e;function i(u){return u instanceof ug?u:new ug(u)}t.asTitle=i;function n(u){let d=window.getComputedStyle(u);return 1e3*(parseFloat(d.transitionDuration)||0)}t.parseTransitionDuration=n;function r(u,d){let f=new Array(u.length);for(let h=0,m=u.length;h=t.DRAG_THRESHOLD||h>=t.DRAG_THRESHOLD}t.dragExceeded=s;function o(u,d){let f=u.contentRect;return d.clientX=f.right+t.DETACH_THRESHOLD||d.clientY=f.bottom+t.DETACH_THRESHOLD}t.detachExceeded=o;function a(u,d,f,h){let 
m,p,_,y;h==="horizontal"?(m=d.pressX,p=f.clientX-d.contentRect.left,_=f.clientX,y=d.contentRect.width):(m=d.pressY,p=f.clientY-d.contentRect.top,_=f.clientY,y=d.contentRect.height);let S=d.index,T=p-d.tabPressPos,O=T+d.tabSize;for(let A=0,b=u.length;A>1);if(Ad.index&&O>x)M=`${-d.tabSize-C.margin}px`,S=Math.max(S,A);else if(A===d.index){let w=_-m,E=y-(d.tabPos+d.tabSize);M=`${Math.max(-d.tabPos,Math.min(w,E))}px`}else M="";h==="horizontal"?u[A].style.left=M:u[A].style.top=M}d.targetIndex=S}t.layoutTabs=a;function l(u,d){let f;d==="horizontal"?f=u.contentRect.width:f=u.contentRect.height;let h;if(u.targetIndex===u.index)h=0;else if(u.targetIndex>u.index){let _=u.tabLayout[u.targetIndex];h=_.pos+_.size-u.tabSize-u.tabPos}else h=u.tabLayout[u.targetIndex].pos-u.tabPos;let m=f-(u.tabPos+u.tabSize),p=Math.max(-u.tabPos,Math.min(h,m));d==="horizontal"?u.tab.style.left=`${p}px`:u.tab.style.top=`${p}px`}t.finalizeTabPosition=l;function c(u,d){for(let f of u)d==="horizontal"?f.style.left="":f.style.top=""}t.resetTabPositions=c})(Vs||(Vs={}));HC=class extends fa{constructor(e){super(),this._spacing=4,this._dirty=!1,this._root=null,this._box=null,this._items=new Map,this.renderer=e.renderer,e.spacing!==void 0&&(this._spacing=hg.clampDimension(e.spacing)),this._document=e.document||document,this._hiddenMode=e.hiddenMode!==void 0?e.hiddenMode:fe.HiddenMode.Display}dispose(){let e=this[Symbol.iterator]();this._items.forEach(i=>{i.dispose()}),this._box=null,this._root=null,this._items.clear();for(let i of e)i.dispose();super.dispose()}get hiddenMode(){return this._hiddenMode}set hiddenMode(e){if(this._hiddenMode!==e){this._hiddenMode=e;for(let i of this.tabBars())if(i.titles.length>1)for(let n of i.titles)n.owner.hiddenMode=this._hiddenMode}}get spacing(){return this._spacing}set spacing(e){e=hg.clampDimension(e),this._spacing!==e&&(this._spacing=e,this.parent&&this.parent.fit())}get isEmpty(){return this._root===null}[Symbol.iterator](){return this._root?this._root.iterAllWidgets():zf()}widgets(){return this._root?this._root.iterUserWidgets():zf()}selectedWidgets(){return this._root?this._root.iterSelectedWidgets():zf()}tabBars(){return this._root?this._root.iterTabBars():zf()}handles(){return this._root?this._root.iterHandles():zf()}moveHandle(e,i,n){let r=e.classList.contains("lm-mod-hidden");if(!this._root||r)return;let s=this._root.findSplitNode(e);if(!s)return;let o;s.node.orientation==="horizontal"?o=i-e.offsetLeft:o=n-e.offsetTop,o!==0&&(s.node.holdSizes(),cs.adjust(s.node.sizers,s.index,o),this.parent&&this.parent.update())}saveLayout(){return this._root?(this._root.holdAllSizes(),{main:this._root.createConfig()}):{main:null}}restoreLayout(e){let i=new Set,n;e.main?n=nn.normalizeAreaConfig(e.main,i):n=null;let r=this.widgets(),s=this.tabBars(),o=this.handles();this._root=null;for(let a of r)i.has(a)||(a.parent=null);for(let a of s)a.dispose();for(let a of o)a.parentNode&&a.parentNode.removeChild(a);for(let a of i)a.parent=this.parent;n?this._root=nn.realizeAreaConfig(n,{createTabBar:a=>this._createTabBar(),createHandle:()=>this._createHandle()},this._document):this._root=null,this.parent&&(i.forEach(a=>{this.attachWidget(a)}),this.parent.fit())}addWidget(e,i={}){let n=i.ref||null,r=i.mode||"tab-after",s=null;if(this._root&&n&&(s=this._root.findTabNode(n)),n&&!s)throw new Error("Reference widget is not in the 
layout.");switch(e.parent=this.parent,r){case"tab-after":this._insertTab(e,n,s,!0);break;case"tab-before":this._insertTab(e,n,s,!1);break;case"split-top":this._insertSplit(e,n,s,"vertical",!1);break;case"split-left":this._insertSplit(e,n,s,"horizontal",!1);break;case"split-right":this._insertSplit(e,n,s,"horizontal",!0);break;case"split-bottom":this._insertSplit(e,n,s,"vertical",!0);break;case"merge-top":this._insertSplit(e,n,s,"vertical",!1,!0);break;case"merge-left":this._insertSplit(e,n,s,"horizontal",!1,!0);break;case"merge-right":this._insertSplit(e,n,s,"horizontal",!0,!0);break;case"merge-bottom":this._insertSplit(e,n,s,"vertical",!0,!0);break}this.parent&&(this.attachWidget(e),this.parent.fit())}removeWidget(e){this._removeWidget(e),this.parent&&(this.detachWidget(e),this.parent.fit())}hitTestTabAreas(e,i){if(!this._root||!this.parent||!this.parent.isVisible)return null;this._box||(this._box=ni.boxSizing(this.parent.node));let n=this.parent.node.getBoundingClientRect(),r=e-n.left-this._box.borderLeft,s=i-n.top-this._box.borderTop,o=this._root.hitTestTabNodes(r,s);if(!o)return null;let{tabBar:a,top:l,left:c,width:u,height:d}=o,f=this._box.borderLeft+this._box.borderRight,h=this._box.borderTop+this._box.borderBottom,m=n.width-f-(c+u),p=n.height-h-(l+d);return{tabBar:a,x:r,y:s,top:l,left:c,right:m,bottom:p,width:u,height:d}}init(){super.init();for(let e of this)this.attachWidget(e);for(let e of this.handles())this.parent.node.appendChild(e);this.parent.fit()}attachWidget(e){this.parent.node!==e.node.parentNode&&(this._items.set(e,new Hu(e)),this.parent.isAttached&&ze.sendMessage(e,fe.Msg.BeforeAttach),this.parent.node.appendChild(e.node),this.parent.isAttached&&ze.sendMessage(e,fe.Msg.AfterAttach))}detachWidget(e){if(this.parent.node!==e.node.parentNode)return;this.parent.isAttached&&ze.sendMessage(e,fe.Msg.BeforeDetach),this.parent.node.removeChild(e.node),this.parent.isAttached&&ze.sendMessage(e,fe.Msg.AfterDetach);let i=this._items.get(e);i&&(this._items.delete(e),i.dispose())}onBeforeShow(e){super.onBeforeShow(e),this.parent.update()}onBeforeAttach(e){super.onBeforeAttach(e),this.parent.fit()}onChildShown(e){this.parent.fit()}onChildHidden(e){this.parent.fit()}onResize(e){this.parent.isVisible&&this._update(e.width,e.height)}onUpdateRequest(e){this.parent.isVisible&&this._update(-1,-1)}onFitRequest(e){this.parent.isAttached&&this._fit()}_removeWidget(e){if(!this._root)return;let i=this._root.findTabNode(e);if(!i)return;if(nn.removeAria(e),i.tabBar.titles.length>1){if(i.tabBar.removeTab(e.title),this._hiddenMode===fe.HiddenMode.Scale&&i.tabBar.titles.length==1){let f=i.tabBar.titles[0].owner;f.hiddenMode=fe.HiddenMode.Display}return}if(i.tabBar.dispose(),this._root===i){this._root=null;return}this._root.holdAllSizes();let n=i.parent;i.parent=null;let r=je.removeFirstOf(n.children,i),s=je.removeAt(n.handles,r);if(je.removeAt(n.sizers,r),s.parentNode&&s.parentNode.removeChild(s),n.children.length>1){n.syncHandles();return}let o=n.parent;n.parent=null;let a=n.children[0],l=n.handles[0];if(n.children.length=0,n.handles.length=0,n.sizers.length=0,l.parentNode&&l.parentNode.removeChild(l),this._root===n){a.parent=null,this._root=a;return}let c=o,u=c.children.indexOf(n);if(a instanceof nn.TabLayoutNode){a.parent=c,c.children[u]=a;return}let d=je.removeAt(c.handles,u);je.removeAt(c.children,u),je.removeAt(c.sizers,u),d.parentNode&&d.parentNode.removeChild(d);for(let f=0,h=a.children.length;f=this._left+this._width||m=this._top+this._height?null:this}createConfig(){let 
h=this.tabBar.titles.map(p=>p.owner),m=this.tabBar.currentIndex;return{type:"tab-area",widgets:h,currentIndex:m}}holdAllSizes(){}fit(h,m){let p=0,_=0,y=1/0,S=1/0,T=m.get(this.tabBar),O=this.tabBar.currentTitle,A=O?m.get(O.owner):void 0,[b,M]=this.sizers;return T&&T.fit(),A&&A.fit(),T&&!T.isHidden?(p=Math.max(p,T.minWidth),_+=T.minHeight,b.minSize=T.minHeight,b.maxSize=T.maxHeight):(b.minSize=0,b.maxSize=0),A&&!A.isHidden?(p=Math.max(p,A.minWidth),_+=A.minHeight,M.minSize=A.minHeight,M.maxSize=1/0):(M.minSize=0,M.maxSize=1/0),{minWidth:p,minHeight:_,maxWidth:y,maxHeight:S}}update(h,m,p,_,y,S){this._top=m,this._left=h,this._width=p,this._height=_;let T=S.get(this.tabBar),O=this.tabBar.currentTitle,A=O?S.get(O.owner):void 0;if(cs.calc(this.sizers,_),T&&!T.isHidden){let b=this.sizers[0].size;T.update(h,m,p,b),m+=b}if(A&&!A.isHidden){let b=this.sizers[1].size;A.update(h,m,p,b)}}}t.TabLayoutNode=r;class s{constructor(h){this.parent=null,this.normalized=!1,this.children=[],this.sizers=[],this.handles=[],this.orientation=h}*iterAllWidgets(){for(let h of this.children)yield*h.iterAllWidgets()}*iterUserWidgets(){for(let h of this.children)yield*h.iterUserWidgets()}*iterSelectedWidgets(){for(let h of this.children)yield*h.iterSelectedWidgets()}*iterTabBars(){for(let h of this.children)yield*h.iterTabBars()}*iterHandles(){yield*this.handles;for(let h of this.children)yield*h.iterHandles()}findTabNode(h){for(let m=0,p=this.children.length;m_.createConfig());return{type:"split-area",orientation:h,children:p,sizes:m}}syncHandles(){this.handles.forEach((h,m)=>{h.setAttribute("data-orientation",this.orientation),m===this.handles.length-1?h.classList.add("lm-mod-hidden"):h.classList.remove("lm-mod-hidden")})}holdSizes(){for(let h of this.sizers)h.sizeHint=h.size}holdAllSizes(){for(let h of this.children)h.holdAllSizes();this.holdSizes()}normalizeSizes(){let h=this.sizers.length;if(h===0)return;this.holdSizes();let m=this.sizers.reduce((p,_)=>p+_.sizeHint,0);if(m===0)for(let p of this.sizers)p.size=p.sizeHint=1/h;else for(let p of this.sizers)p.size=p.sizeHint/=m;this.normalized=!0}createNormalizedSizes(){let h=this.sizers.length;if(h===0)return[];let m=this.sizers.map(_=>_.size),p=m.reduce((_,y)=>_+y,0);if(p===0)for(let _=m.length-1;_>-1;_--)m[_]=1/h;else for(let _=m.length-1;_>-1;_--)m[_]/=p;return m}fit(h,m){let p=this.orientation==="horizontal",_=Math.max(0,this.children.length-1)*h,y=p?_:0,S=p?0:_,T=1/0,O=1/0;for(let A=0,b=this.children.length;A=m.length)&&(p=0),{type:"tab-area",widgets:m,currentIndex:p}}function c(f,h){let m=f.orientation,p=[],_=[];for(let y=0,S=f.children.length;y{let S=n(_,h,m),T=e(f.sizes[y]),O=h.createHandle();p.children.push(S),p.handles.push(O),p.sizers.push(T),S.parent=p}),p.syncHandles(),p.normalizeSizes(),p}})(nn||(nn={}));Wu=class extends fe{constructor(e={}){super(),this._drag=null,this._tabsMovable=!0,this._tabsConstrained=!1,this._addButtonEnabled=!1,this._pressData=null,this._layoutModified=new Te(this),this._addRequested=new Te(this),this.addClass("lm-DockPanel"),this._document=e.document||document,this._mode=e.mode||"multiple-document",this._renderer=e.renderer||Wu.defaultRenderer,this._edges=e.edges||Vi.DEFAULT_EDGES,e.tabsMovable!==void 0&&(this._tabsMovable=e.tabsMovable),e.tabsConstrained!==void 0&&(this._tabsConstrained=e.tabsConstrained),e.addButtonEnabled!==void 0&&(this._addButtonEnabled=e.addButtonEnabled),this.dataset.mode=this._mode;let i={createTabBar:()=>this._createTabBar(),createHandle:()=>this._createHandle()};this.layout=new 
HC({document:this._document,renderer:i,spacing:e.spacing,hiddenMode:e.hiddenMode}),this.overlay=e.overlay||new Wu.Overlay,this.node.appendChild(this.overlay.node)}dispose(){this._releaseMouse(),this.overlay.hide(0),this._drag&&this._drag.dispose(),super.dispose()}get hiddenMode(){return this.layout.hiddenMode}set hiddenMode(e){this.layout.hiddenMode=e}get layoutModified(){return this._layoutModified}get addRequested(){return this._addRequested}get renderer(){return this.layout.renderer}get spacing(){return this.layout.spacing}set spacing(e){this.layout.spacing=e}get mode(){return this._mode}set mode(e){if(this._mode===e)return;this._mode=e,this.dataset.mode=e;let i=this.layout;switch(e){case"multiple-document":for(let n of i.tabBars())n.show();break;case"single-document":i.restoreLayout(Vi.createSingleDocumentConfig(this));break;default:throw"unreachable"}ze.postMessage(this,Vi.LayoutModified)}get tabsMovable(){return this._tabsMovable}set tabsMovable(e){this._tabsMovable=e;for(let i of this.tabBars())i.tabsMovable=e}get tabsConstrained(){return this._tabsConstrained}set tabsConstrained(e){this._tabsConstrained=e}get addButtonEnabled(){return this._addButtonEnabled}set addButtonEnabled(e){this._addButtonEnabled=e;for(let i of this.tabBars())i.addButtonEnabled=e}get isEmpty(){return this.layout.isEmpty}*widgets(){yield*this.layout.widgets()}*selectedWidgets(){yield*this.layout.selectedWidgets()}*tabBars(){yield*this.layout.tabBars()}*handles(){yield*this.layout.handles()}selectWidget(e){let i=LC(this.tabBars(),n=>n.titles.indexOf(e.title)!==-1);if(!i)throw new Error("Widget is not contained in the dock panel.");i.currentTitle=e.title}activateWidget(e){this.selectWidget(e),e.activate()}saveLayout(){return this.layout.saveLayout()}restoreLayout(e){this._mode="multiple-document",this.layout.restoreLayout(e),(zu.IS_EDGE||zu.IS_IE)&&ze.flush(),ze.postMessage(this,Vi.LayoutModified)}addWidget(e,i={}){this._mode==="single-document"?this.layout.addWidget(e):this.layout.addWidget(e,i),ze.postMessage(this,Vi.LayoutModified)}processMessage(e){e.type==="layout-modified"?this._layoutModified.emit(void 
v!=null&&v===v.window},_=t.document,y={type:!0,src:!0,nonce:!0,noModule:!0};function S(g,v,I){I=I||_;var R,L,D=I.createElement("script");if(D.text=g,v)for(R in y)L=v[R]||v.getAttribute&&v.getAttribute(R),L&&D.setAttribute(R,L);I.head.appendChild(D).parentNode.removeChild(D)}function T(g){return g==null?g+"":typeof g=="object"||typeof g=="function"?l[c.call(g)]||"object":typeof g}var O="3.7.1",A=/HTML$/i,b=function(g,v){return new b.fn.init(g,v)};b.fn=b.prototype={jquery:O,constructor:b,length:0,toArray:function(){return r.call(this)},get:function(g){return g==null?r.call(this):g<0?this[g+this.length]:this[g]},pushStack:function(g){var v=b.merge(this.constructor(),g);return v.prevObject=this,v},each:function(g){return b.each(this,g)},map:function(g){return this.pushStack(b.map(this,function(v,I){return g.call(v,I,v)}))},slice:function(){return this.pushStack(r.apply(this,arguments))},first:function(){return this.eq(0)},last:function(){return this.eq(-1)},even:function(){return this.pushStack(b.grep(this,function(g,v){return(v+1)%2}))},odd:function(){return this.pushStack(b.grep(this,function(g,v){return v%2}))},eq:function(g){var v=this.length,I=+g+(g<0?v:0);return this.pushStack(I>=0&&I0&&v-1 in g}function C(g,v){return g.nodeName&&g.nodeName.toLowerCase()===v.toLowerCase()}var x=i.pop,w=i.sort,E=i.splice,N="[\\x20\\t\\r\\n\\f]",B=new RegExp("^"+N+"+|((?:^|[^\\\\])(?:\\\\.)*)"+N+"+$","g");b.contains=function(g,v){var I=v&&v.parentNode;return g===I||!!(I&&I.nodeType===1&&(g.contains?g.contains(I):g.compareDocumentPosition&&g.compareDocumentPosition(I)&16))};var Z=/([\0-\x1f\x7f]|^-?\d)|^-$|[^\x80-\uFFFF\w-]/g;function X(g,v){return v?g==="\0"?"\uFFFD":g.slice(0,-1)+"\\"+g.charCodeAt(g.length-1).toString(16)+" ":"\\"+g}b.escapeSelector=function(g){return(g+"").replace(Z,X)};var K=_,V=o;(function(){var g,v,I,R,L,D=V,z,G,U,te,de,be=b.expando,le=0,ke=0,ft=bv(),Nt=bv(),St=bv(),Hn=bv(),pn=function(W,J){return W===J&&(L=!0),0},Fo="checked|selected|async|autofocus|autoplay|controls|defer|disabled|hidden|ismap|loop|multiple|open|readonly|required|scoped",Wo="(?:\\\\[\\da-fA-F]{1,6}"+N+"?|\\\\[^\\r\\n\\f]|[\\w-]|[^\0-\\x7f])+",kt="\\["+N+"*("+Wo+")(?:"+N+"*([*^$|!~]?=)"+N+`*(?:'((?:\\\\.|[^\\\\'])*)'|"((?:\\\\.|[^\\\\"])*)"|(`+Wo+"))|)"+N+"*\\]",su=":("+Wo+`)(?:\\((('((?:\\\\.|[^\\\\'])*)'|"((?:\\\\.|[^\\\\"])*)")|((?:\\\\.|[^\\\\()[\\]]|`+kt+")*)|.*)\\)|)",Pt=new RegExp(N+"+","g"),Hi=new RegExp("^"+N+"*,"+N+"*"),Ep=new RegExp("^"+N+"*([>+~]|"+N+")"+N+"*"),c1=new RegExp(N+"|>"),$o=new RegExp(su),Mp=new RegExp("^"+Wo+"$"),qo={ID:new RegExp("^#("+Wo+")"),CLASS:new RegExp("^\\.("+Wo+")"),TAG:new RegExp("^("+Wo+"|[*])"),ATTR:new RegExp("^"+kt),PSEUDO:new RegExp("^"+su),CHILD:new RegExp("^:(only|first|last|nth|nth-last)-(child|of-type)(?:\\("+N+"*(even|odd|(([+-]|)(\\d*)n|)"+N+"*(?:([+-]|)"+N+"*(\\d+)|))"+N+"*\\)|)","i"),bool:new RegExp("^(?:"+Fo+")$","i"),needsContext:new RegExp("^"+N+"*[>+~]|:(even|odd|eq|gt|lt|nth|first|last)(?:\\("+N+"*((?:-\\d)?\\d*)"+N+"*\\)|)(?=[^-]|$)","i")},Gl=/^(?:input|select|textarea|button)$/i,Yl=/^h\d$/i,Ms=/^(?:#([\w-]+)|(\w+)|\.([\w-]+))$/,u1=/[+~]/,Oa=new RegExp("\\\\[\\da-fA-F]{1,6}"+N+"?|\\\\([^\\r\\n\\f])","g"),za=function(W,J){var re="0x"+W.slice(1)-65536;return J||(re<0?String.fromCharCode(re+65536):String.fromCharCode(re>>10|55296,re&1023|56320))},B8=function(){Kl()},H8=yv(function(W){return W.disabled===!0&&C(W,"fieldset")},{dir:"parentNode",next:"legend"});function j8(){try{return 
z.activeElement}catch{}}try{D.apply(i=r.call(K.childNodes),K.childNodes),i[K.childNodes.length].nodeType}catch{D={apply:function(J,re){V.apply(J,r.call(re))},call:function(J){V.apply(J,r.call(arguments,1))}}}function Vt(W,J,re,ue){var ve,We,Ye,et,Ke,Mt,dt,gt=J&&J.ownerDocument,It=J?J.nodeType:9;if(re=re||[],typeof W!="string"||!W||It!==1&&It!==9&&It!==11)return re;if(!ue&&(Kl(J),J=J||z,U)){if(It!==11&&(Ke=Ms.exec(W)))if(ve=Ke[1]){if(It===9)if(Ye=J.getElementById(ve)){if(Ye.id===ve)return D.call(re,Ye),re}else return re;else if(gt&&(Ye=gt.getElementById(ve))&&Vt.contains(J,Ye)&&Ye.id===ve)return D.call(re,Ye),re}else{if(Ke[2])return D.apply(re,J.getElementsByTagName(W)),re;if((ve=Ke[3])&&J.getElementsByClassName)return D.apply(re,J.getElementsByClassName(ve)),re}if(!Hn[W+" "]&&(!te||!te.test(W))){if(dt=W,gt=J,It===1&&(c1.test(W)||Ep.test(W))){for(gt=u1.test(W)&&d1(J.parentNode)||J,(gt!=J||!h.scope)&&((et=J.getAttribute("id"))?et=b.escapeSelector(et):J.setAttribute("id",et=be)),Mt=Ip(W),We=Mt.length;We--;)Mt[We]=(et?"#"+et:":scope")+" "+xv(Mt[We]);dt=Mt.join(",")}try{return D.apply(re,gt.querySelectorAll(dt)),re}catch{Hn(W,!0)}finally{et===be&&J.removeAttribute("id")}}}return tA(W.replace(B,"$1"),J,re,ue)}function bv(){var W=[];function J(re,ue){return W.push(re+" ")>v.cacheLength&&delete J[W.shift()],J[re+" "]=ue}return J}function ho(W){return W[be]=!0,W}function oh(W){var J=z.createElement("fieldset");try{return!!W(J)}catch{return!1}finally{J.parentNode&&J.parentNode.removeChild(J),J=null}}function F8(W){return function(J){return C(J,"input")&&J.type===W}}function W8(W){return function(J){return(C(J,"input")||C(J,"button"))&&J.type===W}}function Qk(W){return function(J){return"form"in J?J.parentNode&&J.disabled===!1?"label"in J?"label"in J.parentNode?J.parentNode.disabled===W:J.disabled===W:J.isDisabled===W||J.isDisabled!==!W&&H8(J)===W:J.disabled===W:"label"in J?J.disabled===W:!1}}function ou(W){return ho(function(J){return J=+J,ho(function(re,ue){for(var ve,We=W([],re.length,J),Ye=We.length;Ye--;)re[ve=We[Ye]]&&(re[ve]=!(ue[ve]=re[ve]))})})}function d1(W){return W&&typeof W.getElementsByTagName<"u"&&W}function Kl(W){var J,re=W?W.ownerDocument||W:K;return re==z||re.nodeType!==9||!re.documentElement||(z=re,G=z.documentElement,U=!b.isXMLDoc(z),de=G.matches||G.webkitMatchesSelector||G.msMatchesSelector,G.msMatchesSelector&&K!=z&&(J=z.defaultView)&&J.top!==J&&J.addEventListener("unload",B8),h.getById=oh(function(ue){return G.appendChild(ue).id=b.expando,!z.getElementsByName||!z.getElementsByName(b.expando).length}),h.disconnectedMatch=oh(function(ue){return de.call(ue,"*")}),h.scope=oh(function(){return z.querySelectorAll(":scope")}),h.cssHas=oh(function(){try{return z.querySelector(":has(*,:jqfake)"),!1}catch{return!0}}),h.getById?(v.filter.ID=function(ue){var ve=ue.replace(Oa,za);return function(We){return We.getAttribute("id")===ve}},v.find.ID=function(ue,ve){if(typeof ve.getElementById<"u"&&U){var We=ve.getElementById(ue);return We?[We]:[]}}):(v.filter.ID=function(ue){var ve=ue.replace(Oa,za);return function(We){var Ye=typeof We.getAttributeNode<"u"&&We.getAttributeNode("id");return Ye&&Ye.value===ve}},v.find.ID=function(ue,ve){if(typeof ve.getElementById<"u"&&U){var We,Ye,et,Ke=ve.getElementById(ue);if(Ke){if(We=Ke.getAttributeNode("id"),We&&We.value===ue)return[Ke];for(et=ve.getElementsByName(ue),Ye=0;Ke=et[Ye++];)if(We=Ke.getAttributeNode("id"),We&&We.value===ue)return[Ke]}return[]}}),v.find.TAG=function(ue,ve){return typeof 
ve.getElementsByTagName<"u"?ve.getElementsByTagName(ue):ve.querySelectorAll(ue)},v.find.CLASS=function(ue,ve){if(typeof ve.getElementsByClassName<"u"&&U)return ve.getElementsByClassName(ue)},te=[],oh(function(ue){var ve;G.appendChild(ue).innerHTML="",ue.querySelectorAll("[selected]").length||te.push("\\["+N+"*(?:value|"+Fo+")"),ue.querySelectorAll("[id~="+be+"-]").length||te.push("~="),ue.querySelectorAll("a#"+be+"+*").length||te.push(".#.+[+~]"),ue.querySelectorAll(":checked").length||te.push(":checked"),ve=z.createElement("input"),ve.setAttribute("type","hidden"),ue.appendChild(ve).setAttribute("name","D"),G.appendChild(ue).disabled=!0,ue.querySelectorAll(":disabled").length!==2&&te.push(":enabled",":disabled"),ve=z.createElement("input"),ve.setAttribute("name",""),ue.appendChild(ve),ue.querySelectorAll("[name='']").length||te.push("\\["+N+"*name"+N+"*="+N+`*(?:''|"")`)}),h.cssHas||te.push(":has"),te=te.length&&new RegExp(te.join("|")),pn=function(ue,ve){if(ue===ve)return L=!0,0;var We=!ue.compareDocumentPosition-!ve.compareDocumentPosition;return We||(We=(ue.ownerDocument||ue)==(ve.ownerDocument||ve)?ue.compareDocumentPosition(ve):1,We&1||!h.sortDetached&&ve.compareDocumentPosition(ue)===We?ue===z||ue.ownerDocument==K&&Vt.contains(K,ue)?-1:ve===z||ve.ownerDocument==K&&Vt.contains(K,ve)?1:R?a.call(R,ue)-a.call(R,ve):0:We&4?-1:1)}),z}Vt.matches=function(W,J){return Vt(W,null,null,J)},Vt.matchesSelector=function(W,J){if(Kl(W),U&&!Hn[J+" "]&&(!te||!te.test(J)))try{var re=de.call(W,J);if(re||h.disconnectedMatch||W.document&&W.document.nodeType!==11)return re}catch{Hn(J,!0)}return Vt(J,z,null,[W]).length>0},Vt.contains=function(W,J){return(W.ownerDocument||W)!=z&&Kl(W),b.contains(W,J)},Vt.attr=function(W,J){(W.ownerDocument||W)!=z&&Kl(W);var re=v.attrHandle[J.toLowerCase()],ue=re&&u.call(v.attrHandle,J.toLowerCase())?re(W,J,!U):void 0;return ue!==void 0?ue:W.getAttribute(J)},Vt.error=function(W){throw new Error("Syntax error, unrecognized expression: "+W)},b.uniqueSort=function(W){var J,re=[],ue=0,ve=0;if(L=!h.sortStable,R=!h.sortStable&&r.call(W,0),w.call(W,pn),L){for(;J=W[ve++];)J===W[ve]&&(ue=re.push(ve));for(;ue--;)E.call(W,re[ue],1)}return R=null,W},b.fn.uniqueSort=function(){return this.pushStack(b.uniqueSort(r.apply(this)))},v=b.expr={cacheLength:50,createPseudo:ho,match:qo,attrHandle:{},find:{},relative:{">":{dir:"parentNode",first:!0}," ":{dir:"parentNode"},"+":{dir:"previousSibling",first:!0},"~":{dir:"previousSibling"}},preFilter:{ATTR:function(W){return W[1]=W[1].replace(Oa,za),W[3]=(W[3]||W[4]||W[5]||"").replace(Oa,za),W[2]==="~="&&(W[3]=" "+W[3]+" "),W.slice(0,4)},CHILD:function(W){return W[1]=W[1].toLowerCase(),W[1].slice(0,3)==="nth"?(W[3]||Vt.error(W[0]),W[4]=+(W[4]?W[5]+(W[6]||1):2*(W[3]==="even"||W[3]==="odd")),W[5]=+(W[7]+W[8]||W[3]==="odd")):W[3]&&Vt.error(W[0]),W},PSEUDO:function(W){var J,re=!W[6]&&W[2];return qo.CHILD.test(W[0])?null:(W[3]?W[2]=W[4]||W[5]||"":re&&$o.test(re)&&(J=Ip(re,!0))&&(J=re.indexOf(")",re.length-J)-re.length)&&(W[0]=W[0].slice(0,J),W[2]=re.slice(0,J)),W.slice(0,3))}},filter:{TAG:function(W){var J=W.replace(Oa,za).toLowerCase();return W==="*"?function(){return!0}:function(re){return C(re,J)}},CLASS:function(W){var J=ft[W+" "];return J||(J=new RegExp("(^|"+N+")"+W+"("+N+"|$)"))&&ft(W,function(re){return J.test(typeof re.className=="string"&&re.className||typeof re.getAttribute<"u"&&re.getAttribute("class")||"")})},ATTR:function(W,J,re){return function(ue){var ve=Vt.attr(ue,W);return 
ve==null?J==="!=":J?(ve+="",J==="="?ve===re:J==="!="?ve!==re:J==="^="?re&&ve.indexOf(re)===0:J==="*="?re&&ve.indexOf(re)>-1:J==="$="?re&&ve.slice(-re.length)===re:J==="~="?(" "+ve.replace(Pt," ")+" ").indexOf(re)>-1:J==="|="?ve===re||ve.slice(0,re.length+1)===re+"-":!1):!0}},CHILD:function(W,J,re,ue,ve){var We=W.slice(0,3)!=="nth",Ye=W.slice(-4)!=="last",et=J==="of-type";return ue===1&&ve===0?function(Ke){return!!Ke.parentNode}:function(Ke,Mt,dt){var gt,It,ot,ci,Lr,Zn=We!==Ye?"nextSibling":"previousSibling",Is=Ke.parentNode,Uo=et&&Ke.nodeName.toLowerCase(),ah=!dt&&!et,dr=!1;if(Is){if(We){for(;Zn;){for(ot=Ke;ot=ot[Zn];)if(et?C(ot,Uo):ot.nodeType===1)return!1;Lr=Zn=W==="only"&&!Lr&&"nextSibling"}return!0}if(Lr=[Ye?Is.firstChild:Is.lastChild],Ye&&ah){for(It=Is[be]||(Is[be]={}),gt=It[W]||[],ci=gt[0]===le&>[1],dr=ci&>[2],ot=ci&&Is.childNodes[ci];ot=++ci&&ot&&ot[Zn]||(dr=ci=0)||Lr.pop();)if(ot.nodeType===1&&++dr&&ot===Ke){It[W]=[le,ci,dr];break}}else if(ah&&(It=Ke[be]||(Ke[be]={}),gt=It[W]||[],ci=gt[0]===le&>[1],dr=ci),dr===!1)for(;(ot=++ci&&ot&&ot[Zn]||(dr=ci=0)||Lr.pop())&&!((et?C(ot,Uo):ot.nodeType===1)&&++dr&&(ah&&(It=ot[be]||(ot[be]={}),It[W]=[le,dr]),ot===Ke)););return dr-=ve,dr===ue||dr%ue===0&&dr/ue>=0}}},PSEUDO:function(W,J){var re,ue=v.pseudos[W]||v.setFilters[W.toLowerCase()]||Vt.error("unsupported pseudo: "+W);return ue[be]?ue(J):ue.length>1?(re=[W,W,"",J],v.setFilters.hasOwnProperty(W.toLowerCase())?ho(function(ve,We){for(var Ye,et=ue(ve,J),Ke=et.length;Ke--;)Ye=a.call(ve,et[Ke]),ve[Ye]=!(We[Ye]=et[Ke])}):function(ve){return ue(ve,0,re)}):ue}},pseudos:{not:ho(function(W){var J=[],re=[],ue=p1(W.replace(B,"$1"));return ue[be]?ho(function(ve,We,Ye,et){for(var Ke,Mt=ue(ve,null,et,[]),dt=ve.length;dt--;)(Ke=Mt[dt])&&(ve[dt]=!(We[dt]=Ke))}):function(ve,We,Ye){return J[0]=ve,ue(J,null,Ye,re),J[0]=null,!re.pop()}}),has:ho(function(W){return function(J){return Vt(W,J).length>0}}),contains:ho(function(W){return W=W.replace(Oa,za),function(J){return(J.textContent||b.text(J)).indexOf(W)>-1}}),lang:ho(function(W){return Mp.test(W||"")||Vt.error("unsupported lang: "+W),W=W.replace(Oa,za).toLowerCase(),function(J){var re;do if(re=U?J.lang:J.getAttribute("xml:lang")||J.getAttribute("lang"))return re=re.toLowerCase(),re===W||re.indexOf(W+"-")===0;while((J=J.parentNode)&&J.nodeType===1);return!1}}),target:function(W){var J=t.location&&t.location.hash;return J&&J.slice(1)===W.id},root:function(W){return W===G},focus:function(W){return W===j8()&&z.hasFocus()&&!!(W.type||W.href||~W.tabIndex)},enabled:Qk(!1),disabled:Qk(!0),checked:function(W){return C(W,"input")&&!!W.checked||C(W,"option")&&!!W.selected},selected:function(W){return W.parentNode&&W.parentNode.selectedIndex,W.selected===!0},empty:function(W){for(W=W.firstChild;W;W=W.nextSibling)if(W.nodeType<6)return!1;return!0},parent:function(W){return!v.pseudos.empty(W)},header:function(W){return Yl.test(W.nodeName)},input:function(W){return Gl.test(W.nodeName)},button:function(W){return C(W,"input")&&W.type==="button"||C(W,"button")},text:function(W){var J;return C(W,"input")&&W.type==="text"&&((J=W.getAttribute("type"))==null||J.toLowerCase()==="text")},first:ou(function(){return[0]}),last:ou(function(W,J){return[J-1]}),eq:ou(function(W,J,re){return[re<0?re+J:re]}),even:ou(function(W,J){for(var re=0;reJ?ue=J:ue=re;--ue>=0;)W.push(ue);return W}),gt:ou(function(W,J,re){for(var ue=re<0?re+J:re;++ue1?function(J,re,ue){for(var ve=W.length;ve--;)if(!W[ve](J,re,ue))return!1;return!0}:W[0]}function $8(W,J,re){for(var 
ue=0,ve=J.length;ue-1&&(Ye[dt]=!(et[dt]=It))}}else ot=wv(ot===et?ot.splice(Zn,ot.length):ot),ve?ve(null,et,ot,Mt):D.apply(et,ot)})}function m1(W){for(var J,re,ue,ve=W.length,We=v.relative[W[0].type],Ye=We||v.relative[" "],et=We?1:0,Ke=yv(function(gt){return gt===J},Ye,!0),Mt=yv(function(gt){return a.call(J,gt)>-1},Ye,!0),dt=[function(gt,It,ot){var ci=!We&&(ot||It!=I)||((J=It).nodeType?Ke(gt,It,ot):Mt(gt,It,ot));return J=null,ci}];et1&&h1(dt),et>1&&xv(W.slice(0,et-1).concat({value:W[et-2].type===" "?"*":""})).replace(B,"$1"),re,et0,ue=W.length>0,ve=function(We,Ye,et,Ke,Mt){var dt,gt,It,ot=0,ci="0",Lr=We&&[],Zn=[],Is=I,Uo=We||ue&&v.find.TAG("*",Mt),ah=le+=Is==null?1:Math.random()||.1,dr=Uo.length;for(Mt&&(I=Ye==z||Ye||Mt);ci!==dr&&(dt=Uo[ci])!=null;ci++){if(ue&&dt){for(gt=0,!Ye&&dt.ownerDocument!=z&&(Kl(dt),et=!U);It=W[gt++];)if(It(dt,Ye||z,et)){D.call(Ke,dt);break}Mt&&(le=ah)}re&&((dt=!It&&dt)&&ot--,We&&Lr.push(dt))}if(ot+=ci,re&&ci!==ot){for(gt=0;It=J[gt++];)It(Lr,Zn,Ye,et);if(We){if(ot>0)for(;ci--;)Lr[ci]||Zn[ci]||(Zn[ci]=x.call(Ke));Zn=wv(Zn)}D.apply(Ke,Zn),Mt&&!We&&Zn.length>0&&ot+J.length>1&&b.uniqueSort(Ke)}return Mt&&(le=ah,I=Is),Lr};return re?ho(ve):ve}function p1(W,J){var re,ue=[],ve=[],We=St[W+" "];if(!We){for(J||(J=Ip(W)),re=J.length;re--;)We=m1(J[re]),We[be]?ue.push(We):ve.push(We);We=St(W,q8(ve,ue)),We.selector=W}return We}function tA(W,J,re,ue){var ve,We,Ye,et,Ke,Mt=typeof W=="function"&&W,dt=!ue&&Ip(W=Mt.selector||W);if(re=re||[],dt.length===1){if(We=dt[0]=dt[0].slice(0),We.length>2&&(Ye=We[0]).type==="ID"&&J.nodeType===9&&U&&v.relative[We[1].type]){if(J=(v.find.ID(Ye.matches[0].replace(Oa,za),J)||[])[0],J)Mt&&(J=J.parentNode);else return re;W=W.slice(We.shift().value.length)}for(ve=qo.needsContext.test(W)?0:We.length;ve--&&(Ye=We[ve],!v.relative[et=Ye.type]);)if((Ke=v.find[et])&&(ue=Ke(Ye.matches[0].replace(Oa,za),u1.test(We[0].type)&&d1(J.parentNode)||J))){if(We.splice(ve,1),W=ue.length&&xv(We),!W)return D.apply(re,ue),re;break}}return(Mt||p1(W,dt))(ue,J,!U,re,!J||u1.test(W)&&d1(J.parentNode)||J),re}h.sortStable=be.split("").sort(pn).join("")===be,Kl(),h.sortDetached=oh(function(W){return W.compareDocumentPosition(z.createElement("fieldset"))&1}),b.find=Vt,b.expr[":"]=b.expr.pseudos,b.unique=b.uniqueSort,Vt.compile=p1,Vt.select=tA,Vt.setDocument=Kl,Vt.tokenize=Ip,Vt.escape=b.escapeSelector,Vt.getText=b.text,Vt.isXML=b.isXMLDoc,Vt.selectors=b.expr,Vt.support=b.support,Vt.uniqueSort=b.uniqueSort})();var ie=function(g,v,I){for(var R=[],L=I!==void 0;(g=g[v])&&g.nodeType!==9;)if(g.nodeType===1){if(L&&b(g).is(I))break;R.push(g)}return R},_e=function(g,v){for(var I=[];g;g=g.nextSibling)g.nodeType===1&&g!==v&&I.push(g);return I},Ne=b.expr.match.needsContext,ye=/^<([a-z][^\/\0>:\x20\t\r\n\f]*)[\x20\t\r\n\f]*\/?>(?:<\/\1>|)$/i;function Ie(g,v,I){return m(v)?b.grep(g,function(R,L){return!!v.call(R,L,R)!==I}):v.nodeType?b.grep(g,function(R){return R===v!==I}):typeof v!="string"?b.grep(g,function(R){return a.call(v,R)>-1!==I}):b.filter(v,g,I)}b.filter=function(g,v,I){var R=v[0];return I&&(g=":not("+g+")"),v.length===1&&R.nodeType===1?b.find.matchesSelector(R,g)?[R]:[]:b.find.matches(g,b.grep(v,function(L){return L.nodeType===1}))},b.fn.extend({find:function(g){var v,I,R=this.length,L=this;if(typeof g!="string")return this.pushStack(b(g).filter(function(){for(v=0;v1?b.uniqueSort(I):I},filter:function(g){return this.pushStack(Ie(this,g||[],!1))},not:function(g){return this.pushStack(Ie(this,g||[],!0))},is:function(g){return!!Ie(this,typeof 
g=="string"&&Ne.test(g)?b(g):g||[],!1).length}});var at,Ve=/^(?:\s*(<[\w\W]+>)[^>]*|#([\w-]+))$/,Ze=b.fn.init=function(g,v,I){var R,L;if(!g)return this;if(I=I||at,typeof g=="string")if(g[0]==="<"&&g[g.length-1]===">"&&g.length>=3?R=[null,g,null]:R=Ve.exec(g),R&&(R[1]||!v))if(R[1]){if(v=v instanceof b?v[0]:v,b.merge(this,b.parseHTML(R[1],v&&v.nodeType?v.ownerDocument||v:_,!0)),ye.test(R[1])&&b.isPlainObject(v))for(R in v)m(this[R])?this[R](v[R]):this.attr(R,v[R]);return this}else return L=_.getElementById(R[2]),L&&(this[0]=L,this.length=1),this;else return!v||v.jquery?(v||I).find(g):this.constructor(v).find(g);else{if(g.nodeType)return this[0]=g,this.length=1,this;if(m(g))return I.ready!==void 0?I.ready(g):g(b)}return b.makeArray(g,this)};Ze.prototype=b.fn,at=b(_);var ct=/^(?:parents|prev(?:Until|All))/,yt={children:!0,contents:!0,next:!0,prev:!0};b.fn.extend({has:function(g){var v=b(g,this),I=v.length;return this.filter(function(){for(var R=0;R-1:I.nodeType===1&&b.find.matchesSelector(I,g))){D.push(I);break}}return this.pushStack(D.length>1?b.uniqueSort(D):D)},index:function(g){return g?typeof g=="string"?a.call(b(g),this[0]):a.call(this,g.jquery?g[0]:g):this[0]&&this[0].parentNode?this.first().prevAll().length:-1},add:function(g,v){return this.pushStack(b.uniqueSort(b.merge(this.get(),b(g,v))))},addBack:function(g){return this.add(g==null?this.prevObject:this.prevObject.filter(g))}});function Et(g,v){for(;(g=g[v])&&g.nodeType!==1;);return g}b.each({parent:function(g){var v=g.parentNode;return v&&v.nodeType!==11?v:null},parents:function(g){return ie(g,"parentNode")},parentsUntil:function(g,v,I){return ie(g,"parentNode",I)},next:function(g){return Et(g,"nextSibling")},prev:function(g){return Et(g,"previousSibling")},nextAll:function(g){return ie(g,"nextSibling")},prevAll:function(g){return ie(g,"previousSibling")},nextUntil:function(g,v,I){return ie(g,"nextSibling",I)},prevUntil:function(g,v,I){return ie(g,"previousSibling",I)},siblings:function(g){return _e((g.parentNode||{}).firstChild,g)},children:function(g){return _e(g.firstChild)},contents:function(g){return g.contentDocument!=null&&n(g.contentDocument)?g.contentDocument:(C(g,"template")&&(g=g.content||g),b.merge([],g.childNodes))}},function(g,v){b.fn[g]=function(I,R){var L=b.map(this,v,I);return g.slice(-5)!=="Until"&&(R=I),R&&typeof R=="string"&&(L=b.filter(R,L)),this.length>1&&(yt[g]||b.uniqueSort(L),ct.test(g)&&L.reverse()),this.pushStack(L)}});var li=/[^\x20\t\r\n\f]+/g;function bi(g){var v={};return b.each(g.match(li)||[],function(I,R){v[R]=!0}),v}b.Callbacks=function(g){g=typeof g=="string"?bi(g):b.extend({},g);var v,I,R,L,D=[],z=[],G=-1,U=function(){for(L=L||g.once,R=v=!0;z.length;G=-1)for(I=z.shift();++G-1;)D.splice(le,1),le<=G&&G--}),this},has:function(de){return de?b.inArray(de,D)>-1:D.length>0},empty:function(){return D&&(D=[]),this},disable:function(){return L=z=[],D=I="",this},disabled:function(){return!D},lock:function(){return L=z=[],!I&&!v&&(D=I=""),this},locked:function(){return!!L},fireWith:function(de,be){return L||(be=be||[],be=[de,be.slice?be.slice():be],z.push(be),v||U()),this},fire:function(){return te.fireWith(this,arguments),this},fired:function(){return!!R}};return te};function Ii(g){return g}function we(g){throw g}function k(g,v,I,R){var L;try{g&&m(L=g.promise)?L.call(g).done(v).fail(I):g&&m(L=g.then)?L.call(g,v,I):v.apply(void 0,[g].slice(R))}catch(D){I.apply(void 0,[D])}}b.extend({Deferred:function(g){var 
v=[["notify","progress",b.Callbacks("memory"),b.Callbacks("memory"),2],["resolve","done",b.Callbacks("once memory"),b.Callbacks("once memory"),0,"resolved"],["reject","fail",b.Callbacks("once memory"),b.Callbacks("once memory"),1,"rejected"]],I="pending",R={state:function(){return I},always:function(){return L.done(arguments).fail(arguments),this},catch:function(D){return R.then(null,D)},pipe:function(){var D=arguments;return b.Deferred(function(z){b.each(v,function(G,U){var te=m(D[U[4]])&&D[U[4]];L[U[1]](function(){var de=te&&te.apply(this,arguments);de&&m(de.promise)?de.promise().progress(z.notify).done(z.resolve).fail(z.reject):z[U[0]+"With"](this,te?[de]:arguments)})}),D=null}).promise()},then:function(D,z,G){var U=0;function te(de,be,le,ke){return function(){var ft=this,Nt=arguments,St=function(){var pn,Fo;if(!(de=U&&(le!==we&&(ft=void 0,Nt=[pn]),be.rejectWith(ft,Nt))}};de?Hn():(b.Deferred.getErrorHook?Hn.error=b.Deferred.getErrorHook():b.Deferred.getStackHook&&(Hn.error=b.Deferred.getStackHook()),t.setTimeout(Hn))}}return b.Deferred(function(de){v[0][3].add(te(0,de,m(G)?G:Ii,de.notifyWith)),v[1][3].add(te(0,de,m(D)?D:Ii)),v[2][3].add(te(0,de,m(z)?z:we))}).promise()},promise:function(D){return D!=null?b.extend(D,R):R}},L={};return b.each(v,function(D,z){var G=z[2],U=z[5];R[z[1]]=G.add,U&&G.add(function(){I=U},v[3-D][2].disable,v[3-D][3].disable,v[0][2].lock,v[0][3].lock),G.add(z[3].fire),L[z[0]]=function(){return L[z[0]+"With"](this===L?void 0:this,arguments),this},L[z[0]+"With"]=G.fireWith}),R.promise(L),g&&g.call(L,L),L},when:function(g){var v=arguments.length,I=v,R=Array(I),L=r.call(arguments),D=b.Deferred(),z=function(G){return function(U){R[G]=this,L[G]=arguments.length>1?r.call(arguments):U,--v||D.resolveWith(R,L)}};if(v<=1&&(k(g,D.done(z(I)).resolve,D.reject,!v),D.state()==="pending"||m(L[I]&&L[I].then)))return D.then();for(;I--;)k(L[I],z(I),D.reject);return D.promise()}});var j=/^(Eval|Internal|Range|Reference|Syntax|Type|URI)Error$/;b.Deferred.exceptionHook=function(g,v){t.console&&t.console.warn&&g&&j.test(g.name)&&t.console.warn("jQuery.Deferred exception: "+g.message,g.stack,v)},b.readyException=function(g){t.setTimeout(function(){throw g})};var F=b.Deferred();b.fn.ready=function(g){return F.then(g).catch(function(v){b.readyException(v)}),this},b.extend({isReady:!1,readyWait:1,ready:function(g){(g===!0?--b.readyWait:b.isReady)||(b.isReady=!0,!(g!==!0&&--b.readyWait>0)&&F.resolveWith(_,[b]))}}),b.ready.then=F.then;function Q(){_.removeEventListener("DOMContentLoaded",Q),t.removeEventListener("load",Q),b.ready()}_.readyState==="complete"||_.readyState!=="loading"&&!_.documentElement.doScroll?t.setTimeout(b.ready):(_.addEventListener("DOMContentLoaded",Q),t.addEventListener("load",Q));var ae=function(g,v,I,R,L,D,z){var G=0,U=g.length,te=I==null;if(T(I)==="object"){L=!0;for(G in I)ae(g,v,G,I[G],!0,D,z)}else if(R!==void 0&&(L=!0,m(R)||(z=!0),te&&(z?(v.call(g,R),v=null):(te=v,v=function(de,be,le){return te.call(b(de),le)})),v))for(;G1,null,!0)},removeData:function(g){return this.each(function(){q.remove(this,g)})}}),b.extend({queue:function(g,v,I){var R;if(g)return v=(v||"fx")+"queue",R=Se.get(g,v),I&&(!R||Array.isArray(I)?R=Se.access(g,v,b.makeArray(I)):R.push(I)),R||[]},dequeue:function(g,v){v=v||"fx";var I=b.queue(g,v),R=I.length,L=I.shift(),D=b._queueHooks(g,v),z=function(){b.dequeue(g,v)};L==="inprogress"&&(L=I.shift(),R--),L&&(v==="fx"&&I.unshift("inprogress"),delete D.stop,L.call(g,z,D)),!R&&D&&D.empty.fire()},_queueHooks:function(g,v){var I=v+"queueHooks";return 
Se.get(g,I)||Se.access(g,I,{empty:b.Callbacks("once memory").add(function(){Se.remove(g,[v+"queue",I])})})}}),b.fn.extend({queue:function(g,v){var I=2;return typeof g!="string"&&(v=g,g="fx",I--),arguments.length\x20\t\r\n\f]*)/i,La=/^$|^module$|\/(?:java|ecma)script/i;(function(){var g=_.createDocumentFragment(),v=g.appendChild(_.createElement("div")),I=_.createElement("input");I.setAttribute("type","radio"),I.setAttribute("checked","checked"),I.setAttribute("name","t"),v.appendChild(I),h.checkClone=v.cloneNode(!0).cloneNode(!0).lastChild.checked,v.innerHTML="",h.noCloneChecked=!!v.cloneNode(!0).lastChild.defaultValue,v.innerHTML="",h.option=!!v.lastChild})();var Bn={thead:[1,"","
"],col:[2,"","
"],tr:[2,"","
"],td:[3,"","
"],_default:[0,"",""]};Bn.tbody=Bn.tfoot=Bn.colgroup=Bn.caption=Bn.thead,Bn.th=Bn.td,h.option||(Bn.optgroup=Bn.option=[1,""]);function fn(g,v){var I;return typeof g.getElementsByTagName<"u"?I=g.getElementsByTagName(v||"*"):typeof g.querySelectorAll<"u"?I=g.querySelectorAll(v||"*"):I=[],v===void 0||v&&C(g,v)?b.merge([g],I):I}function dp(g,v){for(var I=0,R=g.length;I-1){L&&L.push(D);continue}if(te=Pi(D),z=fn(be.appendChild(D),"script"),te&&dp(z),I)for(de=0;D=z[de++];)La.test(D.type||"")&&I.push(D)}return be}var hp=/^([^.]*)(?:\.(.+)|)/;function $l(){return!0}function ql(){return!1}function fp(g,v,I,R,L,D){var z,G;if(typeof v=="object"){typeof I!="string"&&(R=R||I,I=void 0);for(G in v)fp(g,G,I,R,v[G],D);return g}if(R==null&&L==null?(L=I,R=I=void 0):L==null&&(typeof I=="string"?(L=R,R=void 0):(L=R,R=I,I=void 0)),L===!1)L=ql;else if(!L)return g;return D===1&&(z=L,L=function(U){return b().off(U),z.apply(this,arguments)},L.guid=z.guid||(z.guid=b.guid++)),g.each(function(){b.event.add(this,v,L,R,I)})}b.event={global:{},add:function(g,v,I,R,L){var D,z,G,U,te,de,be,le,ke,ft,Nt,St=Se.get(g);if(Xt(g))for(I.handler&&(D=I,I=D.handler,L=D.selector),L&&b.find.matchesSelector(ht,L),I.guid||(I.guid=b.guid++),(U=St.events)||(U=St.events=Object.create(null)),(z=St.handle)||(z=St.handle=function(Hn){return typeof b<"u"&&b.event.triggered!==Hn.type?b.event.dispatch.apply(g,arguments):void 0}),v=(v||"").match(li)||[""],te=v.length;te--;)G=hp.exec(v[te])||[],ke=Nt=G[1],ft=(G[2]||"").split(".").sort(),ke&&(be=b.event.special[ke]||{},ke=(L?be.delegateType:be.bindType)||ke,be=b.event.special[ke]||{},de=b.extend({type:ke,origType:Nt,data:R,handler:I,guid:I.guid,selector:L,needsContext:L&&b.expr.match.needsContext.test(L),namespace:ft.join(".")},D),(le=U[ke])||(le=U[ke]=[],le.delegateCount=0,(!be.setup||be.setup.call(g,R,ft,z)===!1)&&g.addEventListener&&g.addEventListener(ke,z)),be.add&&(be.add.call(g,de),de.handler.guid||(de.handler.guid=I.guid)),L?le.splice(le.delegateCount++,0,de):le.push(de),b.event.global[ke]=!0)},remove:function(g,v,I,R,L){var D,z,G,U,te,de,be,le,ke,ft,Nt,St=Se.hasData(g)&&Se.get(g);if(!(!St||!(U=St.events))){for(v=(v||"").match(li)||[""],te=v.length;te--;){if(G=hp.exec(v[te])||[],ke=Nt=G[1],ft=(G[2]||"").split(".").sort(),!ke){for(ke in U)b.event.remove(g,ke+v[te],I,R,!0);continue}for(be=b.event.special[ke]||{},ke=(R?be.delegateType:be.bindType)||ke,le=U[ke]||[],G=G[2]&&new RegExp("(^|\\.)"+ft.join("\\.(?:.*\\.|)")+"(\\.|$)"),z=D=le.length;D--;)de=le[D],(L||Nt===de.origType)&&(!I||I.guid===de.guid)&&(!G||G.test(de.namespace))&&(!R||R===de.selector||R==="**"&&de.selector)&&(le.splice(D,1),de.selector&&le.delegateCount--,be.remove&&be.remove.call(g,de));z&&!le.length&&((!be.teardown||be.teardown.call(g,ft,St.handle)===!1)&&b.removeEvent(g,ke,St.handle),delete U[ke])}b.isEmptyObject(U)&&Se.remove(g,"handle events")}},dispatch:function(g){var v,I,R,L,D,z,G=new Array(arguments.length),U=b.event.fix(g),te=(Se.get(this,"events")||Object.create(null))[U.type]||[],de=b.event.special[U.type]||{};for(G[0]=U,v=1;v=1)){for(;te!==this;te=te.parentNode||this)if(te.nodeType===1&&!(g.type==="click"&&te.disabled===!0)){for(D=[],z={},I=0;I-1:b.find(L,this,null,[te]).length),z[L]&&D.push(R);D.length&&G.push({elem:te,handlers:D})}}return te=this,U\s*$/g;function hv(g,v){return C(g,"table")&&C(v.nodeType!==11?v:v.firstChild,"tr")&&b(g).children("tbody")[0]||g}function Jw(g){return g.type=(g.getAttribute("type")!==null)+"/"+g.type,g}function 
fv(g){return(g.type||"").slice(0,5)==="true/"?g.type=g.type.slice(5):g.removeAttribute("type"),g}function mv(g,v){var I,R,L,D,z,G,U;if(v.nodeType===1){if(Se.hasData(g)&&(D=Se.get(g),U=D.events,U)){Se.remove(v,"handle events");for(L in U)for(I=0,R=U[L].length;I1&&typeof ke=="string"&&!h.checkClone&&pp.test(ke))return g.each(function(Nt){var St=g.eq(Nt);ft&&(v[0]=ke.call(this,Nt,St.html())),Ho(St,v,I,R)});if(be&&(L=uv(v,g[0].ownerDocument,!1,g,R),D=L.firstChild,L.childNodes.length===1&&(L=D),D||R)){for(z=b.map(fn(L,"script"),Jw),G=z.length;de0&&dp(z,!U&&fn(g,"script")),G},cleanData:function(g){for(var v,I,R,L=b.event.special,D=0;(I=g[D])!==void 0;D++)if(Xt(I)){if(v=I[Se.expando]){if(v.events)for(R in v.events)L[R]?b.event.remove(I,R):b.removeEvent(I,R,v.handle);I[Se.expando]=void 0}I[q.expando]&&(I[q.expando]=void 0)}}}),b.fn.extend({detach:function(g){return Jd(this,g,!0)},remove:function(g){return Jd(this,g)},text:function(g){return ae(this,function(v){return v===void 0?b.text(this):this.empty().each(function(){(this.nodeType===1||this.nodeType===11||this.nodeType===9)&&(this.textContent=v)})},null,g,arguments.length)},append:function(){return Ho(this,arguments,function(g){if(this.nodeType===1||this.nodeType===11||this.nodeType===9){var v=hv(this,g);v.appendChild(g)}})},prepend:function(){return Ho(this,arguments,function(g){if(this.nodeType===1||this.nodeType===11||this.nodeType===9){var v=hv(this,g);v.insertBefore(g,v.firstChild)}})},before:function(){return Ho(this,arguments,function(g){this.parentNode&&this.parentNode.insertBefore(g,this)})},after:function(){return Ho(this,arguments,function(g){this.parentNode&&this.parentNode.insertBefore(g,this.nextSibling)})},empty:function(){for(var g,v=0;(g=this[v])!=null;v++)g.nodeType===1&&(b.cleanData(fn(g,!1)),g.textContent="");return this},clone:function(g,v){return g=g??!1,v=v??g,this.map(function(){return b.clone(this,g,v)})},html:function(g){return ae(this,function(v){var I=this[0]||{},R=0,L=this.length;if(v===void 0&&I.nodeType===1)return I.innerHTML;if(typeof v=="string"&&!mp.test(v)&&!Bn[(Cs.exec(v)||["",""])[1].toLowerCase()]){v=b.htmlPrefilter(v);try{for(;R=0&&(U+=Math.max(0,Math.ceil(g["offset"+v[0].toUpperCase()+v.slice(1)]-D-U-G-.5))||0),U+te}function tu(g,v,I){var R=Qd(g),L=!h.boxSizingReliable()||I,D=L&&b.css(g,"boxSizing",!1,R)==="border-box",z=D,G=eu(g,v,R),U="offset"+v[0].toUpperCase()+v.slice(1);if(Zd.test(G)){if(!I)return G;G="auto"}return(!h.boxSizingReliable()&&D||!h.reliableTrDimensions()&&C(g,"tr")||G==="auto"||!parseFloat(G)&&b.css(g,"display",!1,R)==="inline")&&g.getClientRects().length&&(D=b.css(g,"boxSizing",!1,R)==="border-box",z=U in g,z&&(G=g[U])),G=parseFloat(G)||0,G+Ri(g,v,I||(D?"border":"content"),z,R,G)+"px"}b.extend({cssHooks:{opacity:{get:function(g,v){if(v){var I=eu(g,"opacity");return I===""?"1":I}}}},cssNumber:{animationIterationCount:!0,aspectRatio:!0,borderImageSlice:!0,columnCount:!0,flexGrow:!0,flexShrink:!0,fontWeight:!0,gridArea:!0,gridColumn:!0,gridColumnEnd:!0,gridColumnStart:!0,gridRow:!0,gridRowEnd:!0,gridRowStart:!0,lineHeight:!0,opacity:!0,order:!0,orphans:!0,scale:!0,widows:!0,zIndex:!0,zoom:!0,fillOpacity:!0,floodOpacity:!0,stopOpacity:!0,strokeMiterlimit:!0,strokeOpacity:!0},cssProps:{},style:function(g,v,I,R){if(!(!g||g.nodeType===3||g.nodeType===8||!g.style)){var L,D,z,G=wt(v),U=uo.test(v),te=g.style;if(U||(v=_p(G)),z=b.cssHooks[v]||b.cssHooks[G],I!==void 0){if(D=typeof 
I,D==="string"&&(L=ut.exec(I))&&L[1]&&(I=At(g,v,L),D="number"),I==null||I!==I)return;D==="number"&&!U&&(I+=L&&L[3]||(b.cssNumber[G]?"":"px")),!h.clearCloneStyle&&I===""&&v.indexOf("background")===0&&(te[v]="inherit"),(!z||!("set"in z)||(I=z.set(g,I,R))!==void 0)&&(U?te.setProperty(v,I):te[v]=I)}else return z&&"get"in z&&(L=z.get(g,!1,R))!==void 0?L:te[v]}},css:function(g,v,I,R){var L,D,z,G=wt(v),U=uo.test(v);return U||(v=_p(G)),z=b.cssHooks[v]||b.cssHooks[G],z&&"get"in z&&(L=z.get(g,!0,I)),L===void 0&&(L=eu(g,v,R)),L==="normal"&&v in _v&&(L=_v[v]),I===""||I?(D=parseFloat(L),I===!0||isFinite(D)?D||0:L):L}}),b.each(["height","width"],function(g,v){b.cssHooks[v]={get:function(I,R,L){if(R)return t1.test(b.css(I,"display"))&&(!I.getClientRects().length||!I.getBoundingClientRect().width)?gp(I,vp,function(){return tu(I,v,L)}):tu(I,v,L)},set:function(I,R,L){var D,z=Qd(I),G=!h.scrollboxSize()&&z.position==="absolute",U=G||L,te=U&&b.css(I,"boxSizing",!1,z)==="border-box",de=L?Ri(I,v,L,te,z):0;return te&&G&&(de-=Math.ceil(I["offset"+v[0].toUpperCase()+v.slice(1)]-parseFloat(z[v])-Ri(I,v,"border",!1,z)-.5)),de&&(D=ut.exec(R))&&(D[3]||"px")!=="px"&&(I.style[v]=R,R=b.css(I,v)),Ul(I,R,de)}}}),b.cssHooks.marginLeft=pv(h.reliableMarginLeft,function(g,v){if(v)return(parseFloat(eu(g,"marginLeft"))||g.getBoundingClientRect().left-gp(g,{marginLeft:0},function(){return g.getBoundingClientRect().left}))+"px"}),b.each({margin:"",padding:"",border:"Width"},function(g,v){b.cssHooks[g+v]={expand:function(I){for(var R=0,L={},D=typeof I=="string"?I.split(" "):[I];R<4;R++)L[g+Rt[R]+v]=D[R]||D[R-2]||D[0];return L}},g!=="margin"&&(b.cssHooks[g+v].set=Ul)}),b.fn.extend({css:function(g,v){return ae(this,function(I,R,L){var D,z,G={},U=0;if(Array.isArray(R)){for(D=Qd(I),z=R.length;U1)}});function Mn(g,v,I,R,L){return new Mn.prototype.init(g,v,I,R,L)}b.Tween=Mn,Mn.prototype={constructor:Mn,init:function(g,v,I,R,L,D){this.elem=g,this.prop=I,this.easing=L||b.easing._default,this.options=v,this.start=this.now=this.cur(),this.end=R,this.unit=D||(b.cssNumber[I]?"":"px")},cur:function(){var g=Mn.propHooks[this.prop];return g&&g.get?g.get(this):Mn.propHooks._default.get(this)},run:function(g){var v,I=Mn.propHooks[this.prop];return this.options.duration?this.pos=v=b.easing[this.easing](g,this.options.duration*g,0,1,this.options.duration):this.pos=v=g,this.now=(this.end-this.start)*v+this.start,this.options.step&&this.options.step.call(this.elem,this.now,this),I&&I.set?I.set(this):Mn.propHooks._default.set(this),this}},Mn.prototype.init.prototype=Mn.prototype,Mn.propHooks={_default:{get:function(g){var v;return g.elem.nodeType!==1||g.elem[g.prop]!=null&&g.elem.style[g.prop]==null?g.elem[g.prop]:(v=b.css(g.elem,g.prop,""),!v||v==="auto"?0:v)},set:function(g){b.fx.step[g.prop]?b.fx.step[g.prop](g):g.elem.nodeType===1&&(b.cssHooks[g.prop]||g.elem.style[_p(g.prop)]!=null)?b.style(g.elem,g.prop,g.now+g.unit):g.elem[g.prop]=g.now}}},Mn.propHooks.scrollTop=Mn.propHooks.scrollLeft={set:function(g){g.elem.nodeType&&g.elem.parentNode&&(g.elem[g.prop]=g.now)}},b.easing={linear:function(g){return g},swing:function(g){return .5-Math.cos(g*Math.PI)/2},_default:"swing"},b.fx=Mn.prototype.init,b.fx.step={};var Na,iu,i1=/^(?:toggle|show|hide)$/,bp=/queueHooks$/;function jo(){iu&&(_.hidden===!1&&t.requestAnimationFrame?t.requestAnimationFrame(jo):t.setTimeout(jo,b.fx.interval),b.fx.tick())}function xp(){return t.setTimeout(function(){Na=void 0}),Na=Date.now()}function nu(g,v){var 
I,R=0,L={height:g};for(v=v?1:0;R<4;R+=2-v)I=Rt[R],L["margin"+I]=L["padding"+I]=g;return v&&(L.opacity=L.width=g),L}function ru(g,v,I){for(var R,L=(Ar.tweeners[v]||[]).concat(Ar.tweeners["*"]),D=0,z=L.length;D1)},removeAttr:function(g){return this.each(function(){b.removeAttr(this,g)})}}),b.extend({attr:function(g,v,I){var R,L,D=g.nodeType;if(!(D===3||D===8||D===2)){if(typeof g.getAttribute>"u")return b.prop(g,v,I);if((D!==1||!b.isXMLDoc(g))&&(L=b.attrHooks[v.toLowerCase()]||(b.expr.match.bool.test(v)?vv:void 0)),I!==void 0){if(I===null){b.removeAttr(g,v);return}return L&&"set"in L&&(R=L.set(g,I,v))!==void 0?R:(g.setAttribute(v,I+""),I)}return L&&"get"in L&&(R=L.get(g,v))!==null?R:(R=b.find.attr(g,v),R??void 0)}},attrHooks:{type:{set:function(g,v){if(!h.radioValue&&v==="radio"&&C(g,"input")){var I=g.value;return g.setAttribute("type",v),I&&(g.value=I),v}}}},removeAttr:function(g,v){var I,R=0,L=v&&v.match(li);if(L&&g.nodeType===1)for(;I=L[R++];)g.removeAttribute(I)}}),vv={set:function(g,v,I){return v===!1?b.removeAttr(g,I):g.setAttribute(I,I),I}},b.each(b.expr.match.bool.source.match(/\w+/g),function(g,v){var I=Vl[v]||b.find.attr;Vl[v]=function(R,L,D){var z,G,U=L.toLowerCase();return D||(G=Vl[U],Vl[U]=z,z=I(R,L,D)!=null?U:null,Vl[U]=G),z}});var s1=/^(?:input|select|textarea|button)$/i,o1=/^(?:a|area)$/i;b.fn.extend({prop:function(g,v){return ae(this,b.prop,g,v,arguments.length>1)},removeProp:function(g){return this.each(function(){delete this[b.propFix[g]||g]})}}),b.extend({prop:function(g,v,I){var R,L,D=g.nodeType;if(!(D===3||D===8||D===2))return(D!==1||!b.isXMLDoc(g))&&(v=b.propFix[v]||v,L=b.propHooks[v]),I!==void 0?L&&"set"in L&&(R=L.set(g,I,v))!==void 0?R:g[v]=I:L&&"get"in L&&(R=L.get(g,v))!==null?R:g[v]},propHooks:{tabIndex:{get:function(g){var v=b.find.attr(g,"tabindex");return v?parseInt(v,10):s1.test(g.nodeName)||o1.test(g.nodeName)&&g.href?0:-1}}},propFix:{for:"htmlFor",class:"className"}}),h.optSelected||(b.propHooks.selected={get:function(g){var v=g.parentNode;return v&&v.parentNode&&v.parentNode.selectedIndex,null},set:function(g){var v=g.parentNode;v&&(v.selectedIndex,v.parentNode&&v.parentNode.selectedIndex)}}),b.each(["tabIndex","readOnly","maxLength","cellSpacing","cellPadding","rowSpan","colSpan","useMap","frameBorder","contentEditable"],function(){b.propFix[this.toLowerCase()]=this});function Da(g){var v=g.match(li)||[];return v.join(" ")}function Es(g){return g.getAttribute&&g.getAttribute("class")||""}function Y(g){return Array.isArray(g)?g:typeof g=="string"?g.match(li)||[]:[]}b.fn.extend({addClass:function(g){var v,I,R,L,D,z;return m(g)?this.each(function(G){b(this).addClass(g.call(this,G,Es(this)))}):(v=Y(g),v.length?this.each(function(){if(R=Es(this),I=this.nodeType===1&&" "+Da(R)+" ",I){for(D=0;D-1;)I=I.replace(" "+L+" "," ");z=Da(I),R!==z&&this.setAttribute("class",z)}}):this):this.attr("class","")},toggleClass:function(g,v){var I,R,L,D,z=typeof g,G=z==="string"||Array.isArray(g);return m(g)?this.each(function(U){b(this).toggleClass(g.call(this,U,Es(this),v),v)}):typeof v=="boolean"&&G?v?this.addClass(g):this.removeClass(g):(I=Y(g),this.each(function(){if(G)for(D=b(this),L=0;L-1)return!0;return!1}});var ne=/\r/g;b.fn.extend({val:function(g){var v,I,R,L=this[0];return arguments.length?(R=m(g),this.each(function(D){var z;this.nodeType===1&&(R?z=g.call(this,D,b(this).val()):z=g,z==null?z="":typeof z=="number"?z+="":Array.isArray(z)&&(z=b.map(z,function(G){return G==null?"":G+""})),v=b.valHooks[this.type]||b.valHooks[this.nodeName.toLowerCase()],(!v||!("set"in 
v)||v.set(this,z,"value")===void 0)&&(this.value=z))})):L?(v=b.valHooks[L.type]||b.valHooks[L.nodeName.toLowerCase()],v&&"get"in v&&(I=v.get(L,"value"))!==void 0?I:(I=L.value,typeof I=="string"?I.replace(ne,""):I??"")):void 0}}),b.extend({valHooks:{option:{get:function(g){var v=b.find.attr(g,"value");return v??Da(b.text(g))}},select:{get:function(g){var v,I,R,L=g.options,D=g.selectedIndex,z=g.type==="select-one",G=z?null:[],U=z?D+1:L.length;for(D<0?R=U:R=z?D:0;R-1)&&(I=!0);return I||(g.selectedIndex=-1),D}}}}),b.each(["radio","checkbox"],function(){b.valHooks[this]={set:function(g,v){if(Array.isArray(v))return g.checked=b.inArray(b(g).val(),v)>-1}},h.checkOn||(b.valHooks[this].get=function(g){return g.getAttribute("value")===null?"on":g.value})});var se=t.location,Ee={guid:Date.now()},$e=/\?/;b.parseXML=function(g){var v,I;if(!g||typeof g!="string")return null;try{v=new t.DOMParser().parseFromString(g,"text/xml")}catch{}return I=v&&v.getElementsByTagName("parsererror")[0],(!v||I)&&b.error("Invalid XML: "+(I?b.map(I.childNodes,function(R){return R.textContent}).join(` -`):g)),v};var qe=/^(?:focusinfocus|focusoutblur)$/,Qe=function(g){g.stopPropagation()};b.extend(b.event,{trigger:function(g,v,I,R){var L,D,z,G,U,te,de,be,le=[I||_],ke=u.call(g,"type")?g.type:g,ft=u.call(g,"namespace")?g.namespace.split("."):[];if(D=be=z=I=I||_,!(I.nodeType===3||I.nodeType===8)&&!qe.test(ke+b.event.triggered)&&(ke.indexOf(".")>-1&&(ft=ke.split("."),ke=ft.shift(),ft.sort()),U=ke.indexOf(":")<0&&"on"+ke,g=g[b.expando]?g:new b.Event(ke,typeof g=="object"&&g),g.isTrigger=R?2:3,g.namespace=ft.join("."),g.rnamespace=g.namespace?new RegExp("(^|\\.)"+ft.join("\\.(?:.*\\.|)")+"(\\.|$)"):null,g.result=void 0,g.target||(g.target=I),v=v==null?[g]:b.makeArray(v,[g]),de=b.event.special[ke]||{},!(!R&&de.trigger&&de.trigger.apply(I,v)===!1))){if(!R&&!de.noBubble&&!p(I)){for(G=de.delegateType||ke,qe.test(G+ke)||(D=D.parentNode);D;D=D.parentNode)le.push(D),z=D;z===(I.ownerDocument||_)&&le.push(z.defaultView||z.parentWindow||t)}for(L=0;(D=le[L++])&&!g.isPropagationStopped();)be=D,g.type=L>1?G:de.bindType||ke,te=(Se.get(D,"events")||Object.create(null))[g.type]&&Se.get(D,"handle"),te&&te.apply(D,v),te=U&&D[U],te&&te.apply&&Xt(D)&&(g.result=te.apply(D,v),g.result===!1&&g.preventDefault());return g.type=ke,!R&&!g.isDefaultPrevented()&&(!de._default||de._default.apply(le.pop(),v)===!1)&&Xt(I)&&U&&m(I[ke])&&!p(I)&&(z=I[U],z&&(I[U]=null),b.event.triggered=ke,g.isPropagationStopped()&&be.addEventListener(ke,Qe),I[ke](),g.isPropagationStopped()&&be.removeEventListener(ke,Qe),b.event.triggered=void 0,z&&(I[U]=z)),g.result}},simulate:function(g,v,I){var R=b.extend(new b.Event,I,{type:g,isSimulated:!0});b.event.trigger(R,null,v)}}),b.fn.extend({trigger:function(g,v){return this.each(function(){b.event.trigger(g,v,this)})},triggerHandler:function(g,v){var I=this[0];if(I)return b.event.trigger(g,v,I,!0)}});var st=/\[\]$/,Jt=/\r?\n/g,Ut=/^(?:submit|button|image|reset|file)$/i,Ht=/^(?:input|select|textarea|keygen)/i;function Bi(g,v,I,R){var L;if(Array.isArray(v))b.each(v,function(D,z){I||st.test(g)?R(g,z):Bi(g+"["+(typeof z=="object"&&z!=null?D:"")+"]",z,I,R)});else if(!I&&T(v)==="object")for(L in v)Bi(g+"["+L+"]",v[L],I,R);else R(g,v)}b.param=function(g,v){var I,R=[],L=function(D,z){var G=m(z)?z():z;R[R.length]=encodeURIComponent(D)+"="+encodeURIComponent(G??"")};if(g==null)return"";if(Array.isArray(g)||g.jquery&&!b.isPlainObject(g))b.each(g,function(){L(this.name,this.value)});else for(I in g)Bi(I,g[I],v,L);return 
R.join("&")},b.fn.extend({serialize:function(){return b.param(this.serializeArray())},serializeArray:function(){return this.map(function(){var g=b.prop(this,"elements");return g?b.makeArray(g):this}).filter(function(){var g=this.type;return this.name&&!b(this).is(":disabled")&&Ht.test(this.nodeName)&&!Ut.test(g)&&(this.checked||!Aa.test(g))}).map(function(g,v){var I=b(this).val();return I==null?null:Array.isArray(I)?b.map(I,function(R){return{name:v.name,value:R.replace(Jt,`\r -`)}}):{name:v.name,value:I.replace(Jt,`\r -`)}}).get()}});var $t=/%20/g,In=/#.*$/,Tn=/([?&])_=[^&]*/,on=/^(.*?):[ \t]*([^\r\n]*)$/mg,mn=/^(?:about|app|app-storage|.+-extension|file|res|widget):$/,yp=/^(?:GET|HEAD)$/,wp=/^\/\//,ih={},nh={},rh="*/".concat("*"),sh=_.createElement("a");sh.href=se.href;function Sp(g){return function(v,I){typeof v!="string"&&(I=v,v="*");var R,L=0,D=v.toLowerCase().match(li)||[];if(m(I))for(;R=D[L++];)R[0]==="+"?(R=R.slice(1)||"*",(g[R]=g[R]||[]).unshift(I)):(g[R]=g[R]||[]).push(I)}}function Jk(g,v,I,R){var L={},D=g===nh;function z(G){var U;return L[G]=!0,b.each(g[G]||[],function(te,de){var be=de(v,I,R);if(typeof be=="string"&&!D&&!L[be])return v.dataTypes.unshift(be),z(be),!1;if(D)return!(U=be)}),U}return z(v.dataTypes[0])||!L["*"]&&z("*")}function a1(g,v){var I,R,L=b.ajaxSettings.flatOptions||{};for(I in v)v[I]!==void 0&&((L[I]?g:R||(R={}))[I]=v[I]);return R&&b.extend(!0,g,R),g}function L8(g,v,I){for(var R,L,D,z,G=g.contents,U=g.dataTypes;U[0]==="*";)U.shift(),R===void 0&&(R=g.mimeType||v.getResponseHeader("Content-Type"));if(R){for(L in G)if(G[L]&&G[L].test(R)){U.unshift(L);break}}if(U[0]in I)D=U[0];else{for(L in I){if(!U[0]||g.converters[L+" "+U[0]]){D=L;break}z||(z=L)}D=D||z}if(D)return D!==U[0]&&U.unshift(D),I[D]}function N8(g,v,I,R){var L,D,z,G,U,te={},de=g.dataTypes.slice();if(de[1])for(z in g.converters)te[z.toLowerCase()]=g.converters[z];for(D=de.shift();D;)if(g.responseFields[D]&&(I[g.responseFields[D]]=v),!U&&R&&g.dataFilter&&(v=g.dataFilter(v,g.dataType)),U=D,D=de.shift(),D){if(D==="*")D=U;else if(U!=="*"&&U!==D){if(z=te[U+" "+D]||te["* "+D],!z){for(L in te)if(G=L.split(" "),G[1]===D&&(z=te[U+" "+G[0]]||te["* "+G[0]],z)){z===!0?z=te[L]:te[L]!==!0&&(D=G[0],de.unshift(G[1]));break}}if(z!==!0)if(z&&g.throws)v=z(v);else try{v=z(v)}catch(be){return{state:"parsererror",error:z?be:"No conversion from "+U+" to "+D}}}}return{state:"success",data:v}}b.extend({active:0,lastModified:{},etag:{},ajaxSettings:{url:se.href,type:"GET",isLocal:mn.test(se.protocol),global:!0,processData:!0,async:!0,contentType:"application/x-www-form-urlencoded; charset=UTF-8",accepts:{"*":rh,text:"text/plain",html:"text/html",xml:"application/xml, text/xml",json:"application/json, text/javascript"},contents:{xml:/\bxml\b/,html:/\bhtml/,json:/\bjson\b/},responseFields:{xml:"responseXML",text:"responseText",json:"responseJSON"},converters:{"* text":String,"text html":!0,"text json":JSON.parse,"text xml":b.parseXML},flatOptions:{url:!0,context:!0}},ajaxSetup:function(g,v){return v?a1(a1(g,b.ajaxSettings),v):a1(b.ajaxSettings,g)},ajaxPrefilter:Sp(ih),ajaxTransport:Sp(nh),ajax:function(g,v){typeof g=="object"&&(v=g,g=void 0),v=v||{};var I,R,L,D,z,G,U,te,de,be,le=b.ajaxSetup({},v),ke=le.context||le,ft=le.context&&(ke.nodeType||ke.jquery)?b(ke):b.event,Nt=b.Deferred(),St=b.Callbacks("once memory"),Hn=le.statusCode||{},pn={},Fo={},Wo="canceled",kt={readyState:0,getResponseHeader:function(Pt){var Hi;if(U){if(!D)for(D={};Hi=on.exec(L);)D[Hi[1].toLowerCase()+" "]=(D[Hi[1].toLowerCase()+" 
"]||[]).concat(Hi[2]);Hi=D[Pt.toLowerCase()+" "]}return Hi==null?null:Hi.join(", ")},getAllResponseHeaders:function(){return U?L:null},setRequestHeader:function(Pt,Hi){return U==null&&(Pt=Fo[Pt.toLowerCase()]=Fo[Pt.toLowerCase()]||Pt,pn[Pt]=Hi),this},overrideMimeType:function(Pt){return U==null&&(le.mimeType=Pt),this},statusCode:function(Pt){var Hi;if(Pt)if(U)kt.always(Pt[kt.status]);else for(Hi in Pt)Hn[Hi]=[Hn[Hi],Pt[Hi]];return this},abort:function(Pt){var Hi=Pt||Wo;return I&&I.abort(Hi),su(0,Hi),this}};if(Nt.promise(kt),le.url=((g||le.url||se.href)+"").replace(wp,se.protocol+"//"),le.type=v.method||v.type||le.method||le.type,le.dataTypes=(le.dataType||"*").toLowerCase().match(li)||[""],le.crossDomain==null){G=_.createElement("a");try{G.href=le.url,G.href=G.href,le.crossDomain=sh.protocol+"//"+sh.host!=G.protocol+"//"+G.host}catch{le.crossDomain=!0}}if(le.data&&le.processData&&typeof le.data!="string"&&(le.data=b.param(le.data,le.traditional)),Jk(ih,le,v,kt),U)return kt;te=b.event&&le.global,te&&b.active++===0&&b.event.trigger("ajaxStart"),le.type=le.type.toUpperCase(),le.hasContent=!yp.test(le.type),R=le.url.replace(In,""),le.hasContent?le.data&&le.processData&&(le.contentType||"").indexOf("application/x-www-form-urlencoded")===0&&(le.data=le.data.replace($t,"+")):(be=le.url.slice(R.length),le.data&&(le.processData||typeof le.data=="string")&&(R+=($e.test(R)?"&":"?")+le.data,delete le.data),le.cache===!1&&(R=R.replace(Tn,"$1"),be=($e.test(R)?"&":"?")+"_="+Ee.guid+++be),le.url=R+be),le.ifModified&&(b.lastModified[R]&&kt.setRequestHeader("If-Modified-Since",b.lastModified[R]),b.etag[R]&&kt.setRequestHeader("If-None-Match",b.etag[R])),(le.data&&le.hasContent&&le.contentType!==!1||v.contentType)&&kt.setRequestHeader("Content-Type",le.contentType),kt.setRequestHeader("Accept",le.dataTypes[0]&&le.accepts[le.dataTypes[0]]?le.accepts[le.dataTypes[0]]+(le.dataTypes[0]!=="*"?", "+rh+"; q=0.01":""):le.accepts["*"]);for(de in le.headers)kt.setRequestHeader(de,le.headers[de]);if(le.beforeSend&&(le.beforeSend.call(ke,kt,le)===!1||U))return kt.abort();if(Wo="abort",St.add(le.complete),kt.done(le.success),kt.fail(le.error),I=Jk(nh,le,v,kt),!I)su(-1,"No Transport");else{if(kt.readyState=1,te&&ft.trigger("ajaxSend",[kt,le]),U)return kt;le.async&&le.timeout>0&&(z=t.setTimeout(function(){kt.abort("timeout")},le.timeout));try{U=!1,I.send(pn,su)}catch(Pt){if(U)throw Pt;su(-1,Pt)}}function su(Pt,Hi,Ep,c1){var $o,Mp,qo,Gl,Yl,Ms=Hi;U||(U=!0,z&&t.clearTimeout(z),I=void 0,L=c1||"",kt.readyState=Pt>0?4:0,$o=Pt>=200&&Pt<300||Pt===304,Ep&&(Gl=L8(le,kt,Ep)),!$o&&b.inArray("script",le.dataTypes)>-1&&b.inArray("json",le.dataTypes)<0&&(le.converters["text script"]=function(){}),Gl=N8(le,Gl,kt,$o),$o?(le.ifModified&&(Yl=kt.getResponseHeader("Last-Modified"),Yl&&(b.lastModified[R]=Yl),Yl=kt.getResponseHeader("etag"),Yl&&(b.etag[R]=Yl)),Pt===204||le.type==="HEAD"?Ms="nocontent":Pt===304?Ms="notmodified":(Ms=Gl.state,Mp=Gl.data,qo=Gl.error,$o=!qo)):(qo=Ms,(Pt||!Ms)&&(Ms="error",Pt<0&&(Pt=0))),kt.status=Pt,kt.statusText=(Hi||Ms)+"",$o?Nt.resolveWith(ke,[Mp,Ms,kt]):Nt.rejectWith(ke,[kt,Ms,qo]),kt.statusCode(Hn),Hn=void 0,te&&ft.trigger($o?"ajaxSuccess":"ajaxError",[kt,le,$o?Mp:qo]),St.fireWith(ke,[kt,Ms]),te&&(ft.trigger("ajaxComplete",[kt,le]),--b.active||b.event.trigger("ajaxStop")))}return kt},getJSON:function(g,v,I){return b.get(g,v,I,"json")},getScript:function(g,v){return b.get(g,void 0,v,"script")}}),b.each(["get","post"],function(g,v){b[v]=function(I,R,L,D){return m(R)&&(D=D||L,L=R,R=void 
0),b.ajax(b.extend({url:I,type:v,dataType:D,data:R,success:L},b.isPlainObject(I)&&I))}}),b.ajaxPrefilter(function(g){var v;for(v in g.headers)v.toLowerCase()==="content-type"&&(g.contentType=g.headers[v]||"")}),b._evalUrl=function(g,v,I){return b.ajax({url:g,type:"GET",dataType:"script",cache:!0,async:!1,global:!1,converters:{"text script":function(){}},dataFilter:function(R){b.globalEval(R,v,I)}})},b.fn.extend({wrapAll:function(g){var v;return this[0]&&(m(g)&&(g=g.call(this[0])),v=b(g,this[0].ownerDocument).eq(0).clone(!0),this[0].parentNode&&v.insertBefore(this[0]),v.map(function(){for(var I=this;I.firstElementChild;)I=I.firstElementChild;return I}).append(this)),this},wrapInner:function(g){return m(g)?this.each(function(v){b(this).wrapInner(g.call(this,v))}):this.each(function(){var v=b(this),I=v.contents();I.length?I.wrapAll(g):v.append(g)})},wrap:function(g){var v=m(g);return this.each(function(I){b(this).wrapAll(v?g.call(this,I):g)})},unwrap:function(g){return this.parent(g).not("body").each(function(){b(this).replaceWith(this.childNodes)}),this}}),b.expr.pseudos.hidden=function(g){return!b.expr.pseudos.visible(g)},b.expr.pseudos.visible=function(g){return!!(g.offsetWidth||g.offsetHeight||g.getClientRects().length)},b.ajaxSettings.xhr=function(){try{return new t.XMLHttpRequest}catch{}};var D8={0:200,1223:204},Cp=b.ajaxSettings.xhr();h.cors=!!Cp&&"withCredentials"in Cp,h.ajax=Cp=!!Cp,b.ajaxTransport(function(g){var v,I;if(h.cors||Cp&&!g.crossDomain)return{send:function(R,L){var D,z=g.xhr();if(z.open(g.type,g.url,g.async,g.username,g.password),g.xhrFields)for(D in g.xhrFields)z[D]=g.xhrFields[D];g.mimeType&&z.overrideMimeType&&z.overrideMimeType(g.mimeType),!g.crossDomain&&!R["X-Requested-With"]&&(R["X-Requested-With"]="XMLHttpRequest");for(D in R)z.setRequestHeader(D,R[D]);v=function(G){return function(){v&&(v=I=z.onload=z.onerror=z.onabort=z.ontimeout=z.onreadystatechange=null,G==="abort"?z.abort():G==="error"?typeof z.status!="number"?L(0,"error"):L(z.status,z.statusText):L(D8[z.status]||z.status,z.statusText,(z.responseType||"text")!=="text"||typeof z.responseText!="string"?{binary:z.response}:{text:z.responseText},z.getAllResponseHeaders()))}},z.onload=v(),I=z.onerror=z.ontimeout=v("error"),z.onabort!==void 0?z.onabort=I:z.onreadystatechange=function(){z.readyState===4&&t.setTimeout(function(){v&&I()})},v=v("abort");try{z.send(g.hasContent&&g.data||null)}catch(G){if(v)throw G}},abort:function(){v&&v()}}}),b.ajaxPrefilter(function(g){g.crossDomain&&(g.contents.script=!1)}),b.ajaxSetup({accepts:{script:"text/javascript, application/javascript, application/ecmascript, application/x-ecmascript"},contents:{script:/\b(?:java|ecma)script\b/},converters:{"text script":function(g){return b.globalEval(g),g}}}),b.ajaxPrefilter("script",function(g){g.cache===void 0&&(g.cache=!1),g.crossDomain&&(g.type="GET")}),b.ajaxTransport("script",function(g){if(g.crossDomain||g.scriptAttrs){var v,I;return{send:function(R,L){v=b("

2 Linear Quadratic Regulators

2.1 Introduction

Up to this point, we have considered decision problems with finitely many states and actions. However, in many applications, states and actions may take on continuous values. For example, consider autonomous driving, controlling a robot’s joints, and automated manufacturing. How can we teach computers to solve these kinds of problems? This is the task of continuous control.

Figure 2.1: Solving a Rubik’s Cube with a robot hand.

Figure 2.2: Boston Dynamics’s Spot robot.

Aside from the change in the state and action spaces, the general problem setup remains the same: we seek to construct an optimal policy that outputs actions to solve the desired task. We will see that many key ideas and algorithms, in particular dynamic programming algorithms, carry over to this new setting.

This chapter introduces a fundamental tool to solve a simple class of continuous control problems: the linear quadratic regulator. We will then extend this basic method to more complex settings.

Example 2.1 (CartPole)

Try to balance a pencil on its point on a flat surface. It’s much more difficult than it may first seem: the position of the pencil varies continuously, and the state transitions governing the system, i.e. the laws of physics, are highly complex. This task is equivalent to the classic control problem known as CartPole:

The state $\st \in \mathbb{R}^4$ can be described by:

  1. the position of the cart;

  2. the velocity of the cart;

  3. the angle of the pole;

  4. the angular velocity of the pole.

We can control the cart by applying a horizontal force $\act \in \mathbb{R}$.

Goal: Stabilize the cart around an ideal state and action $(\st^\star, \act^\star)$.

2.2 Optimal control

Recall that an MDP is defined by its state space $\mathcal{S}$, action space $\mathcal{A}$, state transitions $P$, reward function $r$, and discount factor γ or time horizon $\hor$. These have equivalents in the control setting:

  - The state and action spaces are continuous rather than finite. That is, $\mathcal{S} \subseteq \mathbb{R}^{n_\st}$ and $\mathcal{A} \subseteq \mathbb{R}^{n_\act}$, where $n_\st$ and $n_\act$ are the corresponding dimensions of these spaces, i.e. the number of coordinates needed to specify a single state or action respectively.

  - We call the state transitions the dynamics of the system. In the most general case, these might change across timesteps and also include some stochastic noise $w_\hi$ at each timestep. We denote these dynamics as the function $f_\hi$ such that $\st_{\hi+1} = f_\hi(\st_\hi, \act_\hi, w_\hi)$. Of course, we can simplify to cases where the dynamics are deterministic/noise-free (no $w_\hi$ term) and/or time-homogeneous (the same function $f$ across timesteps).

  - Instead of maximizing the reward function, we seek to minimize the cost function $c_\hi: \mathcal{S} \times \mathcal{A} \to \mathbb{R}$. Often, the cost function describes how far away we are from a target state-action pair $(\st^\star, \act^\star)$. An important special case is when the cost is time-homogeneous; that is, it remains the same function $c$ at each timestep $\hi$.

  - We seek to minimize the undiscounted cost within a finite time horizon $\hor$. Note that we end an episode at the final state $\st_\hor$ -- there is no $\act_\hor$, and so we denote the cost for the final state as $c_\hor(\st_\hor)$.

With all of these components, we can now formulate the optimal control problem: compute a policy to minimize the expected undiscounted cost over $\hor$ timesteps. In this chapter, we will only consider deterministic, time-dependent policies $\pi = (\pi_0, \dots, \pi_{H-1})$ where $\pi_h : \mathcal{S} \to \mathcal{A}$ for each $\hi \in [\hor]$.

Definition 2.1 (General optimal control problem)

$$
\begin{split}
\min_{\pi_0, \dots, \pi_{\hor-1} : \mathcal{S} \to \mathcal{A}} \quad & \E \left[ \left( \sum_{\hi=0}^{\hor-1} c_\hi(\st_\hi, \act_\hi) \right) + c_\hor(\st_\hor) \right] \\
\text{where} \quad & \st_{\hi+1} = f_\hi(\st_\hi, \act_\hi, w_\hi), \\
& \act_\hi = \pi_\hi(\st_\hi) \\
& \st_0 \sim \mu_0 \\
& w_\hi \sim \text{noise}
\end{split}
$$

2.2.1 A first attempt: Discretization

Can we solve this problem using tools from the finite MDP setting? If $\mathcal{S}$ and $\mathcal{A}$ were finite, then we’d be able to work backwards using the DP algorithm for computing the optimal policy in an MDP (Definition 1.11). This inspires us to try discretizing the problem.

Suppose $\mathcal{S}$ and $\mathcal{A}$ are bounded, that is, $\max_{\st \in \mathcal{S}} \|\st\| \le B_\st$ and $\max_{\act \in \mathcal{A}} \|\act\| \le B_\act$. To make $\mathcal{S}$ and $\mathcal{A}$ finite, let’s choose some small positive ε, and simply round each coordinate to the nearest multiple of ε. For example, if $\varepsilon = 0.01$, then we round each element of $\st$ and $\act$ to two decimal spaces.

However, the discretized $\widetilde{\mathcal{S}}$ and $\widetilde{\mathcal{A}}$ may be finite, but they may be infeasibly large: we must divide each dimension into intervals of length $\varepsilon$, resulting in $|\widetilde{\mathcal{S}}| = (B_\st/\varepsilon)^{n_\st}$ and $|\widetilde{\mathcal{A}}| = (B_\act/\varepsilon)^{n_\act}$. To get a sense of how quickly this grows, consider $\varepsilon = 0.01,\ n_\st = n_\act = 10$. Then the number of elements in the transition matrix would be $|\widetilde{\mathcal{S}}|^2 |\widetilde{\mathcal{A}}| = (100^{10})^2 (100^{10}) = 10^{60}$! (That’s a trillion trillion trillion trillion trillion.)

What properties of the problem could we instead make use of? Note that by discretizing the state and action spaces, we implicitly assumed that rounding each state or action vector by some tiny amount $\varepsilon$ wouldn’t change the behavior of the system by much; namely, that the cost and dynamics were relatively continuous. Can we use this continuous structure in other ways? This leads us to the linear quadratic regulator.
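As a quick sanity check on the count above, the following sketch (assuming for concreteness that every coordinate lies in $[0, 1]$, so $B_\st = B_\act = 1$) reproduces the $10^{60}$ figure:

```python
import math

eps = 0.01
n_x = n_u = 10
vals_per_axis = round(1 / eps)              # 100 grid points per coordinate
n_states = vals_per_axis ** n_x             # |S~| = 100^10
n_actions = vals_per_axis ** n_u            # |A~| = 100^10
table_entries = n_states ** 2 * n_actions   # |S~|^2 |A~| entries in the transition table
print(math.log10(table_entries))            # 60.0, i.e. 10^60 entries
```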

2.3 The Linear Quadratic Regulator

The optimal control problem (Definition 2.1) seems highly complex in general. Is there a relevant simplification that we can analyze? The linear quadratic regulator (LQR) is a solvable case and a fundamental tool in control theory.

Definition (The linear quadratic regulator)

The LQR problem is a special case of the general optimal control problem (Definition 2.1) with linear dynamics and an upward-curved quadratic cost function. Solving the LQR problem will additionally enable us to locally approximate more complex setups using Taylor approximations.

Linear, time-homogeneous dynamics: for each timestep $\hi \in [\hor]$,

$$
\begin{aligned}
\st_{\hi+1} &= f(\st_\hi, \act_\hi, w_\hi) = A \st_\hi + B \act_\hi + w_\hi \\
\text{where } w_\hi &\sim \mathcal{N}(0, \sigma^2 I).
\end{aligned}
$$

Here, $w_\hi$ is a spherical Gaussian noise term that makes the dynamics random. Setting $\sigma = 0$ gives us deterministic state transitions. We will find that the optimal policy actually does not depend on the noise, although the optimal value function and Q-function do.

Upward-curved quadratic, time-homogeneous cost function:

$$
c(\st_\hi, \act_\hi) = \begin{cases}
\st_\hi^\top Q \st_\hi + \act_\hi^\top R \act_\hi & \hi < \hor \\
\st_\hi^\top Q \st_\hi & \hi = \hor
\end{cases}.
$$

This cost function attempts to stabilize the state and action about $(\st^\star, \act^\star) = (0, 0)$. We require $Q \in \mathbb{R}^{n_\st \times n_\st}$ and $R \in \mathbb{R}^{n_\act \times n_\act}$ to both be positive definite matrices so that $c$ has a well-defined unique minimum. We can furthermore assume without loss of generality that they are both symmetric.

This results in the LQR optimization problem:

$$
\begin{aligned}
\min_{\pi_0, \dots, \pi_{\hor-1} : \mathcal{S} \to \mathcal{A}} \quad & \E \left[ \left( \sum_{\hi=0}^{\hor-1} \st_\hi^\top Q \st_\hi + \act_\hi^\top R \act_\hi \right) + \st_\hor^\top Q \st_\hor \right] \\
\text{where} \quad & \st_{\hi+1} = A \st_\hi + B \act_\hi + w_\hi \\
& \act_\hi = \pi_\hi(\st_\hi) \\
& w_\hi \sim \mathcal{N}(0, \sigma^2 I) \\
& \st_0 \sim \mu_0.
\end{aligned}
$$

We will henceforth abbreviate “symmetric positive definite” as s.p.d. and “positive definite” as p.d.

It will be helpful to reintroduce the value function notation for a policy to denote the average cost it incurs. These will be instrumental in constructing the optimal policy via dynamic programming, as we did in Section 1.3.2 for MDPs.
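As a concrete illustration of "the average cost a policy incurs", here is a small Monte Carlo sketch: it rolls out the linear dynamics $\st_{\hi+1} = A \st_\hi + B \act_\hi + w_\hi$ under a policy and averages the quadratic cost. The specific matrices and the "do nothing" policy are made up for illustration only.

```python
import numpy as np

def rollout_cost(A, B, Q, R, pi, H, x0, sigma=0.1, rng=None):
    """Simulate x_{h+1} = A x_h + B u_h + w_h under policy pi and return the
    total quadratic cost over H steps, plus the terminal cost x_H^T Q x_H."""
    rng = np.random.default_rng() if rng is None else rng
    x, cost = x0, 0.0
    for h in range(H):
        u = pi(x, h)
        cost += x @ Q @ x + u @ R @ u
        w = sigma * rng.standard_normal(x.shape)
        x = A @ x + B @ u + w
    return cost + x @ Q @ x

# Example: 2-dimensional state, 1-dimensional action, "do nothing" policy.
A = np.array([[1.0, 0.1], [0.0, 1.0]])
B = np.array([[0.0], [0.1]])
Q, R = np.eye(2), np.eye(1)
costs = [rollout_cost(A, B, Q, R, lambda x, h: np.zeros(1), H=20,
                      x0=np.array([1.0, 0.0])) for _ in range(100)]
print(np.mean(costs))   # Monte Carlo estimate of this policy's expected cost
```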

2.4 Optimality and the Riccati Equation

In this section, we’ll compute the optimal value function $V^\star_\hi$, Q-function $Q^\star_\hi$, and policy $\pi^\star_\hi$ in the LQR setting using dynamic programming, in a very similar way to the DP algorithms in the MDP setting. Recall the definition of the optimal value function:
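The definitions themselves are omitted from this excerpt; they presumably mirror the finite-horizon MDP definitions, adapted to cost minimization. A sketch:

$$
V^\star_\hi(\st) = \min_{\pi_\hi, \dots, \pi_{\hor-1}} \E \left[ \left( \sum_{i=\hi}^{\hor-1} c(\st_i, \act_i) \right) + c_\hor(\st_\hor) \;\middle|\; \st_\hi = \st \right]
$$

$$
Q^\star_\hi(\st, \act) = \min_{\pi_{\hi+1}, \dots, \pi_{\hor-1}} \E \left[ \left( \sum_{i=\hi}^{\hor-1} c(\st_i, \act_i) \right) + c_\hor(\st_\hor) \;\middle|\; (\st_\hi, \act_\hi) = (\st, \act) \right]
$$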


Both of the definitions above assume deterministic policies. Otherwise we would have to take an expectation over actions drawn from the policy, i.e. $\act_\hi \sim \pi_\hi(\st_\hi)$.

We will prove the striking fact that the solution has very simple structure: $V_\hi^\star$ and $Q^\star_\hi$ are upward-curved quadratics and $\pi_\hi^\star$ is linear and furthermore does not depend on the noise!

Now we’ve shown that $V^\star_\hi(\st) = \st^\top P_\hi \st + p_\hi$, where $P_\hi$ is s.p.d., confirming the claim above: the optimal value function is an upward-curved quadratic, and the optimal controller is a linear policy.
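The derivation itself is omitted from this excerpt, but the backward recursion for the matrices $P_\hi$ and the linear gains $K_\hi$ is standard. The following sketch implements the usual finite-horizon, discrete-time Riccati recursion (ignoring the noise offset $p_\hi$, which does not affect the policy) and rolls out the resulting linear policy $\act_\hi = -K_\hi \st_\hi$ on made-up matrices.

```python
import numpy as np

def riccati_recursion(A, B, Q, R, H):
    """Backward pass of the standard finite-horizon discrete-time Riccati
    recursion. Returns the gains K_h defining the linear policy u_h = -K_h x_h."""
    P = Q.copy()                     # P_H = Q (terminal cost x^T Q x)
    gains = [None] * H
    for h in reversed(range(H)):
        K = np.linalg.solve(R + B.T @ P @ B, B.T @ P @ A)   # K_h
        P = Q + A.T @ P @ A - A.T @ P @ B @ K                # P_h
        gains[h] = K
    return gains

A = np.array([[1.0, 0.1], [0.0, 1.0]])
B = np.array([[0.0], [0.1]])
Q, R = np.eye(2), 0.1 * np.eye(1)
K = riccati_recursion(A, B, Q, R, H=50)
x = np.array([1.0, 0.0])
for h in range(50):                  # roll out the resulting linear policy
    u = -K[h] @ x
    x = A @ x + B @ u
print(x)                             # the state is driven toward the origin
```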

2.4.1 Expected state at time $\hi$

How can we compute the expected state at time $\hi$ when acting according to the optimal policy? Let’s first express $\st_\hi$ in a cleaner way in terms of the history. Note that having linear dynamics makes it easy to expand terms backwards in time:

$$
\begin{aligned}
\st_\hi & = A \st_{\hi-1} + B \act_{\hi-1} + w_{\hi-1} \\
& = A (A\st_{\hi-2} + B \act_{\hi-2} + w_{\hi-2}) + B \act_{\hi-1} + w_{\hi-1} \\
& = \cdots \\
& = A^\hi \st_0 + \sum_{i=0}^{\hi-1} A^i (B \act_{\hi-i-1} + w_{\hi-i-1}).
\end{aligned}
$$

Let’s consider the average state at this time, given all the past states and actions. Since we assume that $\E[w_\hi] = 0$ (this is the zero vector in $d$ dimensions), when we take an expectation, the $w_\hi$ term vanishes due to linearity, and so we’re left with

$$
\E[\st_\hi \mid \st_{0:(\hi-1)}, \act_{0:(\hi-1)}] = A^\hi \st_0 + \sum_{i=0}^{\hi-1} A^i B \act_{\hi-i-1}.
$$
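A quick numerical check of this identity, with arbitrary matrices and an arbitrary fixed action sequence (all made up for illustration):

```python
import numpy as np

rng = np.random.default_rng(0)
n_x, n_u, h = 3, 2, 5
A = rng.normal(size=(n_x, n_x))
B = rng.normal(size=(n_x, n_u))
x0 = rng.normal(size=n_x)
us = [rng.normal(size=n_u) for _ in range(h)]

# Closed form: A^h x_0 + sum_{i=0}^{h-1} A^i B u_{h-i-1}
expected = np.linalg.matrix_power(A, h) @ x0 + sum(
    np.linalg.matrix_power(A, i) @ B @ us[h - i - 1] for i in range(h)
)

# Rolling out the noiseless dynamics x_{k+1} = A x_k + B u_k gives the same vector.
x = x0
for u in us:
    x = A @ x + B @ u
print(np.allclose(x, expected))  # True
```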

This introduces the quantity $A - B K_i$, which shows up frequently in control theory. For example, one important question is: will $\st_\hi$ remain bounded, or will it go to infinity as time goes on? To answer this, let’s imagine for simplicity that these $K_i$s are equal (call this matrix $K$). Then the expression above becomes $(A - BK)^\hi \st_0$. Now consider the maximum eigenvalue $\lambda_{\max}$ of $A - BK$. If $|\lambda_{\max}| > 1$, then there’s some nonzero initial state $\bar\st_0$, the corresponding eigenvector, for which

$$
\lim_{\hi \to \infty} (A - BK)^\hi \bar\st_0 = \lim_{\hi \to \infty} \lambda_{\max}^\hi \bar\st_0 = \infty.
$$

Otherwise, if $|\lambda_{\max}| < 1$, then it’s impossible for your original state to explode as dramatically.
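In code, this stability check amounts to inspecting the spectral radius of the closed-loop matrix $A - BK$. A sketch with made-up matrices and a made-up gain (not the LQR-optimal one):

```python
import numpy as np

A = np.array([[1.0, 0.1], [0.0, 1.0]])
B = np.array([[0.0], [0.1]])
K = np.array([[10.0, 5.0]])
closed_loop = A - B @ K
rho = max(abs(np.linalg.eigvals(closed_loop)))
# rho < 1 means (A - BK)^h x_0 -> 0; rho > 1 means some states blow up.
print(rho)   # about 0.77 here, so the expected state decays
```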

2.5 Extensions

We’ve now formulated an optimal solution for the time-homogeneous LQR and computed the expected state under the optimal policy. However, real world tasks rarely have such simple dynamics, and we may wish to design more complex cost functions. In this section, we’ll consider more general extensions of the basic LQR setup. First, consider the case where the dynamics and cost function are time-dependent. Our analysis remains almost identical; in fact, we can simply add a time index to the matrices $A$ and $B$ that determine the dynamics and the matrices $Q$ and $R$ that determine the cost.

The modified problem is now defined as follows:
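The precise statement is omitted from this excerpt, but following the "add a time index" recipe above, the time-varying problem presumably reads as follows:

$$
\begin{aligned}
\min_{\pi_0, \dots, \pi_{\hor-1} : \mathcal{S} \to \mathcal{A}} \quad & \E \left[ \left( \sum_{\hi=0}^{\hor-1} \st_\hi^\top Q_\hi \st_\hi + \act_\hi^\top R_\hi \act_\hi \right) + \st_\hor^\top Q_\hor \st_\hor \right] \\
\text{where} \quad & \st_{\hi+1} = A_\hi \st_\hi + B_\hi \act_\hi + w_\hi \\
& \act_\hi = \pi_\hi(\st_\hi) \\
& w_\hi \sim \mathcal{N}(0, \sigma^2 I) \\
& \st_0 \sim \mu_0.
\end{aligned}
$$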

The derivation of the optimal value functions and the optimal policy remains almost exactly the same, and we can modify the Riccati equation accordingly:
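As a sketch of what "adding a time index" looks like computationally, here is the backward Riccati pass with per-timestep matrices (again the standard recursion with made-up inputs; the notes’ exact form may differ):

```python
import numpy as np

def riccati_time_varying(As, Bs, Qs, Rs):
    """Backward Riccati pass with time-indexed matrices A_h, B_h, Q_h, R_h.
    Qs has H + 1 entries; the last one plays the role of the terminal cost."""
    H = len(As)
    P = Qs[H]
    gains = [None] * H
    for h in reversed(range(H)):
        A, B, Q, R = As[h], Bs[h], Qs[h], Rs[h]
        K = np.linalg.solve(R + B.T @ P @ B, B.T @ P @ A)   # K_h
        P = Q + A.T @ P @ A - A.T @ P @ B @ K               # P_h
        gains[h] = K
    return gains

# Time-homogeneous matrices recover the earlier recursion as a special case.
A = np.array([[1.0, 0.1], [0.0, 1.0]])
B = np.array([[0.0], [0.1]])
H = 10
gains = riccati_time_varying([A] * H, [B] * H, [np.eye(2)] * (H + 1), [np.eye(1)] * H)
print(gains[0])
```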

w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I) \\\\\n & \\st_0 \\sim \\mu_0.\n\\end{aligned}π0,,πH1:SAminwhereE[(h=0H1xhQxh+uhRuh)+xHQxH]xh+1=Axh+Buh+whuh=πh(xh)whN(0,σ2I)x0μ0.","enumerator":"2.4","key":"RRzavaYicJ"}],"enumerator":"2.2","html_id":"lqr-definition","key":"Fw54VGC4e2"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Exercise","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"HqBKv6nNCp"}],"key":"tUFhNDFfJ7"},{"type":"paragraph","position":{"start":{"line":223,"column":1},"end":{"line":224,"column":1}},"children":[{"type":"text","value":"Here we’ll show that we don’t lose generality by assuming that ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"rIAQ4ptvv8"},{"type":"inlineMath","value":"Q","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"QQQ","key":"iqAaQRpoVZ"},{"type":"text","value":" and ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"IzQAJhR8ER"},{"type":"inlineMath","value":"R","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"RRR","key":"JCgBTSwsLk"},{"type":"text","value":" are symmetric.\nShow that replacing ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"S1EM3oQFW4"},{"type":"inlineMath","value":"Q","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"QQQ","key":"tr6WuyQ17u"},{"type":"text","value":" and ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"YvKtoAG505"},{"type":"inlineMath","value":"R","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"RRR","key":"dnze3IPl5Y"},{"type":"text","value":" with ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"AQBF7vmjjn"},{"type":"inlineMath","value":"(Q + Q^\\top) / 2","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"(Q+Q)/2(Q + Q^\\top) / 2(Q+Q)/2","key":"ItH1FiI3x6"},{"type":"text","value":" and ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"f2s2bCkitB"},{"type":"inlineMath","value":"(R + R^\\top) / 2","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"(R+R)/2(R + R^\\top) / 2(R+R)/2","key":"T2qUisiMcc"},{"type":"text","value":" (which are symmetric) yields the same cost function.","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"bJ2BEH3rAX"}],"key":"nULxYNK56q"}],"key":"v2N5G1ey91"},{"type":"paragraph","position":{"start":{"line":227,"column":1},"end":{"line":228,"column":1}},"children":[{"type":"text","value":"We will henceforth abbreviate “symmetric positive definite” as s.p.d.\nand “positive definite” as p.d.","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"SpPa9VXUTv"}],"key":"YoKtbfABip"},{"type":"paragraph","position":{"start":{"line":230,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"text","value":"It will be helpful to reintroduce the ","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"AJjTbw6rYR"},{"type":"emphasis","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"children":[{"type":"text","value":"value function","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"KWMYuKfjA2"}],"key":"ZoqQxhYzEs"},{"type":"text","value":" notation for a policy to 
It will be helpful to reintroduce the *value function* notation for a policy to denote the average cost it incurs.
These value functions will be instrumental in constructing the optimal policy via **dynamic programming**,
as we did in Section 1.3.2 for MDPs.

:::{prf:definition} Value functions for LQR
:label: value_lqr

Given a policy $\mathbf{\pi} = (\pi_0, \dots, \pi_{\hor-1})$,
we can define its value function $V^\pi_\hi : \mathcal{S} \to \mathbb{R}$ at time $\hi \in [\hor]$ as the average **cost-to-go** incurred by that policy:

$$
\begin{split}
    V^\pi_\hi (\st) &= \E \left[ \left( \sum_{i=\hi}^{\hor-1} c(\st_i, \act_i) \right) + c(\st_\hor) \mid \st_\hi = \st, \act_i = \pi_i(\st_i) \quad \forall \hi \le i < H \right] \\
    &= \E \left[ \left( \sum_{i=\hi}^{\hor-1} \st_i^\top Q \st_i + \act_i^\top R \act_i \right) + \st_\hor^\top Q \st_\hor \mid \st_\hi = \st, \act_i = \pi_i(\st_i) \quad \forall \hi \le i < H \right] \\
\end{split}
$$

The Q-function additionally conditions on the first action we take:

$$
\begin{split}
    Q^\pi_\hi (\st, \act) &= \E \bigg[ \left( \sum_{i=\hi}^{\hor-1} c(\st_i, \act_i) \right) + c(\st_\hor) \\
    &\qquad\qquad \mid (\st_\hi, \act_\hi) = (\st, \act), \act_i = \pi_i(\st_i) \quad \forall \hi < i < H \bigg] \\
    &= \E \bigg[ \left( \sum_{i=\hi}^{\hor-1} \st_i^\top Q \st_i + \act_i^\top R \act_i \right) + \st_\hor^\top Q \st_\hor \\
    &\qquad\qquad \mid (\st_\hi, \act_\hi) = (\st, \act), \act_i = \pi_i(\st_i) \quad \forall \hi < i < H \bigg] \\
\end{split}
$$

Note that since we use *cost* instead of *reward*,
the best policies are the ones with *smaller* values of the value function.
:::
","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"fyTw9Jv6YF"},{"type":"crossReference","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"in the MDP setting","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"NtueF0ITZK"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"ruSfrtRL1k"},{"type":"text","value":".\nRecall the definition of the optimal value function:","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"hYIHPpxvoy"}],"key":"tCTzIELKgY"},{"type":"proof","kind":"definition","label":"optimal_value_lqr","identifier":"optimal_value_lqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal value function in LQR","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"HCeRqpxPH5"}],"key":"PAmqLxDImy"},{"type":"paragraph","position":{"start":{"line":275,"column":1},"end":{"line":277,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"QYqcaPm3IM"},{"type":"strong","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"optimal value function","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"ibpnChbofg"}],"key":"IVK0anFOsj"},{"type":"text","value":" is the one that,\nat any time and in any state,\nachieves ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"B0yfyJs93d"},{"type":"emphasis","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"minimum cost","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"a85E4RW2X0"}],"key":"Y5efqN5iIq"},{"type":"text","value":" across ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"GYGULcfeT1"},{"type":"emphasis","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"all policies","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"RSo88bXJ2j"}],"key":"taqPacETsa"},{"type":"text","value":":","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"O3eXWPKoOk"}],"key":"RabuWwvo2c"},{"type":"math","value":"\\begin{split}\n V^\\star_\\hi(\\st) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} V^\\pi_\\hi(\\st) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}","position":{"start":{"line":279,"column":1},"end":{"line":285,"column":1}},"html":"Vh(x)=minπh,,πH1Vhπ(x)=minπh,,πH1E[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,ui=πi(xi)hi<H]\\begin{split}\n V^\\star_\\hi(\\st) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} V^\\pi_\\hi(\\st) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) 
\\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}Vh(x)=πh,,πH1minVhπ(x)=πh,,πH1minE[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,ui=πi(xi)hi<H]","enumerator":"2.7","key":"QbXC8a8FZM"},{"type":"paragraph","position":{"start":{"line":287,"column":1},"end":{"line":288,"column":1}},"children":[{"type":"text","value":"The optimal Q-function is defined similarly,\nconditioned on the starting action as well:","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"Ff4q12N4jj"}],"key":"eKdH1hj6Ue"},{"type":"math","value":"\\begin{split}\n Q^\\star_\\hi(\\st, \\act) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} Q^\\pi_\\hi(\\st, \\act) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_\\hi = \\act, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi < i < H \\bigg] \\\\\n\\end{split}","position":{"start":{"line":290,"column":1},"end":{"line":296,"column":1}},"html":"Qh(x,u)=minπh,,πH1Qhπ(x,u)=minπh,,πH1E[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,uh=u,ui=πi(xi)h<i<H]\\begin{split}\n Q^\\star_\\hi(\\st, \\act) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} Q^\\pi_\\hi(\\st, \\act) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_\\hi = \\act, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi < i < H \\bigg] \\\\\n\\end{split}Qh(x,u)=πh,,πH1minQhπ(x,u)=πh,,πH1minE[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,uh=u,ui=πi(xi)h<i<H]","enumerator":"2.8","key":"JNaecM9MfB"},{"type":"paragraph","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"text","value":"Both of the definitions above assume ","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"Ydn6Qv2CQB"},{"type":"emphasis","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"O7KTQeHCIi"}],"key":"VXFWfGibml"},{"type":"text","value":" policies. Otherwise we would have to take an ","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"fURxXRI64u"},{"type":"emphasis","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"text","value":"expectation","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"iWYQapALRO"}],"key":"UF9I1WF284"},{"type":"text","value":" over actions drawn from the policy, i.e. 
","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"ucCCX4su4R"},{"type":"inlineMath","value":"\\act_\\hi \\sim \\pi_\\hi (\\st_\\hi)","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"html":"uhπh(xh)\\act_\\hi \\sim \\pi_\\hi (\\st_\\hi)uhπh(xh)","key":"hnVoZrEOXq"},{"type":"text","value":".","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"MmffBcIFSg"}],"key":"m7ZNpUsB6u"}],"enumerator":"2.4","html_id":"optimal-value-lqr","key":"SD4xlZ2N86"},{"type":"paragraph","position":{"start":{"line":301,"column":1},"end":{"line":303,"column":1}},"children":[{"type":"text","value":"We will prove the striking fact that the solution has very simple structure:\n","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"AxYGIrv34j"},{"type":"inlineMath","value":"V_h^\\star","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"VhV_h^\\starVh","key":"WYBNqz6wit"},{"type":"text","value":" and ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"ksJkfnZ8G3"},{"type":"inlineMath","value":"Q^\\star_h","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"QhQ^\\star_hQh","key":"rsCJtkjikL"},{"type":"text","value":" are ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"Eq4IeVCYYM"},{"type":"emphasis","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"upward-curved quadratics","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"U5qpO4XdN3"}],"key":"IPjbTmRURi"},{"type":"text","value":"\nand ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"pjEXCIz0wX"},{"type":"inlineMath","value":"\\pi_h^\\star","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"πh\\pi_h^\\starπh","key":"FLqnJzStti"},{"type":"text","value":" is ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"hg2LXnDJpP"},{"type":"emphasis","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"QQ9MgZToTd"}],"key":"lyVxwjuTVg"},{"type":"text","value":" and furthermore does not depend on the noise!","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"hrkIRrAk8C"}],"key":"D8kqrG9qZ0"},{"type":"proof","kind":"theorem","label":"optimal_value_lqr_quadratic","identifier":"optimal_value_lqr_quadratic","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal value function in LQR is an upward-curved quadratic","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"lgojp81Q94"}],"key":"eBNBSD5X8G"},{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"At each timestep ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"owMA72YkZb"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"fA2jXd6Rtw"},{"type":"text","value":",","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"mi4e7BM06Z"}],"key":"kT8u4HIZzC"},{"type":"math","value":"V^\\star_\\hi(\\st) = 
\\st^\\top P_\\hi \\st + p_\\hi","position":{"start":{"line":310,"column":1},"end":{"line":312,"column":1}},"html":"Vh(x)=xPhx+phV^\\star_\\hi(\\st) = \\st^\\top P_\\hi \\st + p_\\hiVh(x)=xPhx+ph","enumerator":"2.9","key":"kBxvbV5iSN"},{"type":"paragraph","position":{"start":{"line":314,"column":1},"end":{"line":315,"column":1}},"children":[{"type":"text","value":"for some s.p.d. matrix ","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"VeHZiX4RC8"},{"type":"inlineMath","value":"P_\\hi \\in \\mathbb{R}^{n_\\st \\times n_\\st}","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"html":"PhRnx×nxP_\\hi \\in \\mathbb{R}^{n_\\st \\times n_\\st}PhRnx×nx","key":"q5PnZoOS6r"},{"type":"text","value":" and scalar\n","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"Y6kKwsnAgZ"},{"type":"inlineMath","value":"p_\\hi \\in \\mathbb{R}","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"html":"phRp_\\hi \\in \\mathbb{R}phR","key":"c3VqVOhveH"},{"type":"text","value":".","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"EbYyDnvjwt"}],"key":"uMgXoUF1Ft"}],"enumerator":"2.1","html_id":"optimal-value-lqr-quadratic","key":"hDFLlKpCsa"},{"type":"proof","kind":"theorem","label":"optimal_policy_lqr_linear","identifier":"optimal_policy_lqr_linear","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal policy in LQR is linear","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"Fy35NvSlXp"}],"key":"pnpo9A1Okj"},{"type":"paragraph","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"children":[{"type":"text","value":"At each timestep ","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"key":"s8QBobXyWQ"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"uaUvIljcXq"},{"type":"text","value":",","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"key":"I4exMsqm19"}],"key":"frHWyWrwko"},{"type":"math","value":"\\pi^\\star_\\hi (\\st) = - K_\\hi \\st","position":{"start":{"line":323,"column":1},"end":{"line":325,"column":1}},"html":"πh(x)=Khx\\pi^\\star_\\hi (\\st) = - K_\\hi \\stπh(x)=Khx","enumerator":"2.10","key":"ELU7HnRlKm"},{"type":"paragraph","position":{"start":{"line":327,"column":1},"end":{"line":328,"column":1}},"children":[{"type":"text","value":"for some ","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"RzzVcvAaZs"},{"type":"inlineMath","value":"K_\\hi \\in \\mathbb{R}^{n_\\act \\times n_\\st}","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"html":"KhRnu×nxK_\\hi \\in \\mathbb{R}^{n_\\act \\times n_\\st}KhRnu×nx","key":"XkaXdz2BqU"},{"type":"text","value":".\n(The negative is due to convention.)","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"c1MBlYwJhG"}],"key":"ljsF8iYBbl"}],"enumerator":"2.2","html_id":"optimal-policy-lqr-linear","key":"P9a3MB7UPj"},{"type":"paragraph","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"The construction (and inductive proof) proceeds similarly to the one 
","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"dvVHlpG2VG"},{"type":"crossReference","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"in the MDP setting","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"QGOtLnDdgx"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"UkxSGy2mel"},{"type":"text","value":".","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"BP4xTyDewg"}],"key":"lBizK24rHx"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":333,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"We’ll compute ","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"yR7lfkd6bD"},{"type":"inlineMath","value":"V_\\hor^\\star","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"VHV_\\hor^\\starVH","key":"huf0AbbYnl"},{"type":"text","value":" (at the end of the horizon) as our base case.","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"zYI6UqR5CL"}],"key":"ufATJfx6nY"},{"type":"listItem","spread":true,"position":{"start":{"line":334,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"Then we’ll work step-by-step backwards in time, using ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"kCkRI6QJhm"},{"type":"inlineMath","value":"V_{\\hi+1}^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"Vh+1V_{\\hi+1}^\\starVh+1","key":"vb3W2ubPNU"},{"type":"text","value":" to compute ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"oQFJRh470O"},{"type":"inlineMath","value":"Q_\\hi^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"QhQ_\\hi^\\starQh","key":"vv7YZ9la9i"},{"type":"text","value":", ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"a39LY1N4QF"},{"type":"inlineMath","value":"\\pi_{\\hi}^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"πh\\pi_{\\hi}^\\starπh","key":"if9FPtSbYT"},{"type":"text","value":", and ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"tv4iQAacm3"},{"type":"inlineMath","value":"V_\\hi^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"VhV_\\hi^\\starVh","key":"WKiY7pJOls"},{"type":"text","value":".","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"oKCyPiiIqc"}],"key":"I80ixU2VYw"}],"key":"htyk7BB11D"},{"type":"comment","value":" TODO insert reference for proof by induction ","key":"kJ1pUIzztd"},{"type":"paragraph","position":{"start":{"line":338,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"strong","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"children":[{"type":"text","value":"Base case:","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"rEQPN434Am"}],"key":"rvqAgCkh7C"},{"type":"text","value":"\nAt the final timestep,\nthere are no possible actions to take,\nand so 
","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"GIapAtTE1T"},{"type":"inlineMath","value":"V^\\star_\\hor(\\st) = c(\\st) = \\st^\\top Q \\st","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"VH(x)=c(x)=xQxV^\\star_\\hor(\\st) = c(\\st) = \\st^\\top Q \\stVH(x)=c(x)=xQx","key":"AxXwrYjT3g"},{"type":"text","value":".\nThus ","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"LfNmdjaYCd"},{"type":"inlineMath","value":"V_\\hor^\\star(\\st) = \\st^\\top P_\\hor \\st + p_\\hor","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"VH(x)=xPHx+pHV_\\hor^\\star(\\st) = \\st^\\top P_\\hor \\st + p_\\horVH(x)=xPHx+pH","key":"Erouwc52zf"},{"type":"text","value":"\nwhere ","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"baSItI5HrZ"},{"type":"inlineMath","value":"P_\\hor = Q","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"PH=QP_\\hor = QPH=Q","key":"pwc1hSuLdt"},{"type":"text","value":" and ","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"FhKtjCB42l"},{"type":"inlineMath","value":"p_\\hor = 0","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"pH=0p_\\hor = 0pH=0","key":"p2b53Qvbww"},{"type":"text","value":".","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"QFRjvcjqzd"}],"key":"Xf5rufKX5d"},{"type":"paragraph","position":{"start":{"line":345,"column":1},"end":{"line":350,"column":1}},"children":[{"type":"strong","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"children":[{"type":"text","value":"Inductive hypothesis:","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"oC1dCZzh9E"}],"key":"qvC9Ct6E5T"},{"type":"text","value":"\nWe seek to show that the inductive step holds for both theorems:\nIf ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"YEYbK4YKRH"},{"type":"inlineMath","value":"V^\\star_{\\hi+1}(\\st)","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"Vh+1(x)V^\\star_{\\hi+1}(\\st)Vh+1(x)","key":"idniun1nsS"},{"type":"text","value":" is an upward-curved quadratic,\nthen ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"yCjyTFm5bI"},{"type":"inlineMath","value":"V^\\star_\\hi(\\st)","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"Vh(x)V^\\star_\\hi(\\st)Vh(x)","key":"HNGLcASCWp"},{"type":"text","value":" must also be an upward-curved quadratic,\nand ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"LrOyFlQoGo"},{"type":"inlineMath","value":"\\pi^\\star_\\hi(\\st)","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"πh(x)\\pi^\\star_\\hi(\\st)πh(x)","key":"LRHjntc5xk"},{"type":"text","value":" must be linear.\nWe’ll break this down into the following steps:","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"AV0Tasae2U"}],"key":"fIgXVgS2zm"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":352,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":352,"column":1},"end":{"line":353,"column":1}},"children":[{"type":"text","value":"Show that 
","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"MV2oIgFoCL"},{"type":"inlineMath","value":"Q^\\star_\\hi(\\st, \\act)","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"Qh(x,u)Q^\\star_\\hi(\\st, \\act)Qh(x,u)","key":"p6FePCXCMz"},{"type":"text","value":" is an upward-curved quadratic (in both\n","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"wjZzXtAIlo"},{"type":"inlineMath","value":"\\st","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"x\\stx","key":"TwFxijS1PR"},{"type":"text","value":" and ","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"fyknaOcltA"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"u\\actu","key":"riCbndbNzP"},{"type":"text","value":").","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"gs3roYBEL8"}],"key":"UpUBFt3uGB"},{"type":"listItem","spread":true,"position":{"start":{"line":354,"column":1},"end":{"line":356,"column":1}},"children":[{"type":"text","value":"Derive the optimal policy\n","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"key":"tlZZixboZK"},{"type":"inlineMath","value":"\\pi^\\star_\\hi(\\st) = \\arg \\min_\\act Q^\\star_\\hi(\\st, \\act)","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"html":"πh(x)=argminuQh(x,u)\\pi^\\star_\\hi(\\st) = \\arg \\min_\\act Q^\\star_\\hi(\\st, \\act)πh(x)=argminuQh(x,u)","key":"dBWSNQsx3g"},{"type":"text","value":" and show\nthat it’s linear.","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"key":"QXP4QbNpCM"}],"key":"JgPNbMLT1C"},{"type":"listItem","spread":true,"position":{"start":{"line":357,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"text","value":"Show that ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"qZ4Cb8PylT"},{"type":"inlineMath","value":"V^\\star_\\hi(\\st)","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"html":"Vh(x)V^\\star_\\hi(\\st)Vh(x)","key":"pBmoQ4bEvf"},{"type":"text","value":" is an upward-curved quadratic.","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"iNpQwOYy4t"}],"key":"LzDVyVAiZ8"}],"key":"gEpjt3MN6g"},{"type":"paragraph","position":{"start":{"line":359,"column":1},"end":{"line":360,"column":1}},"children":[{"type":"text","value":"We first assume the inductive hypothesis that our theorems are true at\ntime ","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"key":"rmUTob4eiR"},{"type":"inlineMath","value":"\\hi+1","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"html":"h+1\\hi+1h+1","key":"DuQaeRlisn"},{"type":"text","value":". 
We first assume the inductive hypothesis that our theorems are true at
time $\hi+1$. That is,

$$
V^\star_{\hi+1}(\st) = \st^\top P_{\hi+1} \st + p_{\hi+1} \quad \forall \st \in \mathcal{S}.
$$

:::{prf:lemma} $Q^\star_\hi(\st, \act)$ is an upward-curved quadratic

Let us decompose $Q^\star_\hi : \mathcal{S} \times \mathcal{A} \to \mathbb{R}$
into the immediate cost plus the expected cost-to-go:

$$
Q^\star_\hi(\st, \act) = c(\st, \act) + \E_{\st' \sim f(\st, \act, w_{\hi+1})} [V^\star_{\hi+1}(\st')].
$$

Recall $c(\st, \act) := \st^\top Q \st + \act^\top R \act$.
Let's consider the expectation over the next timestep.
The only randomness in the dynamics comes from the noise
$w_{\hi+1} \sim \mathcal{N}(0, \sigma^2 I)$,
so we can expand the expectation as:

$$
\begin{aligned}
    & \E_{\st'} [V^\star_{\hi+1}(\st')] \\
    {} = {} & \E_{w_{\hi+1}} [V^\star_{\hi+1}(A \st + B \act + w_{\hi+1})] & & \text{definition of } f \\
    {} = {} & \E_{w_{\hi+1}} [ (A \st + B \act + w_{\hi+1})^\top P_{\hi+1} (A \st + B \act + w_{\hi+1}) + p_{\hi+1} ]. & & \text{inductive hypothesis}
\end{aligned}
$$

Summing and combining like terms, we get

$$
\begin{aligned}
    Q^\star_\hi(\st, \act) & = \st^\top Q \st + \act^\top R \act + \E_{w_{\hi+1}} [(A \st + B \act + w_{\hi+1})^\top P_{\hi+1} (A \st + B \act + w_{\hi+1}) + p_{\hi+1}] \\
    & = \st^\top (Q + A^\top P_{\hi+1} A)\st + \act^\top (R + B^\top P_{\hi+1} B) \act + 2 \st^\top A^\top P_{\hi+1} B \act \\
    & \qquad + \E_{w_{\hi+1}} [w_{\hi+1}^\top P_{\hi+1} w_{\hi+1}] + p_{\hi+1}.
\end{aligned}
$$

Note that the terms that are linear in $w_{\hi+1}$ have mean zero and vanish.
Now consider the remaining expectation over the noise.
By expanding out the product and using linearity of expectation, we can
write this out as

$$
\begin{aligned}
    \E_{w_{\hi+1}} [w_{\hi+1}^\top P_{\hi+1} w_{\hi+1}] & = \sum_{i=1}^d \sum_{j=1}^d (P_{\hi+1})_{ij} \E_{w_{\hi+1}} [(w_{\hi+1})_i (w_{\hi+1})_j] \\
    & = \sigma^2 \mathrm{Tr}(P_{\hi + 1})
\end{aligned}
$$

:::{note} Quadratic forms
When solving *quadratic forms*, i.e. expressions of the form $x^\top A x$,
it's often helpful to consider the terms on the diagonal ($i = j$) separately from those off the diagonal.

In this case, the expectation of each diagonal term becomes

$$
(P_{\hi+1})_{ii} \E (w_{\hi+1})_i^2 = \sigma^2 (P_{\hi+1})_{ii}.
$$

Off the diagonal, since the elements of $w_{\hi+1}$ are independent, the
expectation factors, and since each element has mean zero, the term
vanishes:

$$
(P_{\hi+1})_{ij} \E [(w_{\hi+1})_i] \E [(w_{\hi+1})_j] = 0.
$$

Thus,
the only terms left are the ones on the diagonal,
so the sum of these can be expressed as the trace of $\sigma^2 P_{\hi+1}$:

$$
\E_{w_{\hi+1}} [w_{\hi+1}^\top P_{\hi+1} w_{\hi+1}] = \sigma^2 \mathrm{Tr}(P_{\hi+1}).
$$
:::
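A quick numerical sanity check of this identity (assuming NumPy; the s.p.d. matrix below is an arbitrary example, not from the text):

```python
import numpy as np

rng = np.random.default_rng(0)
d, sigma = 3, 0.5
M = rng.standard_normal((d, d))
P = M @ M.T + np.eye(d)                        # arbitrary s.p.d. stand-in for P_{h+1}
w = sigma * rng.standard_normal((100_000, d))  # rows are draws of w ~ N(0, sigma^2 I)
empirical = np.mean(np.einsum("ni,ij,nj->n", w, P, w))
print(empirical, sigma**2 * np.trace(P))       # the two values should nearly agree
```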
\\mathrm{Tr}(P_{\\hi+1}).Ewh+1[wh+1Ph+1wh+1]=σ2Tr(Ph+1).","enumerator":"2.18","key":"VasDgtmnEi"}],"key":"jGWsrxj7RP"},{"type":"paragraph","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"children":[{"type":"text","value":"Substituting this back into the expression for ","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"key":"JS2WnwP6fB"},{"type":"inlineMath","value":"Q^\\star_\\hi","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"html":"QhQ^\\star_\\hiQh","key":"iXdjOvj358"},{"type":"text","value":", we have:","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"key":"UpCb8v425c"}],"key":"xIQACEFXNB"},{"type":"math","value":"\\begin{aligned}\n Q^\\star_\\hi(\\st, \\act) & = \\st^\\top (Q + A^\\top P_{\\hi+1} A) \\st + \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act\n + 2\\st^\\top A^\\top P_{\\hi+1} B \\act \\\\\n & \\qquad + \\sigma^2 \\mathrm{Tr}(P_{\\hi+1}) + p_{\\hi+1}.\n\\end{aligned}","position":{"start":{"line":440,"column":1},"end":{"line":446,"column":1}},"html":"Qh(x,u)=x(Q+APh+1A)x+u(R+BPh+1B)u+2xAPh+1Bu+σ2Tr(Ph+1)+ph+1.\\begin{aligned}\n Q^\\star_\\hi(\\st, \\act) & = \\st^\\top (Q + A^\\top P_{\\hi+1} A) \\st + \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act\n + 2\\st^\\top A^\\top P_{\\hi+1} B \\act \\\\\n & \\qquad + \\sigma^2 \\mathrm{Tr}(P_{\\hi+1}) + p_{\\hi+1}.\n\\end{aligned}Qh(x,u)=x(Q+APh+1A)x+u(R+BPh+1B)u+2xAPh+1Bu+σ2Tr(Ph+1)+ph+1.","enumerator":"2.19","key":"dhPactxjmK"},{"type":"paragraph","position":{"start":{"line":448,"column":1},"end":{"line":453,"column":1}},"children":[{"type":"text","value":"As we hoped, this expression is quadratic in ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"xMqULYpibq"},{"type":"inlineMath","value":"\\st","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"x\\stx","key":"FCZVY9JpiM"},{"type":"text","value":" and ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"cWRccu2bq7"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"u\\actu","key":"Ti8K7YsAco"},{"type":"text","value":".\nFurthermore,\nwe’d like to show that it also ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"wDQIRs3u9K"},{"type":"emphasis","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"children":[{"type":"text","value":"curves upwards","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"dQvEWNrXuv"}],"key":"cC2wnVcBjv"},{"type":"text","value":"\nwith respect to ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"kxaiq45uXq"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"u\\actu","key":"PZjZbD37um"},{"type":"text","value":"\nso that its minimum with respect to ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"PIujbxvAxo"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"u\\actu","key":"o413UsD25L"},{"type":"text","value":" is well-defined.\nWe can do this by noting that the ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"GaOSHQcaTQ"},{"type":"strong","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"children":[{"type":"text","value":"Hessian 
matrix","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"UnkTXFr6oW"}],"key":"KcEcQOm0ja"},{"type":"text","value":" of second derivatives is positive definite:","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"dILbI2lR0d"}],"key":"BPxqe4v3y4"},{"type":"math","value":"\\nabla_{\\act \\act} Q_\\hi^\\star(\\st, \\act) = R + B^\\top P_{\\hi+1} B","position":{"start":{"line":455,"column":1},"end":{"line":457,"column":1}},"html":"uuQh(x,u)=R+BPh+1B\\nabla_{\\act \\act} Q_\\hi^\\star(\\st, \\act) = R + B^\\top P_{\\hi+1} BuuQh(x,u)=R+BPh+1B","enumerator":"2.20","key":"y79HS0bmIT"},{"type":"paragraph","position":{"start":{"line":459,"column":1},"end":{"line":464,"column":1}},"children":[{"type":"text","value":"Since ","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"L70Y5W53iR"},{"type":"inlineMath","value":"R","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"html":"RRR","key":"EkSBHy85mQ"},{"type":"text","value":" is s.p.d. (by ","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"Xls7NuPxw8"},{"type":"crossReference","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"children":[{"type":"text","value":"the LQR definition","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"m7TFPbYbIk"}],"identifier":"lqr_definition","label":"lqr_definition","kind":"proof:definition","template":"Definition %s","enumerator":"2.2","resolved":true,"html_id":"lqr-definition","key":"yACH2kN3Jk"},{"type":"text","value":"),\nand ","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"dLuwEM4dj1"},{"type":"inlineMath","value":"P_{\\hi+1}","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"html":"Ph+1P_{\\hi+1}Ph+1","key":"IzvC47VpJQ"},{"type":"text","value":" is s.p.d. 
**Lemma 2.2** ($\pi^\star_\hi$ is linear)

Since $Q^\star_\hi$ is an upward-curved quadratic,
finding its minimum over $\act$ is easy:
we simply set the gradient with respect to $\act$ equal to zero and solve for $\act$.
First, we calculate the gradient:

$$
\begin{aligned}
    \nabla_\act Q^\star_\hi(\st, \act) & = \nabla_\act [ \act^\top (R + B^\top P_{\hi+1} B) \act + 2 \st^\top A^\top P_{\hi+1} B \act ] \\
    & = 2 (R + B^\top P_{\hi+1} B) \act + 2 (\st^\top A^\top P_{\hi+1} B)^\top
\end{aligned}
\tag{2.21}
$$

Setting this to zero, we get

$$
\begin{aligned}
    0 & = (R + B^\top P_{\hi+1} B) \pi^\star_\hi(\st) + B^\top P_{\hi+1} A \st \\
    \pi^\star_\hi(\st) & = (R + B^\top P_{\hi+1} B)^{-1} (-B^\top P_{\hi+1} A \st) \\
    & = - K_\hi \st,
\end{aligned}
\tag{2.22}
$$

where

$$
K_\hi = (R + B^\top P_{\hi+1} B)^{-1} B^\top P_{\hi+1} A.
\tag{2.23}
$$

Note that this optimal policy doesn’t depend on the starting distribution $\mu_0$.
It’s also fully **deterministic** and isn’t affected by the noise terms
$w_0, \dots, w_{\hor-1}$.

**Lemma 2.3** ($V^\star_\hi(\st)$ is an upward-curved quadratic)

Using the identity $V^\star_\hi(\st) = Q^\star_\hi(\st, \pi^\star(\st))$, we have:

$$
\begin{aligned}
    V^\star_\hi(\st) & = Q^\star_\hi(\st, \pi^\star(\st)) \\
    & = \st^\top (Q + A^\top P_{\hi+1} A) \st + (-K_\hi \st)^\top (R + B^\top P_{\hi+1} B) (-K_\hi \st)
    + 2\st^\top A^\top P_{\hi+1} B (-K_\hi \st) \\
    & \qquad + \mathrm{Tr}(\sigma^2 P_{\hi+1}) + p_{\hi+1}
\end{aligned}
\tag{2.24}
$$

Note that with respect to $\st$,
this is the sum of a quadratic term and a constant,
which is exactly what we were aiming for!
The scalar term is clearly

$$
p_\hi = \mathrm{Tr}(\sigma^2 P_{\hi+1}) + p_{\hi+1}.
\tag{2.25}
$$

We can simplify the quadratic term by substituting in $K_\hi$ from (2.23).
Notice that when we do this,
the $(R+B^\top P_{\hi+1} B)$ term in the expression is cancelled out by its inverse,
and the remaining terms combine to give the **Riccati equation**:

**Definition 2.5** (Riccati equation)

$$
P_\hi = Q + A^\top P_{\hi+1} A - A^\top P_{\hi+1} B (R + B^\top P_{\hi+1} B)^{-1} B^\top P_{\hi+1} A.
\tag{2.26}
$$

There are several nice properties to note about the Riccati equation:

1. It’s defined **recursively.**
   Given the dynamics defined by $A$ and $B$, and the state cost matrix $Q$,
   we can recursively calculate $P_\hi$ across all timesteps starting from $P_\hor = Q$.
2. $P_\hi$ often appears in calculations surrounding optimality,
   such as $V^\star_\hi, Q^\star_\hi$, and $\pi^\star_\hi$.
3. Together with the dynamics given by $A$ and $B$,
   and the action coefficients $R$ in the cost function,
   it fully defines the optimal policy (Lemma 2.2).

It remains to prove that $V^\star_\hi$ *curves upwards,* that is, that $P_\hi$ is s.p.d.
We will use the following fact about **Schur complements:**

**Lemma 2.4** (Positive definiteness of Schur complements)

Let

$$
D = \begin{pmatrix}
A & B \\
B^\top & C
\end{pmatrix}
\tag{2.27}
$$

be a symmetric $(m+n) \times (m+n)$ block matrix,
where $A \in \R^{m \times m}, B \in \R^{m \times n}, C \in \R^{n \times n}$.
The **Schur complement** of $A$ is denoted

$$
D/A = C - B^\top A^{-1} B.
\tag{2.28}
$$

Schur complements have various uses in linear algebra and numerical computation.

A useful fact for us is that
if $A$ is positive *definite,*
then $D$ is positive *semidefinite*
if and only if $D/A$ is positive *semidefinite*.
that","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"D24jGMiNdJ"}],"key":"CCF41AGMW8"},{"type":"math","value":"S = P - P B (R + B^\\top P B)^{-1} B^\\top P","position":{"start":{"line":579,"column":1},"end":{"line":581,"column":1}},"html":"S=PPB(R+BPB)1BPS = P - P B (R + B^\\top P B)^{-1} B^\\top PS=PPB(R+BPB)1BP","enumerator":"2.29","key":"hgsqhkEKQ0"},{"type":"paragraph","position":{"start":{"line":583,"column":1},"end":{"line":586,"column":1}},"children":[{"type":"text","value":"is p.s.d. (positive semidefinite),\nsince left- and right- multiplying by ","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"N7VMnMor14"},{"type":"inlineMath","value":"A^\\top","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"html":"AA^\\topA","key":"Hu0xC8x5vO"},{"type":"text","value":" and ","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"V9DYuVwK9c"},{"type":"inlineMath","value":"A","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"html":"AAA","key":"KtW1mk1TpQ"},{"type":"text","value":" respectively\npreserves p.s.d.\nWe note that ","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"FzSEWafppv"},{"type":"inlineMath","value":"S","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"html":"SSS","key":"ES4uuflqCv"},{"type":"text","value":" is the Schur complement ","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"AdBMkKSenk"},{"type":"inlineMath","value":"D/(R + B^\\top P B)","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"html":"D/(R+BPB)D/(R + B^\\top P B)D/(R+BPB)","key":"NwBnluUxft"},{"type":"text","value":", where","position":{"start":{"line":583,"column":1},"end":{"line":583,"column":1}},"key":"ijdfT3xMt9"}],"key":"lxB8ZINHQ5"},{"type":"math","value":"D = \\begin{pmatrix}\nR + B^\\top P B & B^\\top P \\\\\nP B & P\n\\end{pmatrix}.","position":{"start":{"line":588,"column":1},"end":{"line":593,"column":1}},"html":"D=(R+BPBBPPBP).D = \\begin{pmatrix}\nR + B^\\top P B & B^\\top P \\\\\nP B & P\n\\end{pmatrix}.D=(R+BPBPBBPP).","enumerator":"2.30","key":"fR2mohcx9r"},{"type":"paragraph","position":{"start":{"line":595,"column":1},"end":{"line":596,"column":1}},"children":[{"type":"text","value":"Thus we must show that ","position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"key":"ttd8djhn5f"},{"type":"inlineMath","value":"D","position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"html":"DDD","key":"bb5ZvooIMi"},{"type":"text","value":" is p.s.d..\nThis can be seen by computing","position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"key":"l4ENVwZYef"}],"key":"VdwhGLjc4r"},{"type":"math","value":"\\begin{aligned}\n\\begin{pmatrix}\ny^\\top & z^\\top\n\\end{pmatrix}\nD\n\\begin{pmatrix}\ny \\\\ z\n\\end{pmatrix}\n&= y^\\top R y + y^\\top B^\\top P B y + 2 y^\\top B^\\top P z + z^\\top P z \\\\\n&= y^\\top R y + (By + z)^\\top P (By + z) \\\\\n&> 0.\n\\end{aligned}","position":{"start":{"line":598,"column":1},"end":{"line":611,"column":1}},"html":"(yz)D(yz)=yRy+yBPBy+2yBPz+zPz=yRy+(By+z)P(By+z)>0.\\begin{aligned}\n\\begin{pmatrix}\ny^\\top & z^\\top\n\\end{pmatrix}\nD\n\\begin{pmatrix}\ny \\\\ z\n\\end{pmatrix}\n&= y^\\top R y + y^\\top B^\\top P B y + 2 y^\\top B^\\top P z + z^\\top P z \\\\\n&= y^\\top R y + (By + z)^\\top P (By + z) \\\\\n&> 
0.\n\\end{aligned}(yz)D(yz)=yRy+yBPBy+2yBPz+zPz=yRy+(By+z)P(By+z)>0.","enumerator":"2.31","key":"VN91QvIg7S"},{"type":"paragraph","position":{"start":{"line":613,"column":1},"end":{"line":615,"column":1}},"children":[{"type":"text","value":"Since ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"BNRU8boPoU"},{"type":"inlineMath","value":"R + B^\\top P B","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"html":"R+BPBR + B^\\top P BR+BPB","key":"qraYTRbCL0"},{"type":"text","value":" is p.d. and ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"l0Diu8DTaG"},{"type":"inlineMath","value":"D","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"html":"DDD","key":"pbVq5Isj6S"},{"type":"text","value":" is p.s.d.,\nthen ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"zTHxeAPLzU"},{"type":"inlineMath","value":"S = D / (R + B^\\top P B)","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"html":"S=D/(R+BPB)S = D / (R + B^\\top P B)S=D/(R+BPB)","key":"UI8qNOTSTS"},{"type":"text","value":" must be p.s.d.,\nand ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"DnLkAiAzYl"},{"type":"inlineMath","value":"P_\\hi = Q + A S A^\\top","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"html":"Ph=Q+ASAP_\\hi = Q + A S A^\\topPh=Q+ASA","key":"bkgLOHwiB4"},{"type":"text","value":" must be p.d.","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"kGFg3S3pOd"}],"key":"PrW0wnXTIf"}],"enumerator":"2.3","key":"vcYO0yCZlt"},{"type":"paragraph","position":{"start":{"line":618,"column":1},"end":{"line":620,"column":1}},"children":[{"type":"text","value":"Now we’ve shown that ","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"PXDjHCDLx3"},{"type":"inlineMath","value":"V^\\star_\\hi(\\st) = \\st^\\top P_\\hi \\st + p_\\hi","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"html":"Vh(x)=xPhx+phV^\\star_\\hi(\\st) = \\st^\\top P_\\hi \\st + p_\\hiVh(x)=xPhx+ph","key":"kPWo6j66Ry"},{"type":"text","value":",\nwhere ","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"uyZPdlShro"},{"type":"inlineMath","value":"P_\\hi","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"html":"PhP_\\hiPh","key":"IZmllTROIf"},{"type":"text","value":" is s.p.d.,\nproving the inductive hypothesis and completing the proof of ","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"bqJ3zyKyH2"},{"type":"crossReference","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"children":[{"type":"text","value":"Theorem ","key":"QLFTqwPwDE"},{"type":"text","value":"2.2","key":"oCocGcbFJ9"}],"identifier":"optimal_policy_lqr_linear","label":"optimal_policy_lqr_linear","kind":"proof:theorem","template":"Theorem %s","enumerator":"2.2","resolved":true,"html_id":"optimal-policy-lqr-linear","key":"ACMapB7p9c"},{"type":"text","value":" and ","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"oaEEXDCYWJ"},{"type":"crossReference","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"children":[{"type":"text","value":"Theorem 
","key":"ROYREPmC2T"},{"type":"text","value":"2.1","key":"cbhu4yYk5s"}],"identifier":"optimal_value_lqr_quadratic","label":"optimal_value_lqr_quadratic","kind":"proof:theorem","template":"Theorem %s","enumerator":"2.1","resolved":true,"html_id":"optimal-value-lqr-quadratic","key":"AjCoV07Wi0"},{"type":"text","value":".","position":{"start":{"line":618,"column":1},"end":{"line":618,"column":1}},"key":"zEn2L9Wmnq"}],"key":"SXGC4oZWcw"},{"type":"paragraph","position":{"start":{"line":622,"column":1},"end":{"line":628,"column":1}},"children":[{"type":"text","value":"In summary, we just demonstrated that at each timestep ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"Q81vbIYLi3"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"R5iQQF9oDm"},{"type":"text","value":",\nthe optimal value function ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"J8fSwIQRZw"},{"type":"inlineMath","value":"V^\\star_\\hi","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"VhV^\\star_\\hiVh","key":"NrVHDlc3oK"},{"type":"text","value":"\nand optimal Q-function ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"DFq48eIHkl"},{"type":"inlineMath","value":"Q^\\star_\\hi","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"QhQ^\\star_\\hiQh","key":"y5SSvioWrG"},{"type":"text","value":" are both upward-curved quadratics\nand the optimal policy ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"e221cMhk53"},{"type":"inlineMath","value":"\\pi^\\star_\\hi","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"πh\\pi^\\star_\\hiπh","key":"GGRJmM3p3N"},{"type":"text","value":" is linear.\nWe also showed that all of these quantities can be calculated\nusing a sequence of s.p.d. 
matrices ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"R0GO4vCWIQ"},{"type":"inlineMath","value":"P_0, \\dots, P_H","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"html":"P0,,PHP_0, \\dots, P_HP0,,PH","key":"E4i4XzClsE"},{"type":"text","value":"\nthat can be defined recursively using the Riccati equation ","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"MSVGolEp7K"},{"type":"crossReference","kind":"proof:definition","identifier":"riccati","label":"riccati","children":[{"type":"text","value":"Definition ","key":"sGicoN829C"},{"type":"text","value":"2.5","key":"PLZMtsOgKS"}],"template":"Definition %s","enumerator":"2.5","resolved":true,"html_id":"riccati","key":"O7bz0EmGyj"},{"type":"text","value":".","position":{"start":{"line":622,"column":1},"end":{"line":622,"column":1}},"key":"Bgj2DFgokv"}],"key":"bMaGXuNmSR"},{"type":"paragraph","position":{"start":{"line":630,"column":1},"end":{"line":632,"column":1}},"children":[{"type":"text","value":"Before we move on to some extensions of LQR, let’s consider how the\nstate at time ","position":{"start":{"line":630,"column":1},"end":{"line":630,"column":1}},"key":"VXz6vt12Ux"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":630,"column":1},"end":{"line":630,"column":1}},"html":"h\\hih","key":"qVgmX2y6m3"},{"type":"text","value":" behaves when we act according to this optimal\npolicy.","position":{"start":{"line":630,"column":1},"end":{"line":630,"column":1}},"key":"svEZPZ4zaJ"}],"key":"k7l5LsZ2Bn"},{"type":"heading","depth":3,"position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"children":[{"type":"text","value":"Expected state at time ","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"key":"qalaLqVOS5"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"html":"h\\hih","key":"oPKEnzhiyc"}],"identifier":"expected-state-at-time-hi","label":"Expected state at time \\hi","html_id":"expected-state-at-time-hi","implicit":true,"enumerator":"2.4.1","key":"Xqr2EJGCK3"},{"type":"paragraph","position":{"start":{"line":636,"column":1},"end":{"line":639,"column":1}},"children":[{"type":"text","value":"How can we compute the expected state at time ","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"key":"quuFjwqoqx"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"html":"h\\hih","key":"QC5AOlrWn2"},{"type":"text","value":" when acting\naccording to the optimal policy? Let’s first express ","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"key":"FMYU1SJNy2"},{"type":"inlineMath","value":"\\st_\\hi","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"html":"xh\\st_\\hixh","key":"wnTR0AcsAc"},{"type":"text","value":" in a\ncleaner way in terms of the history. 
Note that having linear dynamics\nmakes it easy to expand terms backwards in time:","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"key":"ySIDw4GwUx"}],"key":"lIiMXAgX2f"},{"type":"math","value":"\\begin{aligned}\n \\st_\\hi & = A \\st_{\\hi-1} + B \\act_{\\hi-1} + w_{\\hi-1} \\\\\n & = A (A\\st_{\\hi-2} + B \\act_{\\hi-2} + w_{\\hi-2}) + B \\act_{\\hi-1} + w_{\\hi-1} \\\\\n & = \\cdots \\\\\n & = A^\\hi \\st_0 + \\sum_{i=0}^{\\hi-1} A^i (B \\act_{\\hi-i-1} + w_{\\hi-i-1}).\n\\end{aligned}","position":{"start":{"line":641,"column":1},"end":{"line":648,"column":1}},"html":"xh=Axh1+Buh1+wh1=A(Axh2+Buh2+wh2)+Buh1+wh1==Ahx0+i=0h1Ai(Buhi1+whi1).\\begin{aligned}\n \\st_\\hi & = A \\st_{\\hi-1} + B \\act_{\\hi-1} + w_{\\hi-1} \\\\\n & = A (A\\st_{\\hi-2} + B \\act_{\\hi-2} + w_{\\hi-2}) + B \\act_{\\hi-1} + w_{\\hi-1} \\\\\n & = \\cdots \\\\\n & = A^\\hi \\st_0 + \\sum_{i=0}^{\\hi-1} A^i (B \\act_{\\hi-i-1} + w_{\\hi-i-1}).\n\\end{aligned}xh=Axh1+Buh1+wh1=A(Axh2+Buh2+wh2)+Buh1+wh1==Ahx0+i=0h1Ai(Buhi1+whi1).","enumerator":"2.32","key":"IDRbyj9x4V"},{"type":"paragraph","position":{"start":{"line":652,"column":1},"end":{"line":655,"column":1}},"children":[{"type":"text","value":"Let’s consider the ","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"NjHbENYP78"},{"type":"emphasis","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"children":[{"type":"text","value":"average state","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"FWuCe4dq4D"}],"key":"fR8b8gJzMW"},{"type":"text","value":" at this time, given all the past\nstates and actions. Since we assume that ","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"rjBqPc8k7X"},{"type":"inlineMath","value":"\\E [w_\\hi] = 0","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"html":"E[wh]=0\\E [w_\\hi] = 0E[wh]=0","key":"x8G04GSsro"},{"type":"text","value":" (this is the\nzero vector in ","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"k1vAMk7eK7"},{"type":"inlineMath","value":"d","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"html":"ddd","key":"qDKvah0Xyf"},{"type":"text","value":" dimensions), when we take an expectation, the ","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"QROeADbISj"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"html":"whw_\\hiwh","key":"Nz5nb0pQe0"},{"type":"text","value":"\nterm vanishes due to linearity, and so we’re left with","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"VfhWZY4dsY"}],"key":"w2ssNDxsGu"},{"type":"math","value":"\\E [\\st_\\hi \\mid \\st_{0:(\\hi-1)}, \\act_{0:(\\hi-1)}] = A^\\hi \\st_0 + \\sum_{i=0}^{\\hi-1} A^i B \\act_{\\hi-i-1}.","position":{"start":{"line":658,"column":1},"end":{"line":661,"column":1}},"identifier":"expected_state","label":"expected_state","html_id":"expected-state","html":"E[xhx0:(h1),u0:(h1)]=Ahx0+i=0h1AiBuhi1.\\E [\\st_\\hi \\mid \\st_{0:(\\hi-1)}, \\act_{0:(\\hi-1)}] = A^\\hi \\st_0 + \\sum_{i=0}^{\\hi-1} A^i B 
\\act_{\\hi-i-1}.E[xhx0:(h1),u0:(h1)]=Ahx0+i=0h1AiBuhi1.","enumerator":"2.33","key":"v7HEVLEQ7e"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Exercise","position":{"start":{"line":664,"column":1},"end":{"line":664,"column":1}},"key":"ykKBBaA75s"}],"key":"Beq1YhrKtu"},{"type":"paragraph","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"children":[{"type":"text","value":"Show that if we choose actions according to the optimal policy ","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"key":"BGK8NR140S"},{"type":"crossReference","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"children":[{"type":"text","value":"Lemma ","key":"PQ4Zd7pKNF"},{"type":"text","value":"2.2","key":"d2B73f2nLx"}],"identifier":"lemma_pi_linear","label":"lemma_pi_linear","kind":"proof:lemma","template":"Lemma %s","enumerator":"2.2","resolved":true,"html_id":"lemma-pi-linear","key":"QXBeKsCQFX"},{"type":"text","value":", ","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"key":"oGtbYLA2JI"},{"type":"crossReference","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"children":[{"type":"text","value":"(","key":"gxB66FDUeH"},{"type":"text","value":"2.33","key":"UDGfnvKbe1"},{"type":"text","value":")","key":"ldMYg3Dij9"}],"identifier":"expected_state","label":"expected_state","kind":"equation","template":"(%s)","enumerator":"2.33","resolved":true,"html_id":"expected-state","key":"rj7jhSwSK7"},{"type":"text","value":" becomes","position":{"start":{"line":665,"column":1},"end":{"line":665,"column":1}},"key":"oplayaP1EF"}],"key":"gzYrej4UrO"},{"type":"math","value":"\\E [\\st_\\hi \\mid \\st_0, \\act_i = \\pi^\\star_i(\\st_i)\\quad \\forall i \\le \\hi] = \\left( \\prod_{i=0}^{\\hi-1} (A - B K_i) \\right) \\st_0.","position":{"start":{"line":667,"column":1},"end":{"line":669,"column":1}},"html":"E[xhx0,ui=πi(xi)ih]=(i=0h1(ABKi))x0.\\E [\\st_\\hi \\mid \\st_0, \\act_i = \\pi^\\star_i(\\st_i)\\quad \\forall i \\le \\hi] = \\left( \\prod_{i=0}^{\\hi-1} (A - B K_i) \\right) \\st_0.E[xhx0,ui=πi(xi)ih]=(i=0h1(ABKi))x0.","enumerator":"2.34","key":"upLgb8hPdo"}],"key":"X8yXVjASIl"},{"type":"paragraph","position":{"start":{"line":672,"column":1},"end":{"line":679,"column":1}},"children":[{"type":"text","value":"This introdces the quantity ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"W3TVWipkSf"},{"type":"inlineMath","value":"A - B K_i","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"ABKiA - B K_iABKi","key":"f3nVe551P1"},{"type":"text","value":", which shows up frequently in\ncontrol theory. For example, one important question is: will ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"ScbBKMMXD1"},{"type":"inlineMath","value":"\\st_\\hi","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"xh\\st_\\hixh","key":"V0jX2Dg1ve"},{"type":"text","value":"\nremain bounded, or will it go to infinity as time goes on? 
To answer\nthis, let’s imagine for simplicity that these ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"kRt01sTshC"},{"type":"inlineMath","value":"K_i","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"KiK_iKi","key":"W438dcLqBb"},{"type":"text","value":"s are equal (call\nthis matrix ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"f9sQiaITbB"},{"type":"inlineMath","value":"K","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"KKK","key":"I0GpIYVrOB"},{"type":"text","value":"). Then the expression above becomes ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"xjGQiN99sm"},{"type":"inlineMath","value":"(A-BK)^\\hi \\st_0","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"(ABK)hx0(A-BK)^\\hi \\st_0(ABK)hx0","key":"yC4SKV63JZ"},{"type":"text","value":".\nNow consider the maximum eigenvalue ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"FmEgrRg51A"},{"type":"inlineMath","value":"\\lambda_{\\max}","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"λmax\\lambda_{\\max}λmax","key":"TmI80Ibecy"},{"type":"text","value":" of ","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"BLsFqfLxMv"},{"type":"inlineMath","value":"A - BK","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"ABKA - BKABK","key":"naSYwQihAw"},{"type":"text","value":". If\n","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"c6NL9MM59J"},{"type":"inlineMath","value":"|\\lambda_{\\max}| > 1","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"λmax>1|\\lambda_{\\max}| > 1λmax>1","key":"urojAVYLid"},{"type":"text","value":", then there’s some nonzero initial state\n","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"y0iR3sM9W9"},{"type":"inlineMath","value":"\\bar \\st_0","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"html":"xˉ0\\bar \\st_0xˉ0","key":"p7a1gaqO4j"},{"type":"text","value":", the corresponding eigenvector, for which","position":{"start":{"line":672,"column":1},"end":{"line":672,"column":1}},"key":"vmcNdP8SPR"}],"key":"p1CJvpAFjG"},{"type":"math","value":"\\lim_{\\hi \\to \\infty} (A - BK)^\\hi \\bar \\st_0\n = \\lim_{\\hi \\to \\infty} \\lambda_{\\max}^\\hi \\bar \\st_0\n = \\infty.","position":{"start":{"line":682,"column":1},"end":{"line":686,"column":1}},"html":"limh(ABK)hxˉ0=limhλmaxhxˉ0=.\\lim_{\\hi \\to \\infty} (A - BK)^\\hi \\bar \\st_0\n = \\lim_{\\hi \\to \\infty} \\lambda_{\\max}^\\hi \\bar \\st_0\n = \\infty.hlim(ABK)hxˉ0=hlimλmaxhxˉ0=∞.","enumerator":"2.35","key":"pP9RR94ZAx"},{"type":"paragraph","position":{"start":{"line":688,"column":1},"end":{"line":688,"column":1}},"children":[{"type":"text","value":"Otherwise, if ","position":{"start":{"line":688,"column":1},"end":{"line":688,"column":1}},"key":"rl1OWa3lGF"},{"type":"inlineMath","value":"|\\lambda_{\\max}| < 1","position":{"start":{"line":688,"column":1},"end":{"line":688,"column":1}},"html":"λmax<1|\\lambda_{\\max}| < 1λmax<1","key":"oXpxAlfKnx"},{"type":"text","value":", then it’s impossible for your original state to explode as 
dramatically.","position":{"start":{"line":688,"column":1},"end":{"line":688,"column":1}},"key":"iaN0g53E5k"}],"key":"DNpFOiSOHR"},{"type":"heading","depth":2,"position":{"start":{"line":690,"column":1},"end":{"line":690,"column":1}},"children":[{"type":"text","value":"Extensions","position":{"start":{"line":690,"column":1},"end":{"line":690,"column":1}},"key":"TB6wXBfclX"}],"identifier":"extensions","label":"Extensions","html_id":"extensions","implicit":true,"enumerator":"2.5","key":"eEqUotGexJ"},{"type":"paragraph","position":{"start":{"line":692,"column":1},"end":{"line":697,"column":1}},"children":[{"type":"text","value":"We’ve now formulated an optimal solution for the time-homogeneous LQR\nand computed the expected state under the optimal policy. However, real\nworld tasks rarely have such simple dynamics, and we may wish to design\nmore complex cost functions. In this section, we’ll consider more\ngeneral extensions of LQR where some of the assumptions we made above\nare relaxed. Specifically, we’ll consider:","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"key":"mWrIQ7ej6g"}],"key":"UOBb8S8ZY6"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":699,"column":1},"end":{"line":707,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":699,"column":1},"end":{"line":701,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":699,"column":1},"end":{"line":700,"column":1}},"children":[{"type":"strong","position":{"start":{"line":699,"column":1},"end":{"line":699,"column":1}},"children":[{"type":"text","value":"Time-dependency","position":{"start":{"line":699,"column":1},"end":{"line":699,"column":1}},"key":"rfrvKvaNjw"}],"key":"aSKxgCCZJo"},{"type":"text","value":", where the dynamics and cost function might\nchange depending on the timestep.","position":{"start":{"line":699,"column":1},"end":{"line":699,"column":1}},"key":"QVxIiazkcC"}],"key":"llNaJqeSSZ"}],"key":"M3RE2vvrXe"},{"type":"listItem","spread":true,"position":{"start":{"line":702,"column":1},"end":{"line":704,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":702,"column":1},"end":{"line":703,"column":1}},"children":[{"type":"strong","position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"children":[{"type":"text","value":"General quadratic cost","position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"key":"J3w8uli7YB"}],"key":"BCd2yidVxv"},{"type":"text","value":", where we allow for linear terms and a\nconstant term.","position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"key":"PUALcQPCde"}],"key":"G8K3PpfUOr"}],"key":"vAL8Wg6NGE"},{"type":"listItem","spread":true,"position":{"start":{"line":705,"column":1},"end":{"line":707,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":705,"column":1},"end":{"line":706,"column":1}},"children":[{"type":"strong","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"children":[{"type":"text","value":"Tracking a goal trajectory","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"Imfq3zMeKX"}],"key":"wRMDo8d6Bi"},{"type":"text","value":" rather than aiming for a single goal\nstate-action 
pair.","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"P5zu53azCS"}],"key":"hrMkVWkU4M"}],"key":"Y4rMZmX1n3"}],"key":"H4j4ZQy61r"},{"type":"paragraph","position":{"start":{"line":708,"column":1},"end":{"line":710,"column":1}},"children":[{"type":"text","value":"Combining these will allow us to use the LQR solution to solve more\ncomplex setups by taking ","position":{"start":{"line":708,"column":1},"end":{"line":708,"column":1}},"key":"mYikLmyN3u"},{"type":"emphasis","position":{"start":{"line":708,"column":1},"end":{"line":708,"column":1}},"children":[{"type":"text","value":"Taylor approximations","position":{"start":{"line":708,"column":1},"end":{"line":708,"column":1}},"key":"ofNSbeLowY"}],"key":"KP3tC9yTBo"},{"type":"text","value":" of the dynamics and\ncost functions.","position":{"start":{"line":708,"column":1},"end":{"line":708,"column":1}},"key":"ZUU8mObgVl"}],"key":"rA24YYCcUd"},{"type":"heading","depth":3,"position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"children":[{"type":"text","value":"Time-dependent dynamics and cost function","position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"key":"LrDV196PRQ"}],"label":"time_dep_lqr","identifier":"time_dep_lqr","html_id":"time-dep-lqr","enumerator":"2.5.1","key":"VSXgAKw3mc"},{"type":"paragraph","position":{"start":{"line":715,"column":1},"end":{"line":723,"column":1}},"children":[{"type":"text","value":"So far, we’ve considered the ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"nRIMfcQdJD"},{"type":"emphasis","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"children":[{"type":"text","value":"time-homogeneous","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"nORk3NVb8n"}],"key":"Y4E2r33tho"},{"type":"text","value":" case, where the dynamics\nand cost function stay the same at every timestep. However, this might\nnot always be the case. As an example, in many sports, the rules and\nscoring system might change during an overtime period. 
To address these\nsorts of problems, we can loosen the time-homogeneous restriction, and\nconsider the case where the dynamics and cost function are\n","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"PU7BiUI9vD"},{"type":"emphasis","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"children":[{"type":"text","value":"time-dependent.","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"oXKCDILq1E"}],"key":"VtdpBojQ6k"},{"type":"text","value":" Our analysis remains almost identical; in fact, we can\nsimply add a time index to the matrices ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"wJxBxYcHfz"},{"type":"inlineMath","value":"A","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"html":"AAA","key":"l56ZqBsu9z"},{"type":"text","value":" and ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"nPapqH5qpv"},{"type":"inlineMath","value":"B","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"html":"BBB","key":"fF1u8Lpvbm"},{"type":"text","value":" that determine the\ndynamics and the matrices ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"n5QrdemqRO"},{"type":"inlineMath","value":"Q","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"html":"QQQ","key":"A6rGFlKYUl"},{"type":"text","value":" and ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"OfsxOmbYcQ"},{"type":"inlineMath","value":"R","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"html":"RRR","key":"wZe6WZmpaC"},{"type":"text","value":" that determine the cost.","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"DksXlYNyHu"}],"key":"UYE9fDI17u"},{"type":"paragraph","position":{"start":{"line":725,"column":1},"end":{"line":725,"column":1}},"children":[{"type":"text","value":"The modified problem is now defined as follows:","position":{"start":{"line":725,"column":1},"end":{"line":725,"column":1}},"key":"LKqyeLt5QL"}],"key":"qQdkQGm2gn"},{"type":"proof","kind":"definition","label":"time_dependent_lqr","identifier":"time_dependent_lqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Time-dependent LQR","position":{"start":{"line":727,"column":1},"end":{"line":727,"column":1}},"key":"KMnGb2pTXd"}],"key":"obw9EH75pw"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\pi_{0}, \\dots, \\pi_{\\hor-1}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} (\\st_\\hi^\\top Q_\\hi \\st_\\hi) + \\act_\\hi^\\top R_\\hi \\act_\\hi \\right) + \\st_\\hor^\\top Q_\\hor \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + w_\\hi \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}","position":{"start":{"line":730,"column":1},"end":{"line":738,"column":1}},"html":"minπ0,,πH1E[(h=0H1(xhQhxh)+uhRhuh)+xHQHxH]wherexh+1=fh(xh,uh,wh)=Ahxh+Bhuh+whx0μ0uh=πh(xh)whN(0,σ2I).\\begin{aligned}\n \\min_{\\pi_{0}, \\dots, \\pi_{\\hor-1}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} (\\st_\\hi^\\top Q_\\hi \\st_\\hi) + \\act_\\hi^\\top R_\\hi \\act_\\hi \\right) + \\st_\\hor^\\top Q_\\hor \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, 
w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + w_\\hi \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}π0,,πH1minwhereE[(h=0H1(xhQhxh)+uhRhuh)+xHQHxH]xh+1=fh(xh,uh,wh)=Ahxh+Bhuh+whx0μ0uh=πh(xh)whN(0,σ2I).","enumerator":"2.36","key":"DwiC4CI7tQ"}],"enumerator":"2.6","html_id":"time-dependent-lqr","key":"qbgGYJygXu"},{"type":"paragraph","position":{"start":{"line":743,"column":1},"end":{"line":745,"column":1}},"children":[{"type":"text","value":"The derivation of the optimal value functions and the optimal policy\nremains almost exactly the same, and we can modify the Riccati equation\naccordingly:","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"JLxsl8NjWw"}],"key":"xvMnv40fBY"},{"type":"proof","kind":"definition","label":"riccati_time_dependent","identifier":"riccati_time_dependent","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Time-dependent Riccati Equation","position":{"start":{"line":747,"column":1},"end":{"line":747,"column":1}},"key":"vWwrY0S5hk"}],"key":"mErsluC0WT"},{"type":"math","value":"P_\\hi = Q_\\hi + A_\\hi^\\top P_{\\hi+1} A_\\hi - A_\\hi^\\top P_{\\hi+1} B_\\hi (R_\\hi + B_\\hi^\\top P_{\\hi+1} B_\\hi)^{-1} B_\\hi^\\top P_{\\hi+1} A_\\hi.","position":{"start":{"line":750,"column":1},"end":{"line":752,"column":1}},"html":"Ph=Qh+AhPh+1AhAhPh+1Bh(Rh+BhPh+1Bh)1BhPh+1Ah.P_\\hi = Q_\\hi + A_\\hi^\\top P_{\\hi+1} A_\\hi - A_\\hi^\\top P_{\\hi+1} B_\\hi (R_\\hi + B_\\hi^\\top P_{\\hi+1} B_\\hi)^{-1} B_\\hi^\\top P_{\\hi+1} A_\\hi.Ph=Qh+AhPh+1AhAhPh+1Bh(Rh+BhPh+1Bh)1BhPh+1Ah.","enumerator":"2.37","key":"Z9YsIPYRE2"},{"type":"paragraph","position":{"start":{"line":754,"column":1},"end":{"line":756,"column":1}},"children":[{"type":"text","value":"Note that this is just the time-homogeneous Riccati equation\n(","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"fVxNFB6HSd"},{"type":"crossReference","kind":"proof:definition","identifier":"riccati","label":"riccati","children":[{"type":"text","value":"Definition ","key":"coAqZQCa4c"},{"type":"text","value":"2.5","key":"pc3ZFrk12P"}],"template":"Definition %s","enumerator":"2.5","resolved":true,"html_id":"riccati","key":"KWedAnZSDn"},{"type":"text","value":"), but with the time index added to each of the\nrelevant matrices.","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"jl2ENs6hIe"}],"key":"XNa5jeU6Ul"}],"enumerator":"2.7","html_id":"riccati-time-dependent","key":"pYl2TYgJ3i"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Exercise","position":{"start":{"line":759,"column":1},"end":{"line":759,"column":1}},"key":"QWbM3AltND"}],"key":"rPItt5fq5g"},{"type":"paragraph","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"Walk through the proof in ","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"eRCl0WHc9z"},{"type":"crossReference","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"Section ","key":"MFJZkrLnaP"},{"type":"text","value":"2.4","key":"sMunxzGgRg"}],"identifier":"optimal_lqr","label":"optimal_lqr","kind":"heading","template":"Section %s","enumerator":"2.4","resolved":true,"html_id":"optimal-lqr","key":"PW5uZqXdww"},{"type":"text","value":" to verify that we can simply add 
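To make the backward recursion concrete, here is a minimal NumPy sketch
(the function name and the list-of-matrices interface are our own
illustrative choices, not from the text). Alongside each $P_\hi$ it also
computes the feedback gain
$K_\hi = (R_\hi + B_\hi^\top P_{\hi+1} B_\hi)^{-1} B_\hi^\top P_{\hi+1} A_\hi$
that appears inside the Riccati update, which defines the linear policy
$\pi_\hi(\st) = -K_\hi \st$.

```python
import numpy as np

def riccati_backward(A, B, Q, R, Q_terminal):
    """Backward pass of the time-dependent Riccati equation.

    A, B, Q, R are length-H lists of matrices (A[h], B[h], Q[h], R[h]),
    and Q_terminal is the terminal cost matrix Q_H.
    Returns the cost-to-go matrices P[0..H] and feedback gains K[0..H-1].
    """
    H = len(A)
    P = [None] * (H + 1)
    K = [None] * H
    P[H] = Q_terminal
    for h in reversed(range(H)):
        BtP = B[h].T @ P[h + 1]
        # K_h = (R_h + B_h^T P_{h+1} B_h)^{-1} B_h^T P_{h+1} A_h
        K[h] = np.linalg.solve(R[h] + BtP @ B[h], BtP @ A[h])
        # Time-dependent Riccati update for P_h
        P[h] = Q[h] + A[h].T @ P[h + 1] @ A[h] - (BtP @ A[h]).T @ K[h]
    return P, K

# The resulting time-dependent policy acts as u_h = -K[h] @ x_h.
```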
","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"O1yYrhDH52"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"html":"h\\hih","key":"VPeCG4Af8B"},{"type":"text","value":" for the time-dependent case.","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"KruXto9ba5"}],"key":"oNnBPIwhlI"}],"key":"imdLIt4t7u"},{"type":"paragraph","position":{"start":{"line":763,"column":1},"end":{"line":765,"column":1}},"children":[{"type":"text","value":"Additionally, by allowing the dynamics to vary across time, we gain the\nability to ","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"key":"Sj7cTIzv0n"},{"type":"emphasis","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"children":[{"type":"text","value":"locally approximate","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"key":"AvbPKTSP7X"}],"key":"ATFVF9va7P"},{"type":"text","value":" nonlinear dynamics at each timestep.\nWe’ll discuss this later in the chapter.","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"key":"D9AmPOT82P"}],"key":"M8ZbxrM5tH"},{"type":"heading","depth":3,"position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"children":[{"type":"text","value":"More general quadratic cost functions","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"key":"fV5rGYuMKH"}],"identifier":"more-general-quadratic-cost-functions","label":"More general quadratic cost functions","html_id":"more-general-quadratic-cost-functions","implicit":true,"enumerator":"2.5.2","key":"KpVsRBcXfF"},{"type":"paragraph","position":{"start":{"line":769,"column":1},"end":{"line":776,"column":1}},"children":[{"type":"text","value":"Our original cost function had only second-order terms with respect to\nthe state and action, incentivizing staying as close as possible to\n","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"ImnFGFZ8nu"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star) = (0, 0)","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"(x,u)=(0,0)(\\st^\\star, \\act^\\star) = (0, 0)(x,u)=(0,0)","key":"iROkgoLn7f"},{"type":"text","value":". We can also consider more general\nquadratic cost functions that also have first-order terms and a constant\nterm. 
Combining this with time-dependent dynamics results in the\nfollowing expression, where we introduce a new matrix ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"Waz9EugVyY"},{"type":"inlineMath","value":"M_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"MhM_\\hiMh","key":"xjMuSsisAK"},{"type":"text","value":" for the\ncross term, linear coefficients ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"yZxFsjY19Z"},{"type":"inlineMath","value":"q_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"qhq_\\hiqh","key":"R0JR1twHsK"},{"type":"text","value":" and ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"qSD1jPSSgn"},{"type":"inlineMath","value":"r_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"rhr_\\hirh","key":"pUkKbJOcOO"},{"type":"text","value":" for the state and\naction respectively, and a constant term ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"Sjy2s6d8Rv"},{"type":"inlineMath","value":"c_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"chc_\\hich","key":"nY6mJPvqh8"},{"type":"text","value":":","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"obbJ6G0e3O"}],"key":"RSdlNkedo4"},{"type":"math","value":"c_\\hi(\\st_\\hi, \\act_\\hi) = ( \\st_\\hi^\\top Q_\\hi \\st_\\hi + \\st_\\hi^\\top M_\\hi \\act_\\hi + \\act_\\hi^\\top R_\\hi \\act_\\hi ) + (\\st_\\hi^\\top q_\\hi + \\act_\\hi^\\top r_\\hi) + c_\\hi.","label":"general_quadratic_cost","identifier":"general_quadratic_cost","html":"ch(xh,uh)=(xhQhxh+xhMhuh+uhRhuh)+(xhqh+uhrh)+ch.c_\\hi(\\st_\\hi, \\act_\\hi) = ( \\st_\\hi^\\top Q_\\hi \\st_\\hi + \\st_\\hi^\\top M_\\hi \\act_\\hi + \\act_\\hi^\\top R_\\hi \\act_\\hi ) + (\\st_\\hi^\\top q_\\hi + \\act_\\hi^\\top r_\\hi) + c_\\hi.ch(xh,uh)=(xhQhxh+xhMhuh+uhRhuh)+(xhqh+uhrh)+ch.","enumerator":"2.38","html_id":"general-quadratic-cost","key":"m2QZCfxQFf"},{"type":"paragraph","position":{"start":{"line":784,"column":1},"end":{"line":786,"column":1}},"children":[{"type":"text","value":"Similarly, we can also include a\nconstant term ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"G2g3mjxeL4"},{"type":"inlineMath","value":"v_\\hi \\in \\mathbb{R}^{n_\\st}","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"html":"vhRnxv_\\hi \\in \\mathbb{R}^{n_\\st}vhRnx","key":"XV5AIFxYim"},{"type":"text","value":" in the dynamics (note that this is\n","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"J4MYWkEt50"},{"type":"emphasis","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"PDxfXExNsG"}],"key":"sgJGkDJExi"},{"type":"text","value":" at each timestep, unlike the stochastic noise ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"rTcPf4bT8x"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"html":"whw_\\hiwh","key":"idWxsnhxrI"},{"type":"text","value":"):","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"VZlUX7sI4o"}],"key":"PIa0zIH5M7"},{"type":"math","value":"\\st_{\\hi+1} = f_\\hi(\\st_\\hi, 
\\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + v_\\hi + w_\\hi.","position":{"start":{"line":789,"column":1},"end":{"line":791,"column":1}},"html":"xh+1=fh(xh,uh,wh)=Ahxh+Bhuh+vh+wh.\\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + v_\\hi + w_\\hi.xh+1=fh(xh,uh,wh)=Ahxh+Bhuh+vh+wh.","enumerator":"2.39","key":"p0Rkz8GdrH"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"exercise","position":{"start":{"line":795,"column":1},"end":{"line":795,"column":1}},"key":"Z2aS58ipOm"}],"key":"gaM0NoHpus"},{"type":"paragraph","position":{"start":{"line":796,"column":1},"end":{"line":797,"column":1}},"children":[{"type":"text","value":"Derive the optimal solution. You will need to slightly modify the\nproof in ","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"key":"cJvy7ZaE4C"},{"type":"crossReference","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"children":[{"type":"text","value":"Section ","key":"WWmIisqBbq"},{"type":"text","value":"2.4","key":"nwVGFDpTQD"}],"identifier":"optimal_lqr","label":"optimal_lqr","kind":"heading","template":"Section %s","enumerator":"2.4","resolved":true,"html_id":"optimal-lqr","key":"LZsnEXzUf9"},{"type":"text","value":".","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"key":"gAIWzSbymR"}],"key":"GoN9xZAN6F"}],"key":"GVD7L0YDtl"},{"type":"heading","depth":3,"position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"children":[{"type":"text","value":"Tracking a predefined trajectory","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"key":"SXcvjnPVYL"}],"identifier":"tracking-a-predefined-trajectory","label":"Tracking a predefined trajectory","html_id":"tracking-a-predefined-trajectory","implicit":true,"enumerator":"2.5.3","key":"MSsWQN4y4R"},{"type":"paragraph","position":{"start":{"line":802,"column":1},"end":{"line":807,"column":1}},"children":[{"type":"text","value":"Consider applying LQR to a task like autonomous driving, where the\ntarget state-action pair changes over time. We might want the vehicle to\nfollow a predefined ","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"NXnoTMqGgw"},{"type":"emphasis","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"children":[{"type":"text","value":"trajectory","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"vPUVJNNcmY"}],"key":"jP6VHt61Dn"},{"type":"text","value":" of states and actions\n","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"T7m3meq1gO"},{"type":"inlineMath","value":"(\\st_\\hi^\\star, \\act_\\hi^\\star)_{\\hi=0}^{\\hor-1}","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"html":"(xh,uh)h=0H1(\\st_\\hi^\\star, \\act_\\hi^\\star)_{\\hi=0}^{\\hor-1}(xh,uh)h=0H1","key":"wloSzcZ18q"},{"type":"text","value":". 
To express this as a\ncontrol problem, we’ll need a corresponding time-dependent cost\nfunction:","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"FclYptdt47"}],"key":"xq4NUHPyEk"},{"type":"math","value":"c_\\hi(\\st_\\hi, \\act_\\hi) = (\\st_\\hi - \\st^\\star_\\hi)^\\top Q (\\st_\\hi - \\st^\\star_\\hi) + (\\act_\\hi - \\act^\\star_\\hi)^\\top R (\\act_\\hi - \\act^\\star_\\hi).","position":{"start":{"line":810,"column":1},"end":{"line":812,"column":1}},"html":"ch(xh,uh)=(xhxh)Q(xhxh)+(uhuh)R(uhuh).c_\\hi(\\st_\\hi, \\act_\\hi) = (\\st_\\hi - \\st^\\star_\\hi)^\\top Q (\\st_\\hi - \\st^\\star_\\hi) + (\\act_\\hi - \\act^\\star_\\hi)^\\top R (\\act_\\hi - \\act^\\star_\\hi).ch(xh,uh)=(xhxh)Q(xhxh)+(uhuh)R(uhuh).","enumerator":"2.40","key":"NyRgnN37yu"},{"type":"paragraph","position":{"start":{"line":815,"column":1},"end":{"line":818,"column":1}},"children":[{"type":"text","value":"Note that this punishes states and actions that are far from the\nintended trajectory. By expanding out these multiplications, we can see\nthat this is actually a special case of the more general quadratic cost\nfunction above ","position":{"start":{"line":815,"column":1},"end":{"line":815,"column":1}},"key":"t0nFZVpPzO"},{"type":"crossReference","position":{"start":{"line":815,"column":1},"end":{"line":815,"column":1}},"children":[{"type":"text","value":"(","key":"fPXhFAYir0"},{"type":"text","value":"2.38","key":"MiIOsFKU1r"},{"type":"text","value":")","key":"TX1xRdwys8"}],"identifier":"general_quadratic_cost","label":"general_quadratic_cost","kind":"equation","template":"(%s)","enumerator":"2.38","resolved":true,"html_id":"general-quadratic-cost","key":"vZgBnDH94q"},{"type":"text","value":":","position":{"start":{"line":815,"column":1},"end":{"line":815,"column":1}},"key":"Mz0QBJakVz"}],"key":"uIV32qx360"},{"type":"math","value":"M_\\hi = 0, \\qquad q_\\hi = -2Q \\st^\\star_\\hi, \\qquad r_\\hi = -2R \\act^\\star_\\hi, \\qquad c_\\hi = (\\st^\\star_\\hi)^\\top Q (\\st^\\star_\\hi) + (\\act^\\star_\\hi)^\\top R (\\act^\\star_\\hi).","position":{"start":{"line":821,"column":1},"end":{"line":823,"column":1}},"html":"Mh=0,qh=2Qxh,rh=2Ruh,ch=(xh)Q(xh)+(uh)R(uh).M_\\hi = 0, \\qquad q_\\hi = -2Q \\st^\\star_\\hi, \\qquad r_\\hi = -2R \\act^\\star_\\hi, \\qquad c_\\hi = (\\st^\\star_\\hi)^\\top Q (\\st^\\star_\\hi) + (\\act^\\star_\\hi)^\\top R (\\act^\\star_\\hi).Mh=0,qh=2Qxh,rh=2Ruh,ch=(xh)Q(xh)+(uh)R(uh).","enumerator":"2.41","key":"bzoDTKVEkz"},{"type":"heading","depth":2,"position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"children":[{"type":"text","value":"Approximating nonlinear dynamics","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"UD1kflXPr5"}],"label":"approx_nonlinear","identifier":"approx_nonlinear","html_id":"approx-nonlinear","enumerator":"2.6","key":"Xab9DsQkNw"},{"type":"paragraph","position":{"start":{"line":830,"column":1},"end":{"line":835,"column":1}},"children":[{"type":"text","value":"The LQR algorithm solves for the optimal policy when the dynamics are\n","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"j9c6ApPEpt"},{"type":"emphasis","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"vgGgoZrBMY"}],"key":"i0K9rGAnUa"},{"type":"text","value":" and the cost function is an 
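As a quick numerical check of these coefficients, here is a short sketch
(the helper name is our own) that converts a single tracking target
$(\st^\star_\hi, \act^\star_\hi)$ into the general quadratic form,
assuming $Q$ and $R$ are symmetric:

```python
import numpy as np

def tracking_cost_coefficients(Q, R, x_star, u_star):
    """Coefficients (Q, R, M, q, r, c) of the general quadratic cost
    equivalent to (x - x*)^T Q (x - x*) + (u - u*)^T R (u - u*),
    assuming Q and R are symmetric."""
    M = np.zeros((Q.shape[0], R.shape[0]))          # no cross term
    q = -2 * Q @ x_star                             # linear state coefficient
    r = -2 * R @ u_star                             # linear action coefficient
    c = x_star @ Q @ x_star + u_star @ R @ u_star   # constant term
    return Q, R, M, q, r, c
```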
","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"n9CipPEm9d"},{"type":"emphasis","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"children":[{"type":"text","value":"upward-curved quadratic","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"r1KsMpQRUX"}],"key":"pyvXODzHyC"},{"type":"text","value":". However,\nreal settings are rarely this simple! Let’s return to the CartPole\nexample from the start of the chapter\n(","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"nKqfI8ioRG"},{"type":"crossReference","kind":"proof:example","identifier":"cart_pole","label":"cart_pole","children":[{"type":"text","value":"Example ","key":"RnpmO2Hn3M"},{"type":"text","value":"2.1","key":"FihiPqFdoK"}],"template":"Example %s","enumerator":"2.1","resolved":true,"html_id":"cart-pole","key":"aisqFnKxhO"},{"type":"text","value":"). The dynamics (physics) aren’t linear. How\ncan we approximate this by an LQR problem?","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"oRrmfcoFzo"}],"key":"qZwXHYYEZ0"},{"type":"paragraph","position":{"start":{"line":837,"column":1},"end":{"line":840,"column":1}},"children":[{"type":"text","value":"Concretely, let’s consider a ","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"key":"jwqhbc6aYi"},{"type":"emphasis","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"children":[{"type":"text","value":"noise-free","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"key":"Tt3WAsRfCo"}],"key":"LMbl90FQFB"},{"type":"text","value":" problem since, as we saw, the\nnoise doesn’t factor into the optimal policy. Let’s assume the dynamics\nand cost function are stationary, and ignore the terminal state for\nsimplicity:","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"key":"TmE6U5l7IE"}],"key":"QkdRWfrYys"},{"type":"proof","kind":"definition","label":"nonlinear_control","identifier":"nonlinear_control","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Nonlinear control problem","position":{"start":{"line":842,"column":1},"end":{"line":842,"column":1}},"key":"puOi7uMT3C"}],"key":"HCh4lIzatx"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E_{\\st_0} \\left[ \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\act_\\hi) \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\act_\\hi) \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & c(\\st, \\act) = d(\\st, \\st^\\star) + d(\\act, \\act^\\star).\n\\end{aligned}","position":{"start":{"line":847,"column":1},"end":{"line":855,"column":1}},"html":"minπ0,,πH1:SAEx0[h=0H1c(xh,uh)]wherexh+1=f(xh,uh)uh=πh(xh)x0μ0c(x,u)=d(x,x)+d(u,u).\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E_{\\st_0} \\left[ \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\act_\\hi) \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\act_\\hi) \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & c(\\st, \\act) = d(\\st, \\st^\\star) + d(\\act, 
\\act^\\star).\n\\end{aligned}π0,,πH1:SAminwhereEx0[h=0H1c(xh,uh)]xh+1=f(xh,uh)uh=πh(xh)x0μ0c(x,u)=d(x,x)+d(u,u).","enumerator":"2.42","key":"OdqaWXTwEg"},{"type":"paragraph","position":{"start":{"line":857,"column":1},"end":{"line":858,"column":1}},"children":[{"type":"text","value":"Here, ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"H6kSc1raUm"},{"type":"inlineMath","value":"d","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"ddd","key":"cHw8Tvx1um"},{"type":"text","value":" denotes a function that measures the\n“distance” between its two arguments.","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"gAHUa03Xak"}],"key":"Ofy1ijApqE"}],"enumerator":"2.8","html_id":"nonlinear-control","key":"kce4bs83bR"},{"type":"paragraph","position":{"start":{"line":861,"column":1},"end":{"line":871,"column":1}},"children":[{"type":"text","value":"This is now only slightly simplified from the general optimal control\nproblem (see\n","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"PsyCGdE0xj"},{"type":"crossReference","kind":"proof:definition","identifier":"optimal_control","label":"optimal_control","children":[{"type":"text","value":"Definition ","key":"mmJOoMFVCz"},{"type":"text","value":"2.1","key":"PG7Flyn59P"}],"template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"stDvL6CWWg"},{"type":"text","value":"). Here, we don’t know an analytical form\nfor the dynamics ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"FQLgt8eqpc"},{"type":"inlineMath","value":"f","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"html":"fff","key":"bA9Ikdbika"},{"type":"text","value":" or the cost function ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"BRSExWULc9"},{"type":"inlineMath","value":"c","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"html":"ccc","key":"tRck6celdd"},{"type":"text","value":", but we assume that we’re\nable to ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"CpGM399faI"},{"type":"emphasis","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"children":[{"type":"text","value":"query/sample/simulate","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"FQ55BzmLC5"}],"key":"ZAsKHj1Mzu"},{"type":"text","value":" them to get their values at a given\nstate and action. To clarify, consider the case where the dynamics are\ngiven by real world physics. We can’t (yet) write down an expression for\nthe dynamics that we can differentiate or integrate analytically.\nHowever, we can still ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"aP4M4JTbzM"},{"type":"emphasis","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"children":[{"type":"text","value":"simulate","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"zgJrKcFG2h"}],"key":"sc2ATK2TxM"},{"type":"text","value":" the dynamics and cost function by\nrunning a real-world experiment and measuring the resulting states and\ncosts. 
How can we adapt LQR to this more general nonlinear case?","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"xXfMvqh4SL"}],"key":"QyDM9ue2dH"},{"type":"heading","depth":3,"position":{"start":{"line":873,"column":1},"end":{"line":873,"column":1}},"children":[{"type":"text","value":"Local linearization","position":{"start":{"line":873,"column":1},"end":{"line":873,"column":1}},"key":"nWkr8wVLQ0"}],"identifier":"local-linearization","label":"Local linearization","html_id":"local-linearization","implicit":true,"enumerator":"2.6.1","key":"kjdLetXABc"},{"type":"paragraph","position":{"start":{"line":875,"column":1},"end":{"line":883,"column":1}},"children":[{"type":"text","value":"How can we apply LQR when the dynamics are nonlinear or the cost\nfunction is more complex? We’ll exploit the useful fact that we can take\na function that’s ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"dODafYNgvJ"},{"type":"emphasis","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"children":[{"type":"text","value":"locally continuous","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"prHtUfUPR4"}],"key":"omBmuqbzrb"},{"type":"text","value":" around ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"FRBA3Weg3M"},{"type":"inlineMath","value":"(s^\\star, a^\\star)","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"(s,a)(s^\\star, a^\\star)(s,a)","key":"VdH3hfV3b7"},{"type":"text","value":" and\napproximate it nearby with low-order polynomials (i.e. its Taylor\napproximation). In particular, as long as the dynamics ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"xhUFrsYeKo"},{"type":"inlineMath","value":"f","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"fff","key":"befc1vTRhp"},{"type":"text","value":" are\ndifferentiable around ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"gzqVUUVkE9"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"tFqZh3p5Lc"},{"type":"text","value":" and the cost function\n","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"RyHWvz6Xu6"},{"type":"inlineMath","value":"c","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"ccc","key":"eWLFBQatAh"},{"type":"text","value":" is twice differentiable at ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"EqsZgbOTKn"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"pYfpdb8Z42"},{"type":"text","value":", we can take a\nlinear approximation of ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"Vlt83kyUYW"},{"type":"inlineMath","value":"f","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"fff","key":"FfkwuWIyD8"},{"type":"text","value":" and a quadratic approximation of ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"qPtx0U2NEo"},{"type":"inlineMath","value":"c","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"ccc","key":"JRXTJG8vqA"},{"type":"text","value":" 
to\nbring us back to the regime of LQR.","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"yusJ7SKusB"}],"key":"onEqFVxsEh"},{"type":"paragraph","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"children":[{"type":"text","value":"Linearizing the dynamics around ","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"key":"keUasQ0rzp"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"Mgh2wVmvYw"},{"type":"text","value":" gives:","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"key":"iA0q3qtSbX"}],"key":"briyFsWuts"},{"type":"math","value":"\\begin{gathered}\n f(\\st, \\act) \\approx f(\\st^\\star, \\act^\\star) + \\nabla_\\st f(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act f(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n (\\nabla_\\st f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\st_j}, \\quad i, j \\le n_\\st \\qquad (\\nabla_\\act f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\act_j}, \\quad i \\le n_\\st, j \\le n_\\act\n\\end{gathered}","position":{"start":{"line":888,"column":1},"end":{"line":893,"column":1}},"html":"f(x,u)f(x,u)+xf(x,u)(xx)+uf(x,u)(uu)(xf(x,u))ij=dfi(x,u)dxj,i,jnx(uf(x,u))ij=dfi(x,u)duj,inx,jnu\\begin{gathered}\n f(\\st, \\act) \\approx f(\\st^\\star, \\act^\\star) + \\nabla_\\st f(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act f(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n (\\nabla_\\st f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\st_j}, \\quad i, j \\le n_\\st \\qquad (\\nabla_\\act f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\act_j}, \\quad i \\le n_\\st, j \\le n_\\act\n\\end{gathered}f(x,u)f(x,u)+xf(x,u)(xx)+uf(x,u)(uu)(xf(x,u))ij=dxjdfi(x,u),i,jnx(uf(x,u))ij=dujdfi(x,u),inx,jnu","enumerator":"2.43","key":"YVWoDUERfk"},{"type":"paragraph","position":{"start":{"line":895,"column":1},"end":{"line":896,"column":1}},"children":[{"type":"text","value":"and quadratizing the cost function around\n","position":{"start":{"line":895,"column":1},"end":{"line":895,"column":1}},"key":"v0oFErH8Jk"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":895,"column":1},"end":{"line":895,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"xoPWsbAUI4"},{"type":"text","value":" gives:","position":{"start":{"line":895,"column":1},"end":{"line":895,"column":1}},"key":"NhVsvTd3if"}],"key":"ElAorhwPQj"},{"type":"math","value":"\\begin{aligned}\n c(\\st, \\act) & \\approx c(\\st^\\star, \\act^\\star) \\quad \\text{constant term} \\\\\n & \\qquad + \\nabla_\\st c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act c(\\st^\\star, \\act^\\star) (a - \\act^\\star) \\quad \\text{linear terms} \\\\\n & \\left. 
\\begin{aligned}\n & \\qquad + \\frac{1}{2} (\\st - \\st^\\star)^\\top \\nabla_{\\st \\st} c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) \\\\\n & \\qquad + \\frac{1}{2} (\\act - \\act^\\star)^\\top \\nabla_{\\act \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n & \\qquad + (\\st - \\st^\\star)^\\top \\nabla_{\\st \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star)\n \\end{aligned} \\right\\} \\text{quadratic terms}\n\\end{aligned}","position":{"start":{"line":898,"column":1},"end":{"line":908,"column":1}},"html":"c(x,u)c(x,u)constant term+xc(x,u)(xx)+uc(x,u)(au)linear terms+12(xx)xxc(x,u)(xx)+12(uu)uuc(x,u)(uu)+(xx)xuc(x,u)(uu)}quadratic terms\\begin{aligned}\n c(\\st, \\act) & \\approx c(\\st^\\star, \\act^\\star) \\quad \\text{constant term} \\\\\n & \\qquad + \\nabla_\\st c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act c(\\st^\\star, \\act^\\star) (a - \\act^\\star) \\quad \\text{linear terms} \\\\\n & \\left. \\begin{aligned}\n & \\qquad + \\frac{1}{2} (\\st - \\st^\\star)^\\top \\nabla_{\\st \\st} c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) \\\\\n & \\qquad + \\frac{1}{2} (\\act - \\act^\\star)^\\top \\nabla_{\\act \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n & \\qquad + (\\st - \\st^\\star)^\\top \\nabla_{\\st \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star)\n \\end{aligned} \\right\\} \\text{quadratic terms}\n\\end{aligned}c(x,u)c(x,u)constant term+xc(x,u)(xx)+uc(x,u)(au)linear terms+21(xx)xxc(x,u)(xx)+21(uu)uuc(x,u)(uu)+(xx)xuc(x,u)(uu)quadratic terms","enumerator":"2.44","key":"qAzuT7C674"},{"type":"paragraph","position":{"start":{"line":910,"column":1},"end":{"line":910,"column":1}},"children":[{"type":"text","value":"where the gradients and Hessians are defined as","position":{"start":{"line":910,"column":1},"end":{"line":910,"column":1}},"key":"UQrJy8QjY6"}],"key":"R2eagbwm9g"},{"type":"math","value":"\\begin{aligned}\n (\\nabla_\\st c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\st_i}, \\quad i \\le n_\\st\n & (\\nabla_\\act c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\act_i}, \\quad i \\le n_\\act \\\\\n (\\nabla_{\\st \\st} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\st_j}, \\quad i, j \\le n_\\st\n & (\\nabla_{\\act \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\act_i d \\act_j}, \\quad i, j \\le n_\\act \\\\\n (\\nabla_{\\st \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\act_j}. \\quad i \\le n_\\st, j \\le n_\\act\n\\end{aligned}","position":{"start":{"line":913,"column":1},"end":{"line":921,"column":1}},"html":"(xc(x,u))i=dc(x,u)dxi,inx(uc(x,u))i=dc(x,u)dui,inu(xxc(x,u))ij=d2c(x,u)dxidxj,i,jnx(uuc(x,u))ij=d2c(x,u)duiduj,i,jnu(xuc(x,u))ij=d2c(x,u)dxiduj.inx,jnu\\begin{aligned}\n (\\nabla_\\st c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\st_i}, \\quad i \\le n_\\st\n & (\\nabla_\\act c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\act_i}, \\quad i \\le n_\\act \\\\\n (\\nabla_{\\st \\st} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\st_j}, \\quad i, j \\le n_\\st\n & (\\nabla_{\\act \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\act_i d \\act_j}, \\quad i, j \\le n_\\act \\\\\n (\\nabla_{\\st \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\act_j}. 
\\quad i \\le n_\\st, j \\le n_\\act\n\\end{aligned}(xc(x,u))i(xxc(x,u))ij(xuc(x,u))ij=dxidc(x,u),inx=dxidxjd2c(x,u),i,jnx=dxidujd2c(x,u).inx,jnu(uc(x,u))i(uuc(x,u))ij=duidc(x,u),inu=duidujd2c(x,u),i,jnu","enumerator":"2.45","key":"yeotyWWusV"},{"type":"paragraph","position":{"start":{"line":925,"column":1},"end":{"line":928,"column":1}},"children":[{"type":"strong","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"rc0GIHW8Ut"}],"key":"VmzbEaIHv1"},{"type":"text","value":" Note that this cost can be expressed in the general\nquadratic form seen in\n","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"YHAeoIfobP"},{"type":"crossReference","kind":"equation","identifier":"general_quadratic_cost","label":"general_quadratic_cost","children":[{"type":"text","value":"(","key":"RWo47nSt9K"},{"type":"text","value":"2.38","key":"LsVGvTPZ02"},{"type":"text","value":")","key":"WHAo3NKY0K"}],"template":"(%s)","enumerator":"2.38","resolved":true,"html_id":"general-quadratic-cost","key":"Gh2JCpXMCI"},{"type":"text","value":". Derive the corresponding\nquantities ","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"zgvHB913xY"},{"type":"inlineMath","value":"Q, R, M, q, r, c","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"html":"Q,R,M,q,r,cQ, R, M, q, r, cQ,R,M,q,r,c","key":"IBMTS4vOSw"},{"type":"text","value":".","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"CtElK04ds5"}],"key":"PgTfUKzSEZ"},{"type":"heading","depth":3,"position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"children":[{"type":"text","value":"Finite differencing","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"key":"iK7PphstuY"}],"identifier":"finite-differencing","label":"Finite differencing","html_id":"finite-differencing","implicit":true,"enumerator":"2.6.2","key":"srK0lb4ytz"},{"type":"paragraph","position":{"start":{"line":932,"column":1},"end":{"line":936,"column":1}},"children":[{"type":"text","value":"To calculate these gradients and Hessians in practice,\nwe use a method known as ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"ljJO34eDyu"},{"type":"strong","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"children":[{"type":"text","value":"finite differencing","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"cTIVOi33Zh"}],"key":"V4ENUMuhp1"},{"type":"text","value":" for numerically computing derivatives.\nNamely, we can simply use the limit definition of the derivative, and\nsee how the function changes as we add or subtract a tiny ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"V4KBz22iLM"},{"type":"text","value":"δ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"nBb6tJ3Iwm"},{"type":"text","value":" to\nthe input.","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"rLDIUZML0F"}],"key":"aH5m0LS0OF"},{"type":"math","value":"\\frac{d}{dx} f(x) = \\lim_{\\delta \\to 0} \\frac{f(x + \\delta) - f(x)}{\\delta}","position":{"start":{"line":939,"column":1},"end":{"line":941,"column":1}},"html":"ddxf(x)=limδ0f(x+δ)f(x)δ\\frac{d}{dx} f(x) = \\lim_{\\delta \\to 0} \\frac{f(x + \\delta) - 
f(x)}{\\delta}dxdf(x)=δ0limδf(x+δ)f(x)","enumerator":"2.46","key":"IhJjX3J9VH"},{"type":"paragraph","position":{"start":{"line":944,"column":1},"end":{"line":946,"column":1}},"children":[{"type":"text","value":"Note that this only requires us to be able to ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"xCou6WIgQp"},{"type":"emphasis","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"children":[{"type":"text","value":"query","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"SlmFJXJBvU"}],"key":"qoY2Ept2oN"},{"type":"text","value":" the function, not\nto have an analytical expression for it, which is why it’s so useful in\npractice.","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"JmGnCvVJgX"}],"key":"p0fR7pDL5g"},{"type":"heading","depth":3,"position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"Local convexification","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"cg4lvlZ7qv"}],"identifier":"local-convexification","label":"Local convexification","html_id":"local-convexification","implicit":true,"enumerator":"2.6.3","key":"tD3hgTJ3iL"},{"type":"paragraph","position":{"start":{"line":950,"column":1},"end":{"line":953,"column":1}},"children":[{"type":"text","value":"However, simply taking the second-order approximation of the cost\nfunction is insufficient, since for the LQR setup we required that the\n","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"PLDlckpOxa"},{"type":"inlineMath","value":"Q","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"html":"QQQ","key":"DgXSADD7Z7"},{"type":"text","value":" and ","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"JwshxoUApj"},{"type":"inlineMath","value":"R","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"html":"RRR","key":"wiED909In9"},{"type":"text","value":" matrices were positive definite, i.e. 
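For example, here is one possible sketch of a finite-difference Jacobian
(the function name and interface are illustrative, and we use a central
difference rather than the one-sided limit above, which tends to be
slightly more accurate in practice). Applied to the dynamics, it yields
the matrices $A \approx \nabla_\st f(\st^\star, \act^\star)$ and
$B \approx \nabla_\act f(\st^\star, \act^\star)$ of the linearization:

```python
import numpy as np

def finite_diff_jacobian(f, x, delta=1e-5):
    """Approximate the Jacobian of f at x by central finite differences.

    f maps an n-dimensional vector to an m-dimensional vector; we only
    need to be able to *query* f, not differentiate it analytically.
    """
    x = np.asarray(x, dtype=float)
    m = np.atleast_1d(f(x)).shape[0]
    J = np.zeros((m, x.shape[0]))
    for j in range(x.shape[0]):
        e = np.zeros_like(x)
        e[j] = delta
        # (f(x + delta e_j) - f(x - delta e_j)) / (2 delta)
        J[:, j] = (np.atleast_1d(f(x + e)) - np.atleast_1d(f(x - e))) / (2 * delta)
    return J

# For dynamics f(x, u), the linearization matrices are approximately
#   A = finite_diff_jacobian(lambda x: f(x, u_star), x_star)
#   B = finite_diff_jacobian(lambda u: f(x_star, u), u_star)
```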
### Local convexification

However, simply taking the second-order approximation of the cost
function is insufficient, since for the LQR setup we required that the
$Q$ and $R$ matrices were positive definite, i.e. that all of their
eigenvalues were positive.

One way to naively *force* some symmetric matrix $D$ to be positive definite
is to set any non-positive eigenvalues to some small positive value $\varepsilon > 0$.
Recall that any real symmetric matrix $D \in \mathbb{R}^{n \times n}$ has an
orthonormal basis of eigenvectors $u_1, \dots, u_n$
with corresponding eigenvalues $\lambda_1, \dots, \lambda_n$
such that $D u_i = \lambda_i u_i$.
Then we can construct the positive definite approximation by

$$\widetilde{D} = \left( \sum_{i=1, \dots, n \mid \lambda_i > 0} \lambda_i u_i u_i^\top \right) + \varepsilon I.$$

**Exercise:** Convince yourself that $\widetilde{D}$ is indeed positive
definite.
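A small NumPy sketch of this construction (the function name is our own;
`np.linalg.eigh` returns an orthonormal eigenbasis for symmetric
matrices):

```python
import numpy as np

def make_positive_definite(D, eps=1e-6):
    """Positive definite approximation of a symmetric matrix D:
    keep the positive part of the spectrum and add eps * I."""
    eigvals, eigvecs = np.linalg.eigh(D)
    kept = eigvals > 0
    # Sum of lambda_i u_i u_i^T over the positive eigenvalues
    positive_part = (eigvecs[:, kept] * eigvals[kept]) @ eigvecs[:, kept].T
    return positive_part + eps * np.eye(D.shape[0])
```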
","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"fwmyIlhC08"},{"type":"inlineMath","value":"\\st^\\star","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"html":"x\\st^\\starx","key":"nEfQ2BEvOU"},{"type":"text","value":" or want\nto use actions far from ","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"tpAo5e5vDB"},{"type":"inlineMath","value":"\\act^\\star","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"html":"u\\act^\\staru","key":"HOnhEosy8E"},{"type":"text","value":"? A Taylor approximation is only\naccurate in a ","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"otBAPs7nAk"},{"type":"emphasis","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"children":[{"type":"text","value":"local","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"cGVByoXFdY"}],"key":"MazjOCtvW3"},{"type":"text","value":" region around the point of linearization, so the\nperformance of our LQR controller will degrade as we move further away.\nWe’ll see how to address this in the next section using the ","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"vJYLaBMsrm"},{"type":"strong","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"children":[{"type":"text","value":"iterative LQR","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"zawpVclttI"}],"key":"vAVam7I4YH"},{"type":"text","value":" algorithm.","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"p4sC9s1De2"}],"key":"GxIR0Mb5LP"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/log_taylor-41fd83609bdd9fa0d89b4a0510fdfb5a.png","alt":"Local linearization might only be accurate in a small region around the\npoint of linearization.","data":{"altTextIsAutoGenerated":true},"key":"UmtnrEiEk4","urlSource":"shared/log_taylor.png","urlOptimized":"/build/log_taylor-41fd83609bdd9fa0d89b4a0510fdfb5a.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":988,"column":1},"end":{"line":989,"column":1}},"children":[{"type":"captionNumber","kind":"figure","label":"local_linearization","identifier":"local_linearization","html_id":"local-linearization","enumerator":"2.3","children":[{"type":"text","value":"Figure ","key":"s5OG83nY92"},{"type":"text","value":"2.3","key":"zBD17Ge67K"},{"type":"text","value":":","key":"toz9TiJxTD"}],"template":"Figure %s:","key":"NeUDXpx3k9"},{"type":"text","value":"Local linearization might only be accurate in a small region around the\npoint of linearization.","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"jsxQXzobPq"}],"key":"DANgYCYlR2"}],"key":"BIgB6ErPOY"}],"label":"local_linearization","identifier":"local_linearization","enumerator":"2.3","html_id":"local-linearization","key":"f0kXqI10K4"},{"type":"heading","depth":3,"position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"children":[{"type":"text","value":"Iterative LQR","position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"key":"HcgqPcRS44"}],"label":"iterative_lqr","identifier":"iterative_lqr","html_id":"iterative-lqr","enumerator":"2.6.4","key":"TqNQ5Vcvx5"},{"type":"paragraph","position":{"start":{"line":995,"column":1},"end":{"line":999,"column":1}},"children":[{"type":"text","value":"To address these issues with local 
linearization, we’ll use an iterative\napproach, where we repeatedly linearize around different points to\ncreate a ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"zSUJvTmOiv"},{"type":"emphasis","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"time-dependent","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"oOdM4QJTfW"}],"key":"eEVvXi7GQl"},{"type":"text","value":" approximation of the dynamics, and then solve\nthe resulting time-dependent LQR problem to obtain a better policy. This\nis known as ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"PDRhXmYcPH"},{"type":"strong","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"iterative LQR","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"uXC0zhuRSm"}],"key":"mKUWiybstb"},{"type":"text","value":" or ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"Ys6xMqMv0L"},{"type":"strong","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"iLQR","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"wFUQpHlcN0"}],"key":"Mk5tx71hH1"},{"type":"text","value":":","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"rSzSevRe9w"}],"key":"H1VLlYMkdN"},{"type":"proof","kind":"definition","label":"ilqr","identifier":"ilqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Iterative LQR","position":{"start":{"line":1001,"column":1},"end":{"line":1001,"column":1}},"key":"sab7wVc52x"}],"key":"b9mE6w9wnQ"},{"type":"paragraph","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"children":[{"type":"text","value":"For each iteration of the algorithm:","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"BdZi3u1MVW"}],"key":"vuDwavxmFG"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":1006,"column":1},"end":{"line":1011,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":1006,"column":1},"end":{"line":1007,"column":1}},"children":[{"type":"text","value":"Form a time-dependent LQR problem around the current candidate\ntrajectory using local linearization.","position":{"start":{"line":1006,"column":1},"end":{"line":1006,"column":1}},"key":"FnsYYXbk3v"}],"key":"ZhQ3aVxYfX"},{"type":"listItem","spread":true,"position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"children":[{"type":"text","value":"Compute the optimal policy using ","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"key":"LCAqh4y22e"},{"type":"crossReference","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"children":[{"type":"text","value":"Section ","key":"qpzt6GOe25"},{"type":"text","value":"2.5.1","key":"LZnbaMe2Ui"}],"identifier":"time_dep_lqr","label":"time_dep_lqr","kind":"heading","template":"Section 
%s","enumerator":"2.5.1","resolved":true,"html_id":"time-dep-lqr","key":"UTFj4MKioY"},{"type":"text","value":".","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"key":"AOESNZhPmj"}],"key":"OvWnBu8tBl"},{"type":"listItem","spread":true,"position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"children":[{"type":"text","value":"Generate a new series of actions using this policy.","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"FIApQGnNKk"}],"key":"hhEIRf0pSR"},{"type":"listItem","spread":true,"position":{"start":{"line":1010,"column":1},"end":{"line":1011,"column":1}},"children":[{"type":"text","value":"Compute a better candidate trajectory by interpolating between the\ncurrent and proposed actions.","position":{"start":{"line":1010,"column":1},"end":{"line":1010,"column":1}},"key":"LjHFOtviEg"}],"key":"uf9lBX89Bu"}],"key":"nmNISUeT9y"}],"enumerator":"2.9","html_id":"ilqr","key":"o11eGS8IJb"},{"type":"paragraph","position":{"start":{"line":1014,"column":1},"end":{"line":1017,"column":1}},"children":[{"type":"text","value":"Now let’s go through the details of each step. We’ll use superscripts to\ndenote the iteration of the algorithm. We’ll also denote\n","position":{"start":{"line":1014,"column":1},"end":{"line":1014,"column":1}},"key":"Pfz1Ngqz01"},{"type":"inlineMath","value":"\\bar \\st_0 = \\E_{\\st_0 \\sim \\mu_0} [\\st_0]","position":{"start":{"line":1014,"column":1},"end":{"line":1014,"column":1}},"html":"xˉ0=Ex0μ0[x0]\\bar \\st_0 = \\E_{\\st_0 \\sim \\mu_0} [\\st_0]xˉ0=Ex0μ0[x0]","key":"lx5yzxtgNZ"},{"type":"text","value":" as the expected initial\nstate.","position":{"start":{"line":1014,"column":1},"end":{"line":1014,"column":1}},"key":"XWYMy8wsQ5"}],"key":"hCGbzHqCsn"},{"type":"paragraph","position":{"start":{"line":1019,"column":1},"end":{"line":1021,"column":1}},"children":[{"type":"text","value":"At iteration ","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"WR6zg6ghLy"},{"type":"inlineMath","value":"i","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"html":"iii","key":"HaoqJNL4HQ"},{"type":"text","value":" of the algorithm, we begin with a ","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"m6NW0U2gxd"},{"type":"strong","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"children":[{"type":"text","value":"candidate","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"Tm1SVlIvTz"}],"key":"iPYSVhfidq"},{"type":"text","value":"\ntrajectory\n","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"jGmNQsx0wF"},{"type":"inlineMath","value":"\\bar \\tau^i = (\\bar \\st^i_0, \\bar \\act^i_0, \\dots, \\bar \\st^i_{\\hor-1}, \\bar \\act^i_{\\hor-1})","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"html":"τˉi=(xˉ0i,uˉ0i,,xˉH1i,uˉH1i)\\bar \\tau^i = (\\bar \\st^i_0, \\bar \\act^i_0, \\dots, \\bar \\st^i_{\\hor-1}, \\bar \\act^i_{\\hor-1})τˉi=(xˉ0i,uˉ0i,,xˉH1i,uˉH1i)","key":"Ol9Sw7hhR1"},{"type":"text","value":".","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"W3Z3Rrv04i"}],"key":"eaWIeGobOQ"},{"type":"paragraph","position":{"start":{"line":1023,"column":1},"end":{"line":1026,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"children":[{"type":"text","value":"Step 1: Form a time-dependent 
Now let's go through the details of each step. We'll use superscripts to
denote the iteration of the algorithm. We'll also denote
$\bar \st_0 = \E_{\st_0 \sim \mu_0} [\st_0]$ as the expected initial
state.

At iteration $i$ of the algorithm, we begin with a **candidate**
trajectory
$\bar \tau^i = (\bar \st^i_0, \bar \act^i_0, \dots, \bar \st^i_{\hor-1}, \bar \act^i_{\hor-1})$.

**Step 1: Form a time-dependent LQR problem.** At each timestep
$\hi \in [\hor]$, we use the techniques from
[](#approx_nonlinear) to linearize the dynamics and
quadratize the cost function around $(\bar \st^i_\hi, \bar \act^i_\hi)$:

$$\begin{aligned}
    f_\hi(\st, \act) & \approx f(\bar {\st}^i_\hi, \bar {\act}^i_\hi) + \nabla_{\st } f(\bar {\st}^i_\hi, \bar {\act}^i_\hi)(\st - \bar {\st}^i_\hi) + \nabla_{\act } f(\bar {\st}^i_\hi, \bar {\act}^i_\hi)(\act - \bar {\act}^i_\hi) \\
    c_\hi(\st, \act) & \approx c(\bar {\st}^i_\hi, \bar {\act}^i_\hi) + \begin{bmatrix}
    \st - \bar {\st }^i_\hi & \act - \bar {\act}^i_\hi
    \end{bmatrix} \begin{bmatrix}
    \nabla_{\st } c(\bar {\st}^i_\hi, \bar {\act}^i_\hi)\\
    \nabla_{\act} c(\bar {\st}^i_\hi, \bar {\act}^i_\hi)
    \end{bmatrix} \\
    & \qquad + \frac{1}{2} \begin{bmatrix}
    \st - \bar {\st }^i_\hi & \act - \bar {\act}^i_\hi
    \end{bmatrix} \begin{bmatrix}
    \nabla_{\st \st} c(\bar {\st}^i_\hi, \bar {\act}^i_\hi) & \nabla_{\st \act} c(\bar {\st}^i_\hi, \bar {\act}^i_\hi) \\
    \nabla_{\act \st} c(\bar {\st}^i_\hi, \bar {\act}^i_\hi) & \nabla_{\act \act} c(\bar {\st}^i_\hi, \bar {\act}^i_\hi)
    \end{bmatrix}
    \begin{bmatrix}
    \st - \bar {\st }^i_\hi \\
    \act - \bar {\act}^i_\hi
    \end{bmatrix}.
\end{aligned}$$

**Step 2: Compute the optimal policy.** We can now solve the
time-dependent LQR problem using the Riccati equation from
[](#time_dep_lqr) to compute the optimal policy
$\pi^i_0, \dots, \pi^i_{\hor-1}$.

**Step 3: Generate a new series of actions.** We can then generate a new
sample trajectory by taking actions according to this optimal policy:

$$\bar \st^{i+1}_0 = \bar \st_0, \qquad \widetilde \act_\hi = \pi^i_\hi(\bar \st^{i+1}_\hi), \qquad \bar \st^{i+1}_{\hi+1} = f(\bar \st^{i+1}_\hi, \widetilde \act_\hi).$$

Note that the states are sampled according to the *true* dynamics, which
we assume we have query access to.

**Step 4: Compute a better candidate trajectory.** Note that we've
denoted these actions as $\widetilde \act_\hi$ and aren't directly using
them for the next iteration $\bar \act^{i+1}_\hi$. Rather, we want to
*interpolate* between them and the actions from the previous iteration
$\bar \act^i_0, \dots, \bar \act^i_{\hor-1}$. This is so that the cost
will *decrease monotonically,* since if the new policy turns out to
actually be worse, we can stay closer to the previous trajectory.
(Can\nyou think of an intuitive example where this might happen?)","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"CKOd0R4fOi"}],"key":"l3rAl7ayBy"},{"type":"paragraph","position":{"start":{"line":1079,"column":1},"end":{"line":1082,"column":1}},"children":[{"type":"text","value":"Formally, we want to find ","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"key":"DrSRxstLY1"},{"type":"inlineMath","value":"\\alpha \\in [0, 1]","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"html":"α[0,1]\\alpha \\in [0, 1]α[0,1]","key":"IynuSU1dPD"},{"type":"text","value":" to generate the next\niteration of actions\n","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"key":"azTk4GwAYK"},{"type":"inlineMath","value":"\\bar \\act^{i+1}_0, \\dots, \\bar \\act^{i+1}_{\\hor-1}","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"html":"uˉ0i+1,,uˉH1i+1\\bar \\act^{i+1}_0, \\dots, \\bar \\act^{i+1}_{\\hor-1}uˉ0i+1,,uˉH1i+1","key":"ICKYjmRB78"},{"type":"text","value":" such that the cost\nis minimized:","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"key":"pdkCp3L9Zk"}],"key":"gM406weRLS"},{"type":"math","value":"\\begin{aligned}\n    \\min_{\\alpha \\in [0, 1]} \\quad & \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n    \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n    & \\bar \\act^{i+1}_\\hi = \\alpha \\bar \\act^i_\\hi + (1-\\alpha) \\widetilde \\act_\\hi \\\\\n    & \\st_0 = \\bar \\st_0.\n\\end{aligned}","position":{"start":{"line":1084,"column":1},"end":{"line":1091,"column":1}},"html":"minα[0,1]h=0H1c(xh,uˉhi+1)wherexh+1=f(xh,uˉhi+1)uˉhi+1=αuˉhi+(1α)u~hx0=xˉ0.\\begin{aligned}\n    \\min_{\\alpha \\in [0, 1]} \\quad & \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n    \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n    & \\bar \\act^{i+1}_\\hi = \\alpha \\bar \\act^i_\\hi + (1-\\alpha) \\widetilde \\act_\\hi \\\\\n    & \\st_0 = \\bar \\st_0.\n\\end{aligned}α[0,1]minwhereh=0H1c(xh,uˉhi+1)xh+1=f(xh,uˉhi+1)uˉhi+1=αuˉhi+(1α)uhx0=xˉ0.","enumerator":"2.50","key":"ztPinY9uyq"},{"type":"paragraph","position":{"start":{"line":1093,"column":1},"end":{"line":1095,"column":1}},"children":[{"type":"text","value":"Note that this optimizes over the closed interval\n","position":{"start":{"line":1093,"column":1},"end":{"line":1093,"column":1}},"key":"FgR3JhJntd"},{"type":"inlineMath","value":"[0, 1]","position":{"start":{"line":1093,"column":1},"end":{"line":1093,"column":1}},"html":"[0,1][0, 1][0,1]","key":"CXmwfvaVe8"},{"type":"text","value":", so by the Extreme Value Theorem, it’s guaranteed to have a\nglobal minimum.","position":{"start":{"line":1093,"column":1},"end":{"line":1093,"column":1}},"key":"QlkC7o6fXe"}],"key":"A7J5nhBHux"},{"type":"paragraph","position":{"start":{"line":1097,"column":1},"end":{"line":1101,"column":1}},"children":[{"type":"text","value":"The final output of this algorithm is a policy ","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"key":"w1n40z9L1m"},{"type":"inlineMath","value":"\\pi^{n_\\text{steps}}","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"html":"πnsteps\\pi^{n_\\text{steps}}πnsteps","key":"N32BQ337KZ"},{"type":"text","value":"\nderived after 
","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"key":"w6FenIoM5q"},{"type":"inlineMath","value":"n_\\text{steps}","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"html":"nstepsn_\\text{steps}nsteps","key":"V0UCFT6Djb"},{"type":"text","value":" of the algorithm. Though the proof is\nsomewhat complex, one can show that for many nonlinear control problems,\nthis solution converges to a locally optimal solution (in the policy\nspace).","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"key":"dTwPP6O6b8"}],"key":"lNGt0PZyvI"},{"type":"heading","depth":2,"position":{"start":{"line":1103,"column":1},"end":{"line":1103,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":1103,"column":1},"end":{"line":1103,"column":1}},"key":"mm80d5oLp9"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"2.7","key":"VL7QY0LpXM"},{"type":"paragraph","position":{"start":{"line":1105,"column":1},"end":{"line":1112,"column":1}},"children":[{"type":"text","value":"This chapter introduced some approaches to solving different variants of\nthe optimal control problem\n","position":{"start":{"line":1105,"column":1},"end":{"line":1105,"column":1}},"key":"L0gpQtdQho"},{"type":"crossReference","kind":"proof:definition","identifier":"optimal_control","label":"optimal_control","children":[{"type":"text","value":"Definition ","key":"i4o7ZJV6zY"},{"type":"text","value":"2.1","key":"NLNS94V2OF"}],"template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"C9rX2dRV7m"},{"type":"text","value":". We began with the simple case of linear\ndynamics and an upward-curved quadratic cost. This model is called the\nLQR and we solved for the optimal policy using dynamic programming. We\nthen extended these results to the more general nonlinear case via local\nlinearization. 
We finally saw the iterative LQR algorithm for solving\nnonlinear control problems.","position":{"start":{"line":1105,"column":1},"end":{"line":1105,"column":1}},"key":"uCn7vNdJwY"}],"key":"SrKjiY8J4E"}],"key":"Godf9BJFef"}],"key":"vHaXFYpcGJ"},"references":{"cite":{"order":[],"data":{}}},"footer":{"navigation":{"prev":{"title":"1 Markov Decision Processes","url":"/mdps","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"3 Multi-Armed Bandits","url":"/bandits","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"6b86f35044831ffbe0cf07af5eee27ce5d28fea0397ecdc730ddaa67506611c3","slug":"control","location":"/control.md","dependencies":[],"frontmatter":{"title":"2 Linear Quadratic Regulators","numbering":{"all":{"enabled":true},"enumerator":{"template":"2.%s"}},"math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"x"},"\\act":{"macro":"u"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","thumbnail":"/build/rubiks_cube-5d86d5b19a044eede0a3801e51b37815.jpg","thumbnailOptimized":"/build/rubiks_cube-5d86d5b19a044eede0a3801e51b37815.webp","exports":[{"format":"md","filename":"control.md","url":"/build/control-a8c1e7d39cf806d9a073317a2544cfca.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":21,"column":1},"end":{"line":21,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":21,"column":1},"end":{"line":21,"column":1}},"key":"zbOVj6XZbS"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"2.1","key":"Aj4vwnlXoX"},{"type":"paragraph","position":{"start":{"line":23,"column":1},"end":{"line":28,"column":1}},"children":[{"type":"text","value":"Up to this point, we have considered decision problems with finitely\nmany states and actions. However, in many applications, states and\nactions may take on continuous values. For example, consider autonomous\ndriving, controlling a robot’s joints, and automated manufacturing. How\ncan we teach computers to solve these kinds of problems? 
This is the\ntask of ","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"JNdu5lxEAQ"},{"type":"strong","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"children":[{"type":"text","value":"continuous control","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"RTh1xb5xHw"}],"key":"xTKnsr84px"},{"type":"text","value":".","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"sYJWjVMg37"}],"key":"i1Do4n6TVM"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/rubiks_cube-5d86d5b19a044eede0a3801e51b37815.jpg","alt":"Solving a Rubik’s Cube with a robot hand.","data":{"altTextIsAutoGenerated":true},"key":"FUiq7pGzPK","urlSource":"shared/rubiks_cube.jpg","urlOptimized":"/build/rubiks_cube-5d86d5b19a044eede0a3801e51b37815.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"captionNumber","kind":"figure","label":"control_examples","identifier":"control_examples","html_id":"control-examples","enumerator":"2.1","children":[{"type":"text","value":"Figure ","key":"QRDvqDjP4b"},{"type":"text","value":"2.1","key":"SjJeJAt67Q"},{"type":"text","value":":","key":"hFzDYNvRcU"}],"template":"Figure %s:","key":"xjCLDNVkwP"},{"type":"text","value":"Solving a Rubik’s Cube with a robot hand.","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"sYtGA40nRb"}],"key":"awTmIVQBG1"}],"key":"yJfwUmnkFh"}],"label":"control_examples","identifier":"control_examples","enumerator":"2.1","html_id":"control-examples","key":"bRG7d2wc3y"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/boston_dynamics-07bc07f0646e10c0fddbe75b26862eee.jpg","alt":"Boston Dynamics’s Spot robot.","data":{"altTextIsAutoGenerated":true},"key":"j8vWhRoqqC","urlSource":"shared/boston_dynamics.jpg","urlOptimized":"/build/boston_dynamics-07bc07f0646e10c0fddbe75b26862eee.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":39,"column":1},"end":{"line":39,"column":1}},"children":[{"type":"captionNumber","kind":"figure","label":"robot_hand","identifier":"robot_hand","html_id":"robot-hand","enumerator":"2.2","children":[{"type":"text","value":"Figure ","key":"z0x4kLLLBG"},{"type":"text","value":"2.2","key":"r1iJZ6WZKU"},{"type":"text","value":":","key":"IPEjU6IAt0"}],"template":"Figure %s:","key":"PRofzY2SNV"},{"type":"text","value":"Boston Dynamics’s Spot robot.","position":{"start":{"line":39,"column":1},"end":{"line":39,"column":1}},"key":"U6OwHsQAlA"}],"key":"XuVvmTNmg5"}],"key":"M3ydIHksqA"}],"label":"robot_hand","identifier":"robot_hand","enumerator":"2.2","html_id":"robot-hand","key":"AF5gcNBQds"},{"type":"paragraph","position":{"start":{"line":42,"column":1},"end":{"line":46,"column":1}},"children":[{"type":"text","value":"Aside from the change in the state and action spaces, the general\nproblem setup remains the same: we seek to construct an ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"HVFJ13dmr7"},{"type":"emphasis","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"dwcmuvT3bc"}],"key":"wO4F9V6leB"},{"type":"text","value":"\nthat outputs actions to solve the desired task. 
We will see that many\nkey ideas and algorithms, in particular dynamic programming algorithms,\ncarry over to this new setting.","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"mXgyJZFS9J"}],"key":"HcuTX8TINl"},{"type":"paragraph","position":{"start":{"line":48,"column":1},"end":{"line":50,"column":1}},"children":[{"type":"text","value":"This chapter introduces a fundamental tool to solve a simple class of\ncontinuous control problems: the ","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"ys9IAWDx3B"},{"type":"strong","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"linear quadratic regulator","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"Ca1H08sNZw"}],"key":"sxnYPXGQOw"},{"type":"text","value":". We will\nthen extend this basic method to more complex settings.","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"vaYMpZX4WM"}],"key":"dk0IFaJuu2"},{"type":"proof","kind":"example","label":"cart_pole","identifier":"cart_pole","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"CartPole","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"a5IGy6wXh4"}],"key":"lFWWnj0JF1"},{"type":"paragraph","position":{"start":{"line":55,"column":1},"end":{"line":59,"column":1}},"children":[{"type":"text","value":"Try to balance a pencil on its point on a flat surface. It’s much more\ndifficult than it may first seem: the position of the pencil varies\ncontinuously, and the state transitions governing the system, i.e. the\nlaws of physics, are highly complex. This task is equivalent to the\nclassic control problem known as ","position":{"start":{"line":55,"column":1},"end":{"line":55,"column":1}},"key":"mmESh3ob98"},{"type":"emphasis","position":{"start":{"line":55,"column":1},"end":{"line":55,"column":1}},"children":[{"type":"text","value":"CartPole","position":{"start":{"line":55,"column":1},"end":{"line":55,"column":1}},"key":"hWGbVVuEc4"}],"key":"MbpcMdBtU4"},{"type":"text","value":":","position":{"start":{"line":55,"column":1},"end":{"line":55,"column":1}},"key":"phatTyqNpi"}],"key":"atoTvR4Pvf"},{"type":"image","url":"/build/cart_pole-cbbb59437cd1cf4230050ca053220243.png","width":"200px","align":"center","key":"HndPMDvKUA","urlSource":"shared/cart_pole.png","urlOptimized":"/build/cart_pole-cbbb59437cd1cf4230050ca053220243.webp"},{"type":"paragraph","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"The state ","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"KLug8Vx8sx"},{"type":"inlineMath","value":"\\st \\in \\mathbb{R}^4","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"html":"xR4\\st \\in \\mathbb{R}^4xR4","key":"qZu8xAwa7h"},{"type":"text","value":" can be described by:","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"biWo5xTzZa"}],"key":"ORgbIHJi4H"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":67,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":67,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"children":[{"type":"text","value":"the position of the 
cart;","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"key":"uK3tTuxjBF"}],"key":"c5P7MJBmmX"}],"key":"vFrT62HoDW"},{"type":"listItem","spread":true,"position":{"start":{"line":69,"column":1},"end":{"line":70,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"children":[{"type":"text","value":"the velocity of the cart;","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"GqxMVeCfXm"}],"key":"B2k6E167X8"}],"key":"Y7NckJ1KxA"},{"type":"listItem","spread":true,"position":{"start":{"line":71,"column":1},"end":{"line":72,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"the angle of the pole;","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"Ana64Lkskm"}],"key":"ojBr2FE1HB"}],"key":"npJfsg8c8V"},{"type":"listItem","spread":true,"position":{"start":{"line":73,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"children":[{"type":"text","value":"the angular velocity of the pole.","position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"key":"q2RIdCeWP8"}],"key":"rjnMJoASxS"}],"key":"EtnJXd9vzJ"}],"key":"c2tIQmVxiU"},{"type":"paragraph","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"children":[{"type":"text","value":"We can ","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"key":"oay8O86zIx"},{"type":"emphasis","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"children":[{"type":"text","value":"control","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"key":"QPkmyIcfP6"}],"key":"VK5xnNAG8F"},{"type":"text","value":" the cart by applying a horizontal force ","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"key":"AqbpzcF4Q9"},{"type":"inlineMath","value":"\\act \\in \\mathbb{R}","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"html":"uR\\act \\in \\mathbb{R}uR","key":"x4eg63A7fs"},{"type":"text","value":".","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"key":"J2w5eEauSS"}],"key":"kiAf2NZcMz"},{"type":"paragraph","position":{"start":{"line":77,"column":1},"end":{"line":78,"column":1}},"children":[{"type":"strong","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"children":[{"type":"text","value":"Goal:","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"gsBLF526pd"}],"key":"ebLueaUIpS"},{"type":"text","value":" Stabilize the cart around an ideal state and action\n","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"UbrOsgBM5d"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"Oig6Djpt11"},{"type":"text","value":".","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"EkRV4kHMCP"}],"key":"mnugpud7t2"}],"enumerator":"2.1","html_id":"cart-pole","key":"pYWhKxN52L"},{"type":"heading","depth":2,"position":{"start":{"line":81,"column":1},"end":{"line":81,"column":1}},"children":[{"type":"text","value":"Optimal 
control","position":{"start":{"line":81,"column":1},"end":{"line":81,"column":1}},"key":"QGsfy0nSHu"}],"identifier":"optimal-control","label":"Optimal control","html_id":"optimal-control","implicit":true,"enumerator":"2.2","key":"t3FydiBwNg"},{"type":"paragraph","position":{"start":{"line":83,"column":1},"end":{"line":86,"column":1}},"children":[{"type":"text","value":"Recall that an MDP is defined by its state space ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"qliuX9weuh"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"S\\mathcal{S}S","key":"iNYPMifMTj"},{"type":"text","value":", action space\n","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"LYB8J9V3uE"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"A\\mathcal{A}A","key":"Q8igNdTXJt"},{"type":"text","value":", state transitions ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"kbh2aLGYIB"},{"type":"inlineMath","value":"P","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"PPP","key":"pgrXd3ir41"},{"type":"text","value":", reward function ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"AoeIUnAeWI"},{"type":"inlineMath","value":"r","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"rrr","key":"OCL9djG3cw"},{"type":"text","value":", and discount factor\n","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"GL473gBnWU"},{"type":"text","value":"γ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"cU0CvgL79c"},{"type":"text","value":" or time horizon ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"l0rUPCcxqr"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"H\\horH","key":"F6ZnCFQbBl"},{"type":"text","value":". 
These have equivalents in the control\nsetting:","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"Ne6OEIY1fw"}],"key":"hXZs8PjVuN"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":88,"column":1},"end":{"line":114,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":88,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":88,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"The state and action spaces are ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"KniHRtAqeo"},{"type":"emphasis","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"text","value":"continuous","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"rmPWf0CPA8"}],"key":"bF18zXXzyr"},{"type":"text","value":" rather than finite.\nThat is, ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"w5Ud0ePrmX"},{"type":"inlineMath","value":"\\mathcal{S} \\subseteq \\mathbb{R}^{n_\\st}","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"SRnx\\mathcal{S} \\subseteq \\mathbb{R}^{n_\\st}SRnx","key":"Edt34ZCBRS"},{"type":"text","value":" and ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"c1ExgZFQVF"},{"type":"inlineMath","value":"\\mathcal{A} \\subseteq \\mathbb{R}^{n_\\act}","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"ARnu\\mathcal{A} \\subseteq \\mathbb{R}^{n_\\act}ARnu","key":"FGVE5s4gBC"},{"type":"text","value":",\nwhere ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"fyBDqEWPJO"},{"type":"inlineMath","value":"n_\\st","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"nxn_\\stnx","key":"uQePD2PTKD"},{"type":"text","value":" and ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"JhO1m8wZfp"},{"type":"inlineMath","value":"n_\\act","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"nun_\\actnu","key":"iKFB131n56"},{"type":"text","value":" are the corresponding dimensions of these\nspaces, i.e. the number of coordinates to specify a single state or\naction respectively.","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"Kh9IDC8XMD"}],"key":"f4AJHhq1Bv"}],"key":"H9FjfYEIZ1"},{"type":"listItem","spread":true,"position":{"start":{"line":94,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":94,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"text","value":"We call the state transitions the ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"kD66ctE5hH"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"dynamics","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"Pa09CTNDFQ"}],"key":"viti0W0GTM"},{"type":"text","value":" of the system. 
In the\nmost general case, these might change across timesteps and also\ninclude some stochastic ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"naOstxpn7C"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"noise","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"G1xAbXB0Ja"}],"key":"Xp9vbISBr5"},{"type":"text","value":" ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"N2nvoFmF0S"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"whw_\\hiwh","key":"j5V6z8JeeG"},{"type":"text","value":" at each timestep. We\ndenote these dynamics as the function ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"gHBz7uhjyU"},{"type":"inlineMath","value":"f_\\hi","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"fhf_\\hifh","key":"en9MttNuRR"},{"type":"text","value":" such that\n","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"r8Ej64yJEO"},{"type":"inlineMath","value":"\\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi)","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"xh+1=fh(xh,uh,wh)\\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi)xh+1=fh(xh,uh,wh)","key":"SXhQ3G8doz"},{"type":"text","value":". Of course, we can\nsimplify to cases where the dynamics are ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"vRs90MoTZ6"},{"type":"emphasis","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"deterministic/noise-free","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"jXrYMngDdx"}],"key":"kKfolJAnP9"},{"type":"text","value":"\n(no ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"sjJTRSRuH9"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"whw_\\hiwh","key":"xyOk9ncEu8"},{"type":"text","value":" term) and/or ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"aKRtuFsp53"},{"type":"emphasis","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"time-homogeneous","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"N4L9r1Tiu5"}],"key":"i7fx1RAMgz"},{"type":"text","value":" (the same function ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"YoGQqt8cIO"},{"type":"inlineMath","value":"f","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"fff","key":"ir4hd3RPvg"},{"type":"text","value":"\nacross timesteps).","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"HlCN4F3BAo"}],"key":"fq8AfvWxAl"}],"key":"CFR94mLBSW"},{"type":"listItem","spread":true,"position":{"start":{"line":103,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":103,"column":1},"end":{"line":108,"column":1}},"children":[{"type":"text","value":"Instead of maximizing the reward function, we seek to minimize 
the\n","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"Cn4JFDHbiL"},{"type":"strong","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"cost function","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"y4mWrov85G"}],"key":"EDT7mfx3Tq"},{"type":"text","value":" ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"UFm8YggqVr"},{"type":"inlineMath","value":"c_\\hi: \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"html":"ch:S×ARc_\\hi: \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}ch:S×AR","key":"C9lsb48W7G"},{"type":"text","value":". Often, the cost\nfunction describes ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"eFDhE3PhHn"},{"type":"emphasis","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"how far away","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"f0TgNYSJIX"}],"key":"fxnO7ltCg4"},{"type":"text","value":" we are from a ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"q9xDCsAtOn"},{"type":"strong","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"target\nstate-action pair","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"tKSfeJIrbj"}],"key":"jyoxeGCCh7"},{"type":"text","value":" ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"obgT4kKwGS"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"bg4puiWptm"},{"type":"text","value":". 
An important special\ncase is when the cost is ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"nbBE7bL4sO"},{"type":"emphasis","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"time-homogeneous","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"AXWF2skPof"}],"key":"BKKxAyxwye"},{"type":"text","value":"; that is, it remains the\nsame function ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"glLMrV3rFT"},{"type":"inlineMath","value":"c","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"html":"ccc","key":"T5jWQ3j6EW"},{"type":"text","value":" at each timestep ","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"TxMtwDoEFq"},{"type":"inlineMath","value":"h","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"html":"hhh","key":"OuxB3c5hL3"},{"type":"text","value":".","position":{"start":{"line":103,"column":1},"end":{"line":103,"column":1}},"key":"pqfEoEzJtf"}],"key":"UmQbe1kQpP"}],"key":"xRRGTuYrju"},{"type":"listItem","spread":true,"position":{"start":{"line":110,"column":1},"end":{"line":114,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":110,"column":1},"end":{"line":113,"column":1}},"children":[{"type":"text","value":"We seek to minimize the ","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"F0gLrgXL5T"},{"type":"emphasis","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"text","value":"undiscounted","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"zYgh4X1Oy9"}],"key":"noRqBAbJ10"},{"type":"text","value":" cost within a ","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"GzRzMHUrO8"},{"type":"emphasis","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"text","value":"finite time\nhorizon","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"OvVq77TvOH"}],"key":"AXS5ur1R4h"},{"type":"text","value":" ","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"AdqGfjkQOU"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"html":"H\\horH","key":"MW2xKhBfX4"},{"type":"text","value":". 
Note that we end an episode at the final state\n","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"SpcypwvSCN"},{"type":"inlineMath","value":"\\st_\\hor","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"html":"xH\\st_\\horxH","key":"eBXiYZncF9"},{"type":"text","value":" -- there is no ","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"eteKrEF6Yn"},{"type":"inlineMath","value":"\\act_\\hor","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"html":"uH\\act_\\horuH","key":"JRES2TguYq"},{"type":"text","value":", and so we denote the cost for\nthe final state as ","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"aBeMgvhSbX"},{"type":"inlineMath","value":"c_\\hor(\\st_\\hor)","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"html":"cH(xH)c_\\hor(\\st_\\hor)cH(xH)","key":"fAbw5OjITM"},{"type":"text","value":".","position":{"start":{"line":110,"column":1},"end":{"line":110,"column":1}},"key":"RFnsjYlKxO"}],"key":"hvbwdQDRED"}],"key":"wjt1FVZvkg"}],"key":"cWwAtm6mQh"},{"type":"paragraph","position":{"start":{"line":115,"column":1},"end":{"line":120,"column":1}},"children":[{"type":"text","value":"With all of these components, we can now formulate the ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"SJV4E40yxN"},{"type":"strong","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"optimal control\nproblem:","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"gKEUcDCjwM"}],"key":"XctxTb2SJ8"},{"type":"text","value":" ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"MRUnonYYih"},{"type":"emphasis","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"compute a policy to minimize the expected undiscounted cost\nover ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"bD80P4lnUs"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"H\\horH","key":"N4OsC8wn4g"},{"type":"text","value":" timesteps.","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"r97wXDQteZ"}],"key":"CkiB9vgBzR"},{"type":"text","value":" In this chapter, we will only consider\n","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"n738gkreIr"},{"type":"emphasis","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"deterministic, time-dependent","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"OjGTq9Zjgk"}],"key":"B9lH8IAiC2"},{"type":"text","value":" policies\n","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"H664aKwiYM"},{"type":"inlineMath","value":"\\pi = (\\pi_0, \\dots, \\pi_{H-1})","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"π=(π0,,πH1)\\pi = (\\pi_0, \\dots, \\pi_{H-1})π=(π0,,πH1)","key":"XSNcEnysrx"},{"type":"text","value":" where ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"UDbhxSf22m"},{"type":"inlineMath","value":"\\pi_h : \\mathcal{S} \\to \\mathcal{A}","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"πh:SA\\pi_h : \\mathcal{S} \\to 
\\mathcal{A}πh:SA","key":"NTNl8lixfg"},{"type":"text","value":" for each\n","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"QMq0ikNVVa"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"pxuqFMJsKl"},{"type":"text","value":".","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"v964xpyCzg"}],"key":"jIRGY2Diox"},{"type":"proof","kind":"definition","label":"optimal_control","identifier":"optimal_control","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"General optimal control problem","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"sb84cgS1Z0"}],"key":"XMYsPOehMm"},{"type":"math","value":"\\begin{split}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[\n \\left( \\sum_{\\hi=0}^{\\hor-1} c_\\hi(\\st_\\hi, \\act_\\hi) \\right) + c_\\hor(\\st_\\hor)\n \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi), \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & w_\\hi \\sim \\text{noise}\n\\end{split}","position":{"start":{"line":125,"column":1},"end":{"line":135,"column":1}},"html":"minπ0,,πH1:SAE[(h=0H1ch(xh,uh))+cH(xH)]wherexh+1=fh(xh,uh,wh),uh=πh(xh)x0μ0whnoise\\begin{split}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[\n \\left( \\sum_{\\hi=0}^{\\hor-1} c_\\hi(\\st_\\hi, \\act_\\hi) \\right) + c_\\hor(\\st_\\hor)\n \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi), \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & w_\\hi \\sim \\text{noise}\n\\end{split}π0,,πH1:SAminwhereE[(h=0H1ch(xh,uh))+cH(xH)]xh+1=fh(xh,uh,wh),uh=πh(xh)x0μ0whnoise","enumerator":"2.1","key":"hn1kBGMR45"}],"enumerator":"2.1","html_id":"optimal-control","key":"BZ12ay2Sfw"},{"type":"heading","depth":3,"position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"children":[{"type":"text","value":"A first attempt: Discretization","position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"key":"JJPa8o7TCY"}],"identifier":"a-first-attempt-discretization","label":"A first attempt: Discretization","html_id":"a-first-attempt-discretization","implicit":true,"enumerator":"2.2.1","key":"lrEtRUVg6I"},{"type":"paragraph","position":{"start":{"line":140,"column":1},"end":{"line":143,"column":1}},"children":[{"type":"text","value":"Can we solve this problem using tools from the finite MDP setting? 
If\n","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"XAEQ53Xaj9"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"S\\mathcal{S}S","key":"IepE2sk05U"},{"type":"text","value":" and ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"niJEbrTXUo"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"A\\mathcal{A}A","key":"Nt8UQP9WpH"},{"type":"text","value":" were finite, then we’d be able to work backwards using the DP algorithm for computing the optimal policy in an MDP (","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"dF1St0kopK"},{"type":"crossReference","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"children":[{"type":"text","value":"Definition ","key":"SYuqcBHLbg"},{"type":"text","value":"1.11","key":"wD5CDix3RD"}],"identifier":"pi_star_dp","label":"pi_star_dp","kind":"proof:definition","template":"Definition %s","enumerator":"1.11","resolved":true,"html_id":"pi-star-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"fYU8FvwhYs"},{"type":"text","value":").\nThis inspires us to try ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"N5V1cvEnFq"},{"type":"emphasis","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"children":[{"type":"text","value":"discretizing","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"bkNyUAly10"}],"key":"jHWPSOALAE"},{"type":"text","value":" the\nproblem.","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"we2PwTSIcA"}],"key":"IbkSHHbYKg"},{"type":"paragraph","position":{"start":{"line":145,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"Suppose ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"YWvsdsyR2v"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"S\\mathcal{S}S","key":"cLVe5KoPsK"},{"type":"text","value":" and ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"XyKU03OFeO"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"A\\mathcal{A}A","key":"fl4aXbL5My"},{"type":"text","value":" are bounded, that is,\n","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"XsiHmkBqUH"},{"type":"inlineMath","value":"\\max_{\\st \\in \\mathcal{S}} \\|\\st\\| \\le B_\\st","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"maxxSxBx\\max_{\\st \\in \\mathcal{S}} \\|\\st\\| \\le B_\\stmaxxSxBx","key":"jRC3sGdezA"},{"type":"text","value":" and\n","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"kVlZPLDoxk"},{"type":"inlineMath","value":"\\max_{\\act \\in \\mathcal{A}} \\|\\act\\| \\le B_\\act","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"maxuAuBu\\max_{\\act \\in \\mathcal{A}} \\|\\act\\| \\le B_\\actmaxuAuBu","key":"AHaP3ng05l"},{"type":"text","value":". 
To make ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"TbTvea8KAJ"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"S\\mathcal{S}S","key":"NrjZT7wmlJ"},{"type":"text","value":" and ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"Fq1SyIEmmC"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"A\\mathcal{A}A","key":"zEQeUCtMiR"},{"type":"text","value":" finite,\nlet’s choose some small positive ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"ux5jQPPpdB"},{"type":"text","value":"ε","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"lkYGcL39hY"},{"type":"text","value":", and simply round each\ncoordinate to the nearest multiple of ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"BZnEgC3GHu"},{"type":"text","value":"ε","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"oh59YfnzMV"},{"type":"text","value":". For example, if\n","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"OrcfY4Dkm3"},{"type":"inlineMath","value":"\\epsilon = 0.01","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"ϵ=0.01\\epsilon = 0.01ϵ=0.01","key":"fCUTmO1ATa"},{"type":"text","value":", then we round each element of ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"AlBWdiP7Hw"},{"type":"inlineMath","value":"\\st","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"x\\stx","key":"hbQmHHmshj"},{"type":"text","value":" and ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"kXp1GMvl5Y"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"u\\actu","key":"Pb3YiyY19F"},{"type":"text","value":" to two\ndecimal spaces.","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"kH6i18hpb0"}],"key":"HnG6Kzmt5Z"},{"type":"paragraph","position":{"start":{"line":153,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"However, the discretized ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"uTzsmbULxB"},{"type":"inlineMath","value":"\\widetilde{\\mathcal{S}}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S~\\widetilde{\\mathcal{S}}S","key":"rsbhmDuX8P"},{"type":"text","value":" and ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"kQwP9NNFl7"},{"type":"inlineMath","value":"\\widetilde{\\mathcal{A}}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"A~\\widetilde{\\mathcal{A}}A","key":"CNd85uj9yx"},{"type":"text","value":" may be finite, but\nthey may be infeasibly large: we must divide ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"VVBDX8bAL8"},{"type":"emphasis","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"text","value":"each dimension","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"eV498jMUgC"}],"key":"yjei7BMPHf"},{"type":"text","value":" into\nintervals of length 
","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"p0PEte8Anw"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"ε\\varepsilonε","key":"FsD7pqVoo4"},{"type":"text","value":", resulting in\n","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"lWZoB2tAqr"},{"type":"inlineMath","value":"|\\widetilde{\\mathcal{S}}| = (B_\\st/\\varepsilon)^{n_\\st}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S~=(Bx/ε)nx|\\widetilde{\\mathcal{S}}| = (B_\\st/\\varepsilon)^{n_\\st}S=(Bx/ε)nx","key":"EMLd2JLuRx"},{"type":"text","value":" and\n","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"a4r9kiqiYB"},{"type":"inlineMath","value":"|\\widetilde{\\mathcal{A}}| = (B_\\act/\\varepsilon)^{n_\\act}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"A~=(Bu/ε)nu|\\widetilde{\\mathcal{A}}| = (B_\\act/\\varepsilon)^{n_\\act}A=(Bu/ε)nu","key":"T0SRdf6v5o"},{"type":"text","value":". To get a sense of how\nquickly this grows, consider ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"K6dQ8DLTOZ"},{"type":"inlineMath","value":"\\varepsilon = 0.01, n_\\st = n_\\act = 10","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"ε=0.01,nx=nu=10\\varepsilon = 0.01, n_\\st = n_\\act = 10ε=0.01,nx=nu=10","key":"VZzixEfNJY"},{"type":"text","value":".\nThen the number of elements in the transition matrix would be\n","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"gLoFtfAcc9"},{"type":"inlineMath","value":"|\\widetilde{\\mathcal{S}}|^2 |\\widetilde{\\mathcal{A}}| = (100^{10})^2 (100^{10}) = 10^{60}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S~2A~=(10010)2(10010)=1060|\\widetilde{\\mathcal{S}}|^2 |\\widetilde{\\mathcal{A}}| = (100^{10})^2 (100^{10}) = 10^{60}S2A=(10010)2(10010)=1060","key":"GCSUnsN66j"},{"type":"text","value":"! (That’s\na trillion trillion trillion trillion trillion.)","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"syLVZDy8Bg"}],"key":"ftQgOSNJah"},{"type":"paragraph","position":{"start":{"line":163,"column":1},"end":{"line":169,"column":1}},"children":[{"type":"text","value":"What properties of the problem could we instead make use of? Note that\nby discretizing the state and action spaces, we implicitly assumed that\nrounding each state or action vector by some tiny amount ","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"KxRBiSHajm"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"html":"ε\\varepsilonε","key":"kB6GcMnCzD"},{"type":"text","value":"\nwouldn’t change the behavior of the system by much; namely, that the\ncost and dynamics were relatively ","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"WsPsTlBvXA"},{"type":"emphasis","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"continuous","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"TFgzwngW44"}],"key":"tcvETJYM3q"},{"type":"text","value":". Can we use this\ncontinuous structure in other ways? 
This leads us to the ","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"yAEB3Onz7I"},{"type":"strong","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"linear\nquadratic regulator","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"pt4mZgLG90"}],"key":"eFj7bOolnC"},{"type":"text","value":".","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"RmxRZrfvgD"}],"key":"lu9ru0GFeY"},{"type":"heading","depth":2,"position":{"start":{"line":172,"column":1},"end":{"line":172,"column":1}},"children":[{"type":"text","value":"The Linear Quadratic Regulator","position":{"start":{"line":172,"column":1},"end":{"line":172,"column":1}},"key":"QzQ0PdDoFg"}],"label":"lqr","identifier":"lqr","html_id":"lqr","enumerator":"2.3","key":"SksyjtcE7a"},{"type":"paragraph","position":{"start":{"line":174,"column":1},"end":{"line":175,"column":1}},"children":[{"type":"text","value":"The optimal control problem ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"vPPWa4P515"},{"type":"crossReference","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"Definition ","key":"uK0Z3fmWgy"},{"type":"text","value":"2.1","key":"MSB5o81PnR"}],"identifier":"optimal_control","label":"optimal_control","kind":"proof:definition","template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"GJY8mIDGHl"},{"type":"text","value":" seems highly complex in general. Is there a relevant simplification that we can analyze?\nThe ","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"umjGORsY8L"},{"type":"strong","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"linear quadratic regulator","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"wW3NZTDVaQ"}],"key":"UJiF7Kb7AT"},{"type":"text","value":" (LQR) is a solvable case and a fundamental tool in control theory.","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"HQeuScIcLC"}],"key":"g36yfmOl7S"},{"type":"proof","kind":"definition","label":"lqr_definition","identifier":"lqr_definition","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The linear quadratic regulator","position":{"start":{"line":177,"column":1},"end":{"line":177,"column":1}},"key":"KTo5XAOdO3"}],"key":"iLWLxJ4Ue9"},{"type":"paragraph","position":{"start":{"line":180,"column":1},"end":{"line":181,"column":1}},"children":[{"type":"text","value":"The LQR problem is a special case of the ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"Q6tzuVGiwo"},{"type":"crossReference","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"General optimal control problem","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"QNjwS6Un2E"}],"identifier":"optimal_control","label":"optimal_control","kind":"proof:definition","template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"XQcFz45aT0"},{"type":"text","value":" with 
","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"yUhPyynFRM"},{"type":"emphasis","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"linear dynamics","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"zwIUmp46st"}],"key":"Z6CZHUjdId"},{"type":"text","value":" and an ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"UNWWDRDp2L"},{"type":"emphasis","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"upward-curved quadratic cost function","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"DwnJ8csCoG"}],"key":"hWJYozlDsr"},{"type":"text","value":".\nSolving the LQR problem will additionally enable us to ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"loMl9HnQsq"},{"type":"emphasis","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"locally approximate","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"rLcJlHzoLf"}],"key":"UEoX2lRODJ"},{"type":"text","value":" more complex setups using ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"tuW1fGTpV1"},{"type":"emphasis","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"children":[{"type":"text","value":"Taylor approximations","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"NiUWoNSWlY"}],"key":"X54ffCBl00"},{"type":"text","value":".","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"DjehXjStjv"}],"key":"CIhanD942Q"},{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"strong","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"Linear, time-homogeneous dynamics","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"tiNoT54trC"}],"key":"WiG2Yg4uah"},{"type":"text","value":": for each timestep ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"zm7Jlq23Ag"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"SaYEhatUYY"},{"type":"text","value":",","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"shb246Er6z"}],"key":"MUgxjPBrUT"},{"type":"math","value":"\\begin{aligned}\n \\st_{\\hi+1} &= f(\\st_\\hi, \\act_\\hi, w_\\hi) = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n \\text{where } w_\\hi &\\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}","position":{"start":{"line":185,"column":1},"end":{"line":190,"column":1}},"html":"xh+1=f(xh,uh,wh)=Axh+Buh+whwhere whN(0,σ2I).\\begin{aligned}\n \\st_{\\hi+1} &= f(\\st_\\hi, \\act_\\hi, w_\\hi) = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n \\text{where } w_\\hi &\\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}xh+1where wh=f(xh,uh,wh)=Axh+Buh+whN(0,σ2I).","enumerator":"2.2","key":"rqHoBuqvKm"},{"type":"paragraph","position":{"start":{"line":192,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"Here, 
","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"wRa9BM9k7d"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"html":"whw_\\hiwh","key":"ztkBKzEaP5"},{"type":"text","value":" is a spherical Gaussian ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"DOFD4l6s0u"},{"type":"strong","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"noise term","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"rtcUTJ5FxC"}],"key":"n7bdoaPGB4"},{"type":"text","value":" that makes the dynamics random.\nSetting ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"ZqI0MYuv9V"},{"type":"inlineMath","value":"\\sigma = 0","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"html":"σ=0\\sigma = 0σ=0","key":"jQorYqEKEF"},{"type":"text","value":" gives us ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"bMDgdUE8mC"},{"type":"strong","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"iuU62UBeCf"}],"key":"osj9ybkdgS"},{"type":"text","value":" state transitions.\nWe will find that the optimal policy actually ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"vQOtKlehEb"},{"type":"emphasis","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"does not depend on the noise","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"lVy2UjaxIx"}],"key":"jQ25pc1dxi"},{"type":"text","value":", although the optimal value function and Q-function do.","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"HYS288HQI9"}],"key":"coVBrDffGq"},{"type":"paragraph","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"children":[{"type":"strong","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"children":[{"type":"text","value":"Upward-curved quadratic, time-homogeneous cost function","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"cIX8fzhg2F"}],"key":"ILlQjJk8GF"},{"type":"text","value":":","position":{"start":{"line":196,"column":1},"end":{"line":196,"column":1}},"key":"gmIpOtyYeB"}],"key":"ruY3atGYZP"},{"type":"math","value":"c(\\st_\\hi, \\act_\\hi) = \\begin{cases}\n \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi & \\hi < \\hor \\\\\n \\st_\\hi^\\top Q \\st_\\hi & \\hi = \\hor\n\\end{cases}.","position":{"start":{"line":198,"column":1},"end":{"line":203,"column":1}},"html":"c(xh,uh)={xhQxh+uhRuhh<HxhQxhh=H.c(\\st_\\hi, \\act_\\hi) = \\begin{cases}\n \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi & \\hi < \\hor \\\\\n \\st_\\hi^\\top Q \\st_\\hi & \\hi = \\hor\n\\end{cases}.c(xh,uh)={xhQxh+uhRuhxhQxhh<Hh=H.","enumerator":"2.3","key":"gtrYIWIGN1"},{"type":"paragraph","position":{"start":{"line":205,"column":1},"end":{"line":207,"column":1}},"children":[{"type":"text","value":"This cost function attempts to stabilize the state and action about ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"CcwRX0lYeG"},{"type":"inlineMath","value":"(s^\\star, a^\\star) = (0, 
0)","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"(s,a)=(0,0)(s^\\star, a^\\star) = (0, 0)(s,a)=(0,0)","key":"wRPIIpnYO5"},{"type":"text","value":".\nWe require ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"wa6NsBMfpc"},{"type":"inlineMath","value":"Q \\in \\R^{n_\\st \\times n_\\st}","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"QRnx×nxQ \\in \\R^{n_\\st \\times n_\\st}QRnx×nx","key":"hxcHZvmoj5"},{"type":"text","value":" and ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"MpkrJghqZ4"},{"type":"inlineMath","value":"R \\in \\R^{n_\\act \\times n_\\act}","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"RRnu×nuR \\in \\R^{n_\\act \\times n_\\act}RRnu×nu","key":"AxYm6Pkx59"},{"type":"text","value":" to both be ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"X87C2Ht2mJ"},{"type":"emphasis","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"children":[{"type":"text","value":"positive definite","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"AudQLJrnxa"}],"key":"rWNpyCLWXi"},{"type":"text","value":" matrices so that ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"gygJCxJESy"},{"type":"inlineMath","value":"c","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"ccc","key":"FvpmNLQyEf"},{"type":"text","value":" has a well-defined unique minimum.\nWe can furthermore assume without loss of generality that they are both ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"z1JTN3tTz7"},{"type":"emphasis","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"children":[{"type":"text","value":"symmetric","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"xUIJYMTA4n"}],"key":"Gd8Cw9rrNZ"},{"type":"text","value":" (see exercise below).","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"JmhfSmZGFA"}],"key":"GUTCYYsOOi"},{"type":"paragraph","position":{"start":{"line":209,"column":1},"end":{"line":209,"column":1}},"children":[{"type":"text","value":"This results in the LQR optimization problem:","position":{"start":{"line":209,"column":1},"end":{"line":209,"column":1}},"key":"p8LCUUelI0"}],"key":"Fel3jS9wu5"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I) \\\\\n & \\st_0 \\sim \\mu_0.\n\\end{aligned}","position":{"start":{"line":211,"column":1},"end":{"line":219,"column":1}},"html":"minπ0,,πH1:SAE[(h=0H1xhQxh+uhRuh)+xHQxH]wherexh+1=Axh+Buh+whuh=πh(xh)whN(0,σ2I)x0μ0.\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & 
w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I) \\\\\n & \\st_0 \\sim \\mu_0.\n\\end{aligned}π0,,πH1:SAminwhereE[(h=0H1xhQxh+uhRuh)+xHQxH]xh+1=Axh+Buh+whuh=πh(xh)whN(0,σ2I)x0μ0.","enumerator":"2.4","key":"ekpcUWGMwW"}],"enumerator":"2.2","html_id":"lqr-definition","key":"BvwlGrd8Q6"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Exercise","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"tD80TpZQG0"}],"key":"oERe42aPBU"},{"type":"paragraph","position":{"start":{"line":223,"column":1},"end":{"line":224,"column":1}},"children":[{"type":"text","value":"Here we’ll show that we don’t lose generality by assuming that ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"cu9j4qJiMC"},{"type":"inlineMath","value":"Q","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"QQQ","key":"Udi2zaSgcS"},{"type":"text","value":" and ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"X2Ru69wyWL"},{"type":"inlineMath","value":"R","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"RRR","key":"XHYNuFielg"},{"type":"text","value":" are symmetric.\nShow that replacing ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"D2g7snDwcq"},{"type":"inlineMath","value":"Q","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"QQQ","key":"trbw8rxZSV"},{"type":"text","value":" and ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"FpuXVBqV0J"},{"type":"inlineMath","value":"R","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"RRR","key":"tEhCQYxw7t"},{"type":"text","value":" with ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"kN2cmkKnkg"},{"type":"inlineMath","value":"(Q + Q^\\top) / 2","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"(Q+Q)/2(Q + Q^\\top) / 2(Q+Q)/2","key":"WrHQ82wByV"},{"type":"text","value":" and ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"F5fbhLiOfI"},{"type":"inlineMath","value":"(R + R^\\top) / 2","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"(R+R)/2(R + R^\\top) / 2(R+R)/2","key":"XQtQ06Xb2X"},{"type":"text","value":" (which are symmetric) yields the same cost function.","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"n55LllEjnX"}],"key":"VgDvdYYuOI"}],"key":"qIVnxsTiIO"},{"type":"paragraph","position":{"start":{"line":227,"column":1},"end":{"line":228,"column":1}},"children":[{"type":"text","value":"We will henceforth abbreviate “symmetric positive definite” as s.p.d.\nand “positive definite” as p.d.","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"hzKfTUAKmE"}],"key":"vVO8YJRqMW"},{"type":"paragraph","position":{"start":{"line":230,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"text","value":"It will be helpful to reintroduce the ","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"lDenJchkwX"},{"type":"emphasis","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"children":[{"type":"text","value":"value function","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"KSFhsOrUIz"}],"key":"QPB7NXwmyO"},{"type":"text","value":" notation for a policy to 
We will henceforth abbreviate "symmetric positive definite" as s.p.d.
and "positive definite" as p.d.

It will be helpful to reintroduce the *value function* notation for a policy to denote the average cost it incurs.
These will be instrumental in constructing the optimal policy via **dynamic programming,**
as we did in Section 1.3.2 for MDPs.

:::{prf:definition} Value functions for LQR
:label: value_lqr

Given a policy $\mathbf{\pi} = (\pi_0, \dots, \pi_{\hor-1})$,
we can define its value function $V^\pi_\hi : \mathcal{S} \to \mathbb{R}$ at time $\hi \in [\hor]$ as the average **cost-to-go** incurred by that policy:

$$
\begin{split}
    V^\pi_\hi (\st) &= \E \left[ \left( \sum_{i=\hi}^{\hor-1} c(\st_i, \act_i) \right) + c(\st_\hor) \mid \st_\hi = \st, \act_i = \pi_i(\st_i) \quad \forall \hi \le i < H \right] \\
    &= \E \left[ \left( \sum_{i=\hi}^{\hor-1} \st_i^\top Q \st_i + \act_i^\top R \act_i \right) + \st_\hor^\top Q \st_\hor \mid \st_\hi = \st, \act_i = \pi_i(\st_i) \quad \forall \hi \le i < H \right] \\
\end{split}
$$

The Q-function additionally conditions on the first action we take:

$$
\begin{split}
    Q^\pi_\hi (\st, \act) &= \E \bigg[ \left( \sum_{i=\hi}^{\hor-1} c(\st_i, \act_i) \right) + c(\st_\hor) \\
    &\qquad\qquad \mid (\st_\hi, \act_\hi) = (\st, \act), \act_i = \pi_i(\st_i) \quad \forall \hi < i < H \bigg] \\
    &= \E \bigg[ \left( \sum_{i=\hi}^{\hor-1} \st_i^\top Q \st_i + \act_i^\top R \act_i \right) + \st_\hor^\top Q \st_\hor \\
    &\qquad\qquad \mid (\st_\hi, \act_\hi) = (\st, \act), \act_i = \pi_i(\st_i) \quad \forall \hi < i < H \bigg] \\
\end{split}
$$
","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"tKnEfK8FUW"},{"type":"emphasis","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"reward,","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"O2ReYCrpzA"}],"key":"M2Dqp5tHoJ"},{"type":"text","value":"\nthe best policies are the ones with ","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"POwoIxcyzB"},{"type":"emphasis","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"smaller","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"tQ2F5edtcD"}],"key":"tShjgg0qa0"},{"type":"text","value":" values of the value function.","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"dT6FyEg3lQ"}],"key":"o8eszDuQRP"}],"enumerator":"2.3","html_id":"value-lqr","key":"niVrk2p0UK"},{"type":"heading","depth":2,"position":{"start":{"line":263,"column":1},"end":{"line":263,"column":1}},"children":[{"type":"text","value":"Optimality and the Riccati Equation","position":{"start":{"line":263,"column":1},"end":{"line":263,"column":1}},"key":"U3GHjDvjm1"}],"label":"optimal_lqr","identifier":"optimal_lqr","html_id":"optimal-lqr","enumerator":"2.4","key":"DL0tbUBs1B"},{"type":"paragraph","position":{"start":{"line":265,"column":1},"end":{"line":270,"column":1}},"children":[{"type":"text","value":"In this section,\nwe’ll compute the optimal value function ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"nnODUQZtRq"},{"type":"inlineMath","value":"V^\\star_h","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"html":"VhV^\\star_hVh","key":"bGsGBzRL9u"},{"type":"text","value":",\nQ-function ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"gUJZuOTTlQ"},{"type":"inlineMath","value":"Q^\\star_h","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"html":"QhQ^\\star_hQh","key":"RXodlv6Qmf"},{"type":"text","value":",\nand policy ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"IaexXKS2mb"},{"type":"inlineMath","value":"\\pi^\\star_h","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"html":"πh\\pi^\\star_hπh","key":"we7oebo6aw"},{"type":"text","value":" in ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"YyEcuuzqGU"},{"type":"crossReference","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"the linear quadratic regulator","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"ijv6IYZhMa"}],"identifier":"lqr_definition","label":"lqr_definition","kind":"proof:definition","template":"Definition %s","enumerator":"2.2","resolved":true,"html_id":"lqr-definition","key":"ATyiVjtNRU"},{"type":"text","value":" using ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"Hnb90rgYDT"},{"type":"strong","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"dynamic programming","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"d4Lt6DO8FT"}],"key":"chpfbbM0Fx"},{"type":"text","value":"\nin a very similar way to the DP algorithms 
","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"kQq1iDsfG3"},{"type":"crossReference","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"in the MDP setting","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"LnbdDtbxgw"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"aw99UZeqYQ"},{"type":"text","value":".\nRecall the definition of the optimal value function:","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"UDUSJ46czj"}],"key":"NKTWzNNnSK"},{"type":"proof","kind":"definition","label":"optimal_value_lqr","identifier":"optimal_value_lqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal value function in LQR","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"ihPasEM5hx"}],"key":"kUcClvzhXJ"},{"type":"paragraph","position":{"start":{"line":275,"column":1},"end":{"line":277,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"GGyhckHOCc"},{"type":"strong","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"optimal value function","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"gsrQdzFufg"}],"key":"mkg5PKAxmB"},{"type":"text","value":" is the one that,\nat any time and in any state,\nachieves ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"D2fNkC6yoZ"},{"type":"emphasis","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"minimum cost","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"w9SkzTNLiv"}],"key":"MwKXsd669B"},{"type":"text","value":" across ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"buRZiRJc5J"},{"type":"emphasis","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"all policies","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"UOvQgE4fry"}],"key":"r7akdn2N4h"},{"type":"text","value":":","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"EhdtNtPJjI"}],"key":"NRO6B1XpF2"},{"type":"math","value":"\\begin{split}\n V^\\star_\\hi(\\st) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} V^\\pi_\\hi(\\st) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}","position":{"start":{"line":279,"column":1},"end":{"line":285,"column":1}},"html":"Vh(x)=minπh,,πH1Vhπ(x)=minπh,,πH1E[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,ui=πi(xi)hi<H]\\begin{split}\n V^\\star_\\hi(\\st) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} V^\\pi_\\hi(\\st) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) 
\\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}Vh(x)=πh,,πH1minVhπ(x)=πh,,πH1minE[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,ui=πi(xi)hi<H]","enumerator":"2.7","key":"jkFrh0vE4y"},{"type":"paragraph","position":{"start":{"line":287,"column":1},"end":{"line":288,"column":1}},"children":[{"type":"text","value":"The optimal Q-function is defined similarly,\nconditioned on the starting action as well:","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"O61lC39dQG"}],"key":"qKD152d8cy"},{"type":"math","value":"\\begin{split}\n Q^\\star_\\hi(\\st, \\act) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} Q^\\pi_\\hi(\\st, \\act) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_\\hi = \\act, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi < i < H \\bigg] \\\\\n\\end{split}","position":{"start":{"line":290,"column":1},"end":{"line":296,"column":1}},"html":"Qh(x,u)=minπh,,πH1Qhπ(x,u)=minπh,,πH1E[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,uh=u,ui=πi(xi)h<i<H]\\begin{split}\n Q^\\star_\\hi(\\st, \\act) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} Q^\\pi_\\hi(\\st, \\act) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_\\hi = \\act, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi < i < H \\bigg] \\\\\n\\end{split}Qh(x,u)=πh,,πH1minQhπ(x,u)=πh,,πH1minE[(i=hH1xhQxh+uhRuh)+xHQxHxh=x,uh=u,ui=πi(xi)h<i<H]","enumerator":"2.8","key":"H6LvTkVVzW"},{"type":"paragraph","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"text","value":"Both of the definitions above assume ","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"OZo3rYOADI"},{"type":"emphasis","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"OpBecEBzo0"}],"key":"Nm5CpkaaP3"},{"type":"text","value":" policies. Otherwise we would have to take an ","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"lLqUC0ai3G"},{"type":"emphasis","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"text","value":"expectation","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"yNzQ9N5Nbs"}],"key":"ZNO7jh1fnI"},{"type":"text","value":" over actions drawn from the policy, i.e. 
","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"n5B40QDKrY"},{"type":"inlineMath","value":"\\act_\\hi \\sim \\pi_\\hi (\\st_\\hi)","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"html":"uhπh(xh)\\act_\\hi \\sim \\pi_\\hi (\\st_\\hi)uhπh(xh)","key":"U3vy7C4qCM"},{"type":"text","value":".","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"u0fDYE2Blu"}],"key":"ZTSb51jOFO"}],"enumerator":"2.4","html_id":"optimal-value-lqr","key":"BLr8sERXTc"},{"type":"paragraph","position":{"start":{"line":301,"column":1},"end":{"line":303,"column":1}},"children":[{"type":"text","value":"We will prove the striking fact that the solution has very simple structure:\n","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"vKiFLyyVdF"},{"type":"inlineMath","value":"V_h^\\star","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"VhV_h^\\starVh","key":"VXlT2Qsq26"},{"type":"text","value":" and ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"OOS1Vwuo8K"},{"type":"inlineMath","value":"Q^\\star_h","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"QhQ^\\star_hQh","key":"zlYpjrWfRM"},{"type":"text","value":" are ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"VeoyPE1b2Q"},{"type":"emphasis","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"upward-curved quadratics","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"zPL7RXUlG2"}],"key":"gnhvCXZwyY"},{"type":"text","value":"\nand ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"nPvjiwtZnN"},{"type":"inlineMath","value":"\\pi_h^\\star","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"πh\\pi_h^\\starπh","key":"brff7JnXzR"},{"type":"text","value":" is ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"XRv3d2kBty"},{"type":"emphasis","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"iydD0tQjT3"}],"key":"H1KrcxconF"},{"type":"text","value":" and furthermore does not depend on the noise!","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"BT8CD9Nb6d"}],"key":"nQ5a9WWHXw"},{"type":"proof","kind":"theorem","label":"optimal_value_lqr_quadratic","identifier":"optimal_value_lqr_quadratic","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal value function in LQR is an upward-curved quadratic","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"R6yzgTMQJR"}],"key":"ZyzaOVqzaH"},{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"At each timestep ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"mLxOEqvL2w"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"CEMLKgJqSF"},{"type":"text","value":",","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"mfPmYvhopr"}],"key":"uyA3Oaao3P"},{"type":"math","value":"V^\\star_\\hi(\\st) = 
\\st^\\top P_\\hi \\st + p_\\hi","position":{"start":{"line":310,"column":1},"end":{"line":312,"column":1}},"html":"Vh(x)=xPhx+phV^\\star_\\hi(\\st) = \\st^\\top P_\\hi \\st + p_\\hiVh(x)=xPhx+ph","enumerator":"2.9","key":"ZuxIFBPbHu"},{"type":"paragraph","position":{"start":{"line":314,"column":1},"end":{"line":315,"column":1}},"children":[{"type":"text","value":"for some s.p.d. matrix ","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"BQ2YWO1e2S"},{"type":"inlineMath","value":"P_\\hi \\in \\mathbb{R}^{n_\\st \\times n_\\st}","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"html":"PhRnx×nxP_\\hi \\in \\mathbb{R}^{n_\\st \\times n_\\st}PhRnx×nx","key":"BXfwjO2UHs"},{"type":"text","value":" and scalar\n","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"zxp82HXt3h"},{"type":"inlineMath","value":"p_\\hi \\in \\mathbb{R}","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"html":"phRp_\\hi \\in \\mathbb{R}phR","key":"mAwNHdGVfg"},{"type":"text","value":".","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"J6HbiJBKcO"}],"key":"Nm1GKmBrNm"}],"enumerator":"2.1","html_id":"optimal-value-lqr-quadratic","key":"N94T4NzQbH"},{"type":"proof","kind":"theorem","label":"optimal_policy_lqr_linear","identifier":"optimal_policy_lqr_linear","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal policy in LQR is linear","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"e3jpxuJo0R"}],"key":"lfatD38Upn"},{"type":"paragraph","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"children":[{"type":"text","value":"At each timestep ","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"key":"VA0YmXXZGi"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"eJuWH5Witv"},{"type":"text","value":",","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"key":"sjKnBFIUxx"}],"key":"Ztu0i1337Q"},{"type":"math","value":"\\pi^\\star_\\hi (\\st) = - K_\\hi \\st","position":{"start":{"line":323,"column":1},"end":{"line":325,"column":1}},"html":"πh(x)=Khx\\pi^\\star_\\hi (\\st) = - K_\\hi \\stπh(x)=Khx","enumerator":"2.10","key":"PW0taE0NSG"},{"type":"paragraph","position":{"start":{"line":327,"column":1},"end":{"line":328,"column":1}},"children":[{"type":"text","value":"for some ","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"epnf8sM87M"},{"type":"inlineMath","value":"K_\\hi \\in \\mathbb{R}^{n_\\act \\times n_\\st}","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"html":"KhRnu×nxK_\\hi \\in \\mathbb{R}^{n_\\act \\times n_\\st}KhRnu×nx","key":"ue4ldVe4oX"},{"type":"text","value":".\n(The negative is due to convention.)","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"GSvUyfEOMn"}],"key":"CKoPPPSXpI"}],"enumerator":"2.2","html_id":"optimal-policy-lqr-linear","key":"QXMYKpUmUF"},{"type":"paragraph","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"The construction (and inductive proof) proceeds similarly to the one 
","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"bZzcS6CTtg"},{"type":"crossReference","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"in the MDP setting","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"QHY7NxFukc"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"CRuGiqZ9Za"},{"type":"text","value":".","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"SqPiY72pnK"}],"key":"cNYXSWJi2P"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":333,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"We’ll compute ","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"S9iZoLYJlt"},{"type":"inlineMath","value":"V_\\hor^\\star","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"VHV_\\hor^\\starVH","key":"jkW6fFH5Qm"},{"type":"text","value":" (at the end of the horizon) as our base case.","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"AXNtz3DsJd"}],"key":"lYQVynNCw5"},{"type":"listItem","spread":true,"position":{"start":{"line":334,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"Then we’ll work step-by-step backwards in time, using ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"NGUY8eGlRs"},{"type":"inlineMath","value":"V_{\\hi+1}^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"Vh+1V_{\\hi+1}^\\starVh+1","key":"Z0Z9Yh5uis"},{"type":"text","value":" to compute ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"iZqeDIEL51"},{"type":"inlineMath","value":"Q_\\hi^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"QhQ_\\hi^\\starQh","key":"jtFfuwMyMc"},{"type":"text","value":", ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"XVzP4G9PIs"},{"type":"inlineMath","value":"\\pi_{\\hi}^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"πh\\pi_{\\hi}^\\starπh","key":"GrBj5Ifa37"},{"type":"text","value":", and ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"isgNvx47at"},{"type":"inlineMath","value":"V_\\hi^\\star","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"VhV_\\hi^\\starVh","key":"Q3zxcrRYHY"},{"type":"text","value":".","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"MY72zBwbWv"}],"key":"DFuKskN40l"}],"key":"VOxMSHNwl0"},{"type":"comment","value":" TODO insert reference for proof by induction ","key":"VMctwXUe3T"},{"type":"paragraph","position":{"start":{"line":338,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"strong","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"children":[{"type":"text","value":"Base case:","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"IxjPiBMAzv"}],"key":"LAzerDWwEZ"},{"type":"text","value":"\nAt the final timestep,\nthere are no possible actions to take,\nand so 
","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"t4ulWZv4Ta"},{"type":"inlineMath","value":"V^\\star_\\hor(\\st) = c(\\st) = \\st^\\top Q \\st","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"VH(x)=c(x)=xQxV^\\star_\\hor(\\st) = c(\\st) = \\st^\\top Q \\stVH(x)=c(x)=xQx","key":"RStYDuEz4j"},{"type":"text","value":".\nThus ","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"ZToSGLuUwn"},{"type":"inlineMath","value":"V_\\hor^\\star(\\st) = \\st^\\top P_\\hor \\st + p_\\hor","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"VH(x)=xPHx+pHV_\\hor^\\star(\\st) = \\st^\\top P_\\hor \\st + p_\\horVH(x)=xPHx+pH","key":"sBr8Hf7xvK"},{"type":"text","value":"\nwhere ","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"mm3YSYfDB6"},{"type":"inlineMath","value":"P_\\hor = Q","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"PH=QP_\\hor = QPH=Q","key":"Cnm1AK2hy5"},{"type":"text","value":" and ","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"RDd13Kb3Kk"},{"type":"inlineMath","value":"p_\\hor = 0","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"html":"pH=0p_\\hor = 0pH=0","key":"VIMgcqSdAb"},{"type":"text","value":".","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"key":"krhWPeYYVb"}],"key":"vwbIehDqrh"},{"type":"paragraph","position":{"start":{"line":345,"column":1},"end":{"line":350,"column":1}},"children":[{"type":"strong","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"children":[{"type":"text","value":"Inductive hypothesis:","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"Z43vbHWBga"}],"key":"IQoS6tuAxz"},{"type":"text","value":"\nWe seek to show that the inductive step holds for both theorems:\nIf ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"bu9qmb5kvk"},{"type":"inlineMath","value":"V^\\star_{\\hi+1}(\\st)","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"Vh+1(x)V^\\star_{\\hi+1}(\\st)Vh+1(x)","key":"tGrNuRKqrF"},{"type":"text","value":" is an upward-curved quadratic,\nthen ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"dEpNj95wpn"},{"type":"inlineMath","value":"V^\\star_\\hi(\\st)","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"Vh(x)V^\\star_\\hi(\\st)Vh(x)","key":"KsfW8otBv3"},{"type":"text","value":" must also be an upward-curved quadratic,\nand ","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"z30wp8iGYl"},{"type":"inlineMath","value":"\\pi^\\star_\\hi(\\st)","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"html":"πh(x)\\pi^\\star_\\hi(\\st)πh(x)","key":"gQd8pKqEpP"},{"type":"text","value":" must be linear.\nWe’ll break this down into the following steps:","position":{"start":{"line":345,"column":1},"end":{"line":345,"column":1}},"key":"GeTgMuLFdn"}],"key":"bZoHhpBboR"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":352,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":352,"column":1},"end":{"line":353,"column":1}},"children":[{"type":"text","value":"Show that 
","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"mgQryzhbd4"},{"type":"inlineMath","value":"Q^\\star_\\hi(\\st, \\act)","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"Qh(x,u)Q^\\star_\\hi(\\st, \\act)Qh(x,u)","key":"YLSfQAWBiG"},{"type":"text","value":" is an upward-curved quadratic (in both\n","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"kSeUwf0iJp"},{"type":"inlineMath","value":"\\st","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"x\\stx","key":"XPfXKJ9mSG"},{"type":"text","value":" and ","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"JzJU89PnUO"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"html":"u\\actu","key":"VtPvYhVgnR"},{"type":"text","value":").","position":{"start":{"line":352,"column":1},"end":{"line":352,"column":1}},"key":"lhKjj88uBp"}],"key":"nKuofIzVb1"},{"type":"listItem","spread":true,"position":{"start":{"line":354,"column":1},"end":{"line":356,"column":1}},"children":[{"type":"text","value":"Derive the optimal policy\n","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"key":"TrSuXZe6YA"},{"type":"inlineMath","value":"\\pi^\\star_\\hi(\\st) = \\arg \\min_\\act Q^\\star_\\hi(\\st, \\act)","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"html":"πh(x)=argminuQh(x,u)\\pi^\\star_\\hi(\\st) = \\arg \\min_\\act Q^\\star_\\hi(\\st, \\act)πh(x)=argminuQh(x,u)","key":"lAuNtCbxKv"},{"type":"text","value":" and show\nthat it’s linear.","position":{"start":{"line":354,"column":1},"end":{"line":354,"column":1}},"key":"lnTM6icbAt"}],"key":"tw5bjwUs20"},{"type":"listItem","spread":true,"position":{"start":{"line":357,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"text","value":"Show that ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"QzVmuPfKqI"},{"type":"inlineMath","value":"V^\\star_\\hi(\\st)","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"html":"Vh(x)V^\\star_\\hi(\\st)Vh(x)","key":"ZtiSrGQhLY"},{"type":"text","value":" is an upward-curved quadratic.","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"rOqQpnFSVg"}],"key":"elPg65paVW"}],"key":"bI5fVDs1bG"},{"type":"paragraph","position":{"start":{"line":359,"column":1},"end":{"line":360,"column":1}},"children":[{"type":"text","value":"We first assume the inductive hypothesis that our theorems are true at\ntime ","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"key":"SLrukVcqAM"},{"type":"inlineMath","value":"\\hi+1","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"html":"h+1\\hi+1h+1","key":"Czc27SQFVH"},{"type":"text","value":". 
We first assume the inductive hypothesis that our theorems are true at
time $\hi+1$. That is,

$$
V^\star_{\hi+1}(\st) = \st^\top P_{\hi+1} \st + p_{\hi+1} \quad \forall \st \in \mathcal{S}.
$$

::::{prf:lemma} $Q^\star_\hi(\st, \act)$ is an upward-curved quadratic

Let us decompose $Q^\star_\hi : \mathcal{S} \times \mathcal{A} \to \mathbb{R}$
into the immediate cost plus the expected cost-to-go:

$$
Q^\star_\hi(\st, \act) = c(\st, \act) + \E_{\st' \sim f(\st, \act, w_{\hi+1})} [V^\star_{\hi+1}(\st')].
$$

Recall $c(\st, \act) := \st^\top Q \st + \act^\top R \act$.
Let's consider the expectation over the next timestep.
The only randomness in the dynamics comes from the noise
$w_{\hi+1} \sim \mathcal{N}(0, \sigma^2 I)$,
so we can expand the expectation as:

$$
\begin{aligned}
    & \E_{\st'} [V^\star_{\hi+1}(\st')] \\
    {} = {} & \E_{w_{\hi+1}} [V^\star_{\hi+1}(A \st + B \act + w_{\hi+1})] & & \text{definition of } f \\
    {} = {} & \E_{w_{\hi+1}} [ (A \st + B \act + w_{\hi+1})^\top P_{\hi+1} (A \st + B \act + w_{\hi+1}) + p_{\hi+1} ]. & & \text{inductive hypothesis}
\end{aligned}
$$

Summing and combining like terms, we get

$$
\begin{aligned}
    Q^\star_\hi(\st, \act) & = \st^\top Q \st + \act^\top R \act + \E_{w_{\hi+1}} [(A \st + B \act + w_{\hi+1})^\top P_{\hi+1} (A \st + B \act + w_{\hi+1}) + p_{\hi+1}] \\
    & = \st^\top (Q + A^\top P_{\hi+1} A)\st + \act^\top (R + B^\top P_{\hi+1} B) \act + 2 \st^\top A^\top P_{\hi+1} B \act \\
    & \qquad + \E_{w_{\hi+1}} [w_{\hi+1}^\top P_{\hi+1} w_{\hi+1}] + p_{\hi+1}.
\end{aligned}
$$
Note that the terms that are linear in $w_{\hi+1}$ have mean
zero and vanish. Now consider the remaining expectation over the noise.
By expanding out the product and using linearity of expectation, we can
write this out as

$$
\begin{aligned}
    \E_{w_{\hi+1}} [w_{\hi+1}^\top P_{\hi+1} w_{\hi+1}] & = \sum_{i=1}^d \sum_{j=1}^d (P_{\hi+1})_{ij} \E_{w_{\hi+1}} [(w_{\hi+1})_i (w_{\hi+1})_j] \\
    & = \sigma^2 \mathrm{Tr}(P_{\hi + 1})
\end{aligned}
$$

:::{note} Quadratic forms

When solving *quadratic forms*, i.e. expressions of the form $x^\top A x$,
it's often helpful to consider the terms on the diagonal ($i = j$) separately from those off the diagonal.

In this case, the expectation of each diagonal term becomes

$$
(P_{\hi+1})_{ii} \E (w_{\hi+1})_i^2 = \sigma^2 (P_{\hi+1})_{ii}.
$$

Off the diagonal, since the elements of $w_{\hi+1}$ are independent, the
expectation factors, and since each element has mean zero, the term
vanishes:

$$
(P_{\hi+1})_{ij} \E [(w_{\hi+1})_i] \E [(w_{\hi+1})_j] = 0.
$$

Thus,
the only terms left are the ones on the diagonal,
so the sum of these can be expressed as the trace of $\sigma^2 P_{\hi+1}$:

$$
\E_{w_{\hi+1}} [w_{\hi+1}^\top P_{\hi+1} w_{\hi+1}] = \sigma^2 \mathrm{Tr}(P_{\hi+1}).
$$
:::
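This identity is easy to check numerically. A quick Monte Carlo sanity check, where the dimension, $\sigma$, and the random s.p.d. matrix are illustrative assumptions:

```python
import numpy as np

rng = np.random.default_rng(3)
d, sigma = 4, 0.5
M = rng.normal(size=(d, d))
P = M @ M.T + np.eye(d)                       # an arbitrary s.p.d. matrix

w = sigma * rng.normal(size=(200_000, d))     # samples of w ~ N(0, sigma^2 I)
mc = np.mean(np.einsum("ni,ij,nj->n", w, P, w))
print(mc, sigma**2 * np.trace(P))             # the two values should be close
```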
\\mathrm{Tr}(P_{\\hi+1}).Ewh+1[wh+1Ph+1wh+1]=σ2Tr(Ph+1).","enumerator":"2.18","key":"eGTRGYk2Lh"}],"key":"JaJC8rclCj"},{"type":"paragraph","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"children":[{"type":"text","value":"Substituting this back into the expression for ","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"key":"OfJHX7Q0Vn"},{"type":"inlineMath","value":"Q^\\star_\\hi","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"html":"QhQ^\\star_\\hiQh","key":"Mlr8zYfakU"},{"type":"text","value":", we have:","position":{"start":{"line":438,"column":1},"end":{"line":438,"column":1}},"key":"o8Ar7AcUB9"}],"key":"arMnSLow8a"},{"type":"math","value":"\\begin{aligned}\n Q^\\star_\\hi(\\st, \\act) & = \\st^\\top (Q + A^\\top P_{\\hi+1} A) \\st + \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act\n + 2\\st^\\top A^\\top P_{\\hi+1} B \\act \\\\\n & \\qquad + \\sigma^2 \\mathrm{Tr}(P_{\\hi+1}) + p_{\\hi+1}.\n\\end{aligned}","position":{"start":{"line":440,"column":1},"end":{"line":446,"column":1}},"html":"Qh(x,u)=x(Q+APh+1A)x+u(R+BPh+1B)u+2xAPh+1Bu+σ2Tr(Ph+1)+ph+1.\\begin{aligned}\n Q^\\star_\\hi(\\st, \\act) & = \\st^\\top (Q + A^\\top P_{\\hi+1} A) \\st + \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act\n + 2\\st^\\top A^\\top P_{\\hi+1} B \\act \\\\\n & \\qquad + \\sigma^2 \\mathrm{Tr}(P_{\\hi+1}) + p_{\\hi+1}.\n\\end{aligned}Qh(x,u)=x(Q+APh+1A)x+u(R+BPh+1B)u+2xAPh+1Bu+σ2Tr(Ph+1)+ph+1.","enumerator":"2.19","key":"DkytWqOEZ5"},{"type":"paragraph","position":{"start":{"line":448,"column":1},"end":{"line":453,"column":1}},"children":[{"type":"text","value":"As we hoped, this expression is quadratic in ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"ugYXBQ70At"},{"type":"inlineMath","value":"\\st","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"x\\stx","key":"P1NLViE0XY"},{"type":"text","value":" and ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"nO1AKOFHBX"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"u\\actu","key":"MTmJ8cyt6A"},{"type":"text","value":".\nFurthermore,\nwe’d like to show that it also ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"UEGBytZSDP"},{"type":"emphasis","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"children":[{"type":"text","value":"curves upwards","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"dqfx0XrnTh"}],"key":"VLzGLdaojZ"},{"type":"text","value":"\nwith respect to ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"pHEUg9STZ9"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"u\\actu","key":"pPrLWhCoHw"},{"type":"text","value":"\nso that its minimum with respect to ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"Ly0JxUagks"},{"type":"inlineMath","value":"\\act","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"html":"u\\actu","key":"zjOAQDSBbj"},{"type":"text","value":" is well-defined.\nWe can do this by noting that the ","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"key":"qtOiMA8zOA"},{"type":"strong","position":{"start":{"line":448,"column":1},"end":{"line":448,"column":1}},"children":[{"type":"text","value":"Hessian 
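As a quick numerical sanity check of this identity (illustrative only; none of these variable names come from the text), one can estimate $\E[w^\top P w]$ by Monte Carlo and compare it against $\sigma^2 \mathrm{Tr}(P)$:

```python
import numpy as np

rng = np.random.default_rng(0)
d, sigma = 4, 0.3
M = rng.standard_normal((d, d))
P = M @ M.T + np.eye(d)                        # an arbitrary s.p.d. matrix
w = sigma * rng.standard_normal((100_000, d))  # samples of w ~ N(0, sigma^2 I)
empirical = np.mean(np.einsum("ni,ij,nj->n", w, P, w))
print(empirical, sigma**2 * np.trace(P))       # the two numbers should be close
```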
Substituting this back into the expression for $Q^\star_\hi$, we have:

$$
\begin{aligned}
    Q^\star_\hi(\st, \act) & = \st^\top (Q + A^\top P_{\hi+1} A) \st + \act^\top (R + B^\top P_{\hi+1} B) \act
    + 2\st^\top A^\top P_{\hi+1} B \act \\
    & \qquad + \sigma^2 \mathrm{Tr}(P_{\hi+1}) + p_{\hi+1}.
\end{aligned}
$$

As we hoped, this expression is quadratic in $\st$ and $\act$. Furthermore, we'd like to show that it also *curves upwards* with respect to $\act$, so that its minimum with respect to $\act$ is well-defined. We can do this by noting that the **Hessian matrix** of second derivatives is positive definite:

$$
\nabla_{\act \act} Q_\hi^\star(\st, \act) = R + B^\top P_{\hi+1} B
$$

Since $R$ is s.p.d. (by the LQR definition) and $P_{\hi+1}$ is s.p.d. (by the inductive hypothesis), the sum $R + B^\top P_{\hi+1} B$ must also be s.p.d., and so $Q^\star_\hi$ is indeed an upward-curved quadratic with respect to $\act$. (If this isn't clear, try proving it as an exercise.) The proof of its upward curvature with respect to $\st$ is analogous.
:::{prf:lemma} $\pi^\star_\hi$ is linear
:label: lemma_pi_linear

Since $Q^\star_\hi$ is an upward-curved quadratic, finding its minimum over $\act$ is easy: we simply set the gradient with respect to $\act$ equal to zero and solve for $\act$. First, we calculate the gradient:

$$
\begin{aligned}
    \nabla_\act Q^\star_\hi(\st, \act) & = \nabla_\act [ \act^\top (R + B^\top P_{\hi+1} B) \act + 2 \st^\top A^\top P_{\hi+1} B \act ] \\
    & = 2 (R + B^\top P_{\hi+1} B) \act + 2 (\st^\top A^\top P_{\hi+1} B)^\top
\end{aligned}
$$

Setting this to zero, we get

$$
\begin{aligned}
    0 & = (R + B^\top P_{\hi+1} B) \pi^\star_\hi(\st) + B^\top P_{\hi+1} A \st \\
    \pi^\star_\hi(\st) & = (R + B^\top P_{\hi+1} B)^{-1} (-B^\top P_{\hi+1} A \st) \\
    & = - K_\hi \st,
\end{aligned}
$$

where

$$
K_\hi = (R + B^\top P_{\hi+1} B)^{-1} B^\top P_{\hi+1} A.
$$ (k_pi)

Note that this optimal policy doesn't depend on the starting distribution $\mu_0$. It's also fully **deterministic** and isn't affected by the noise terms $w_0, \dots, w_{\hor-1}$.
:::

::::{prf:lemma} $V^\star_\hi(\st)$ is an upward-curved quadratic

Using the identity $V^\star_\hi(\st) = Q^\star_\hi(\st, \pi^\star(\st))$, we have:

$$
\begin{aligned}
    V^\star_\hi(\st) & = Q^\star_\hi(\st, \pi^\star(\st)) \\
    & = \st^\top (Q + A^\top P_{\hi+1} A) \st + (-K_\hi \st)^\top (R + B^\top P_{\hi+1} B) (-K_\hi \st)
    + 2\st^\top A^\top P_{\hi+1} B (-K_\hi \st) \\
    & \qquad + \mathrm{Tr}(\sigma^2 P_{\hi+1}) + p_{\hi+1}
\end{aligned}
$$

Note that with respect to $\st$, this is the sum of a quadratic term and a constant, which is exactly what we were aiming for! The scalar term is clearly

$$
p_\hi = \mathrm{Tr}(\sigma^2 P_{\hi+1}) + p_{\hi+1}.
$$

We can simplify the quadratic term by substituting in $K_\hi$ from (2.23). Notice that when we do this, the $(R+B^\top P_{\hi+1} B)$ term in the expression is cancelled out by its inverse: the middle and last terms above contribute

$$
\st^\top K_\hi^\top (R + B^\top P_{\hi+1} B) K_\hi \st - 2 \st^\top A^\top P_{\hi+1} B K_\hi \st
= - \st^\top A^\top P_{\hi+1} B (R + B^\top P_{\hi+1} B)^{-1} B^\top P_{\hi+1} A \st,
$$

and the remaining terms combine to give the **Riccati equation**:

:::{prf:definition} Riccati equation
:label: riccati

$$
P_\hi = Q + A^\top P_{\hi+1} A - A^\top P_{\hi+1} B (R + B^\top P_{\hi+1} B)^{-1} B^\top P_{\hi+1} A.
$$
:::

There are several nice properties to note about the Riccati equation:

1. It's defined **recursively.** Given the dynamics defined by $A$ and $B$, and the state cost matrix $Q$, we can recursively calculate $P_\hi$ across all timesteps starting from $P_\hor = Q$.
2. $P_\hi$ often appears in calculations surrounding optimality, such as $V^\star_\hi, Q^\star_\hi$, and $\pi^\star_\hi$.
3. Together with the dynamics given by $A$ and $B$, and the action coefficients $R$ in the cost function, it fully defines the optimal policy (Lemma 2.2).

It remains to prove that $V^\star_\hi$ *curves upwards*, that is, that $P_\hi$ is s.p.d. We will use the following fact about **Schur complements:**

:::{prf:lemma} Positive definiteness of Schur complements
:label: lemma_schur

Let

$$
D = \begin{pmatrix}
A & B \\
B^\top & C
\end{pmatrix}
$$

be a symmetric $(m+n) \times (m+n)$ block matrix, where $A \in \R^{m \times m}, B \in \R^{m \times n}, C \in \R^{n \times n}$. The **Schur complement** of $A$ is denoted

$$
D/A = C - B^\top A^{-1} B.
$$

Schur complements have various uses in linear algebra and numerical computation.

A useful fact for us is that if $A$ is positive *definite*, then $D$ is positive *semidefinite* if and only if $D/A$ is positive *semidefinite*.
:::

Let $P$ denote $P_{\hi + 1}$ for brevity. We already know $Q$ is p.d., so it suffices to show that

$$
S = P - P B (R + B^\top P B)^{-1} B^\top P
$$

is p.s.d. (positive semidefinite), since left- and right-multiplying by $A^\top$ and $A$ respectively preserves positive semidefiniteness. We note that $S$ is the Schur complement $D/(R + B^\top P B)$, where

$$
D = \begin{pmatrix}
R + B^\top P B & B^\top P \\
P B & P
\end{pmatrix}.
$$

Thus we must show that $D$ is p.s.d. This can be seen by computing, for any vectors $y$ and $z$,

$$
\begin{aligned}
\begin{pmatrix}
y^\top & z^\top
\end{pmatrix}
D
\begin{pmatrix}
y \\ z
\end{pmatrix}
&= y^\top R y + y^\top B^\top P B y + 2 y^\top B^\top P z + z^\top P z \\
&= y^\top R y + (By + z)^\top P (By + z) \\
&\ge 0,
\end{aligned}
$$

since $R$ is p.d. and $P$ is s.p.d., so both terms are nonnegative.

Since $R + B^\top P B$ is p.d. and $D$ is p.s.d., $S = D / (R + B^\top P B)$ must be p.s.d., and $P_\hi = Q + A^\top S A$ must be p.d.
::::

Now we've shown that $V^\star_\hi(\st) = \st^\top P_\hi \st + p_\hi$, where $P_\hi$ is s.p.d., proving the inductive hypothesis and completing the proof of Theorem 2.2 and Theorem 2.1.
In summary, we just demonstrated that at each timestep $\hi \in [\hor]$, the optimal value function $V^\star_\hi$ and optimal Q-function $Q^\star_\hi$ are both upward-curved quadratics and the optimal policy $\pi^\star_\hi$ is linear. We also showed that all of these quantities can be calculated using a sequence of s.p.d. matrices $P_0, \dots, P_H$ that can be defined recursively using the Riccati equation (Definition 2.5).
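To make the recursion concrete, here is a hypothetical NumPy sketch of the backward pass implied by the Riccati equation and Lemma 2.2. All names here are illustrative and not from the text; `sigma` is the noise scale from the problem setup, and the gain is computed with a linear solve rather than an explicit matrix inverse.

```python
import numpy as np

def lqr_backward_pass(A, B, Q, R, H, sigma=0.0):
    """Compute P_0, ..., P_H, gains K_0, ..., K_{H-1}, and scalars p_0, ..., p_H."""
    P = [None] * (H + 1)
    K = [None] * H
    p = [0.0] * (H + 1)
    P[H] = Q                                                        # terminal cost-to-go matrix
    for h in reversed(range(H)):
        G = R + B.T @ P[h + 1] @ B
        K[h] = np.linalg.solve(G, B.T @ P[h + 1] @ A)               # gain in pi*_h(x) = -K_h x
        P[h] = Q + A.T @ P[h + 1] @ A - A.T @ P[h + 1] @ B @ K[h]   # Riccati equation
        p[h] = sigma**2 * np.trace(P[h + 1]) + p[h + 1]             # scalar term p_h
    return P, K, p
```

With these quantities, the optimal value at an initial state $\st_0$ is $V^\star_0(\st_0) = \st_0^\top P_0 \st_0 + p_0$.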
Before we move on to some extensions of LQR, let's consider how the state at time $\hi$ behaves when we act according to this optimal policy.

### Expected state at time $\hi$

How can we compute the expected state at time $\hi$ when acting according to the optimal policy? Let's first express $\st_\hi$ in a cleaner way in terms of the history. Note that having linear dynamics makes it easy to expand terms backwards in time:

$$
\begin{aligned}
    \st_\hi & = A \st_{\hi-1} + B \act_{\hi-1} + w_{\hi-1} \\
    & = A (A\st_{\hi-2} + B \act_{\hi-2} + w_{\hi-2}) + B \act_{\hi-1} + w_{\hi-1} \\
    & = \cdots \\
    & = A^\hi \st_0 + \sum_{i=0}^{\hi-1} A^i (B \act_{\hi-i-1} + w_{\hi-i-1}).
\end{aligned}
$$

Let's consider the *average state* at this time, given all the past states and actions. Since we assume that $\E [w_\hi] = 0$ (this is the zero vector in $d$ dimensions), when we take an expectation, the $w_\hi$ term vanishes due to linearity, and so we're left with

$$
\E [\st_\hi \mid \st_{0:(\hi-1)}, \act_{0:(\hi-1)}] = A^\hi \st_0 + \sum_{i=0}^{\hi-1} A^i B \act_{\hi-i-1}.
$$ (expected_state)

:::{attention} Exercise
Show that if we choose actions according to the optimal policy (Lemma 2.2), (2.33) becomes

$$
\E [\st_\hi \mid \st_0, \act_i = \pi^\star_i(\st_i)\quad \forall i \le \hi] = \left( \prod_{i=0}^{\hi-1} (A - B K_i) \right) \st_0.
$$
:::

This introduces the quantity $A - B K_i$, which shows up frequently in control theory. For example, one important question is: will $\st_\hi$ remain bounded, or will it go to infinity as time goes on? To answer this, let's imagine for simplicity that these $K_i$s are equal (call this matrix $K$). Then the expression above becomes $(A-BK)^\hi \st_0$. Now consider the maximum eigenvalue $\lambda_{\max}$ of $A - BK$. If $|\lambda_{\max}| > 1$, then there's some nonzero initial state $\bar \st_0$, the corresponding eigenvector, for which

$$
\lim_{\hi \to \infty} (A - BK)^\hi \bar \st_0
 = \lim_{\hi \to \infty} \lambda_{\max}^\hi \bar \st_0
 = \infty.
$$

Otherwise, if $|\lambda_{\max}| < 1$, then the state cannot blow up in this way.
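As an illustrative check (not part of the original text), one can compute the spectral radius of $A - BK$ numerically; a value above 1 signals the blow-up described above, while a value below 1 suggests the closed-loop state stays bounded:

```python
import numpy as np

def closed_loop_spectral_radius(A, B, K):
    """Largest absolute eigenvalue of the closed-loop matrix A - B K."""
    return np.max(np.abs(np.linalg.eigvals(A - B @ K)))
```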
dramatically.","position":{"start":{"line":688,"column":1},"end":{"line":688,"column":1}},"key":"YQDwGoDVQ8"}],"key":"d00xgRvujf"},{"type":"heading","depth":2,"position":{"start":{"line":690,"column":1},"end":{"line":690,"column":1}},"children":[{"type":"text","value":"Extensions","position":{"start":{"line":690,"column":1},"end":{"line":690,"column":1}},"key":"Lbn5TFQiKo"}],"identifier":"extensions","label":"Extensions","html_id":"extensions","implicit":true,"enumerator":"2.5","key":"ifUjkvX0tD"},{"type":"paragraph","position":{"start":{"line":692,"column":1},"end":{"line":697,"column":1}},"children":[{"type":"text","value":"We’ve now formulated an optimal solution for the time-homogeneous LQR\nand computed the expected state under the optimal policy. However, real\nworld tasks rarely have such simple dynamics, and we may wish to design\nmore complex cost functions. In this section, we’ll consider more\ngeneral extensions of LQR where some of the assumptions we made above\nare relaxed. Specifically, we’ll consider:","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"key":"VAq77SkhSF"}],"key":"g7t73dpYRW"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":699,"column":1},"end":{"line":707,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":699,"column":1},"end":{"line":701,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":699,"column":1},"end":{"line":700,"column":1}},"children":[{"type":"strong","position":{"start":{"line":699,"column":1},"end":{"line":699,"column":1}},"children":[{"type":"text","value":"Time-dependency","position":{"start":{"line":699,"column":1},"end":{"line":699,"column":1}},"key":"dQIohPk0mO"}],"key":"SAyCCBCag3"},{"type":"text","value":", where the dynamics and cost function might\nchange depending on the timestep.","position":{"start":{"line":699,"column":1},"end":{"line":699,"column":1}},"key":"PUKZqYWL4Z"}],"key":"DAINHJgbKM"}],"key":"Rs2nyFo1YI"},{"type":"listItem","spread":true,"position":{"start":{"line":702,"column":1},"end":{"line":704,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":702,"column":1},"end":{"line":703,"column":1}},"children":[{"type":"strong","position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"children":[{"type":"text","value":"General quadratic cost","position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"key":"biKxItJad6"}],"key":"UPoF0whF0o"},{"type":"text","value":", where we allow for linear terms and a\nconstant term.","position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"key":"huWbKxnEvn"}],"key":"iOkfNW4DxP"}],"key":"p8s7avAONW"},{"type":"listItem","spread":true,"position":{"start":{"line":705,"column":1},"end":{"line":707,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":705,"column":1},"end":{"line":706,"column":1}},"children":[{"type":"strong","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"children":[{"type":"text","value":"Tracking a goal trajectory","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"CneSKxjRGu"}],"key":"eGoKuwFhNE"},{"type":"text","value":" rather than aiming for a single goal\nstate-action 
pair.","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"HRQI5AAXpC"}],"key":"z2yDEio8Ln"}],"key":"PMHChXecXN"}],"key":"peq20OpSel"},{"type":"paragraph","position":{"start":{"line":708,"column":1},"end":{"line":710,"column":1}},"children":[{"type":"text","value":"Combining these will allow us to use the LQR solution to solve more\ncomplex setups by taking ","position":{"start":{"line":708,"column":1},"end":{"line":708,"column":1}},"key":"OkVt5ElbK2"},{"type":"emphasis","position":{"start":{"line":708,"column":1},"end":{"line":708,"column":1}},"children":[{"type":"text","value":"Taylor approximations","position":{"start":{"line":708,"column":1},"end":{"line":708,"column":1}},"key":"WodvsTyMKD"}],"key":"DmngaLIFGj"},{"type":"text","value":" of the dynamics and\ncost functions.","position":{"start":{"line":708,"column":1},"end":{"line":708,"column":1}},"key":"xqj5UTVMbB"}],"key":"j4U7eWlFjF"},{"type":"heading","depth":3,"position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"children":[{"type":"text","value":"Time-dependent dynamics and cost function","position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"key":"RmMArN9LTo"}],"label":"time_dep_lqr","identifier":"time_dep_lqr","html_id":"time-dep-lqr","enumerator":"2.5.1","key":"LViFr8Tjoa"},{"type":"paragraph","position":{"start":{"line":715,"column":1},"end":{"line":723,"column":1}},"children":[{"type":"text","value":"So far, we’ve considered the ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"PnPVQHpSdx"},{"type":"emphasis","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"children":[{"type":"text","value":"time-homogeneous","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"scI6zI1TPC"}],"key":"nIKgAjhiXw"},{"type":"text","value":" case, where the dynamics\nand cost function stay the same at every timestep. However, this might\nnot always be the case. As an example, in many sports, the rules and\nscoring system might change during an overtime period. 
To address these\nsorts of problems, we can loosen the time-homogeneous restriction, and\nconsider the case where the dynamics and cost function are\n","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"Yk784Up6CI"},{"type":"emphasis","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"children":[{"type":"text","value":"time-dependent.","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"h7qIB9JKuL"}],"key":"ixizpqV1bW"},{"type":"text","value":" Our analysis remains almost identical; in fact, we can\nsimply add a time index to the matrices ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"LbGLXi0uSO"},{"type":"inlineMath","value":"A","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"html":"AAA","key":"t1Vs8UzluW"},{"type":"text","value":" and ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"LbQwImFxqX"},{"type":"inlineMath","value":"B","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"html":"BBB","key":"pxRUmPgUzd"},{"type":"text","value":" that determine the\ndynamics and the matrices ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"B0NazK5RI5"},{"type":"inlineMath","value":"Q","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"html":"QQQ","key":"IiXampEejo"},{"type":"text","value":" and ","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"ap88F3gHAo"},{"type":"inlineMath","value":"R","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"html":"RRR","key":"widFQD6UGS"},{"type":"text","value":" that determine the cost.","position":{"start":{"line":715,"column":1},"end":{"line":715,"column":1}},"key":"XPa48BQlz4"}],"key":"A6v0dugMjq"},{"type":"paragraph","position":{"start":{"line":725,"column":1},"end":{"line":725,"column":1}},"children":[{"type":"text","value":"The modified problem is now defined as follows:","position":{"start":{"line":725,"column":1},"end":{"line":725,"column":1}},"key":"gg2KwE61QF"}],"key":"QWw4clZnuj"},{"type":"proof","kind":"definition","label":"time_dependent_lqr","identifier":"time_dependent_lqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Time-dependent LQR","position":{"start":{"line":727,"column":1},"end":{"line":727,"column":1}},"key":"bj6483ZPFx"}],"key":"Da9LYK68AX"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\pi_{0}, \\dots, \\pi_{\\hor-1}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} (\\st_\\hi^\\top Q_\\hi \\st_\\hi) + \\act_\\hi^\\top R_\\hi \\act_\\hi \\right) + \\st_\\hor^\\top Q_\\hor \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + w_\\hi \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}","position":{"start":{"line":730,"column":1},"end":{"line":738,"column":1}},"html":"minπ0,,πH1E[(h=0H1(xhQhxh)+uhRhuh)+xHQHxH]wherexh+1=fh(xh,uh,wh)=Ahxh+Bhuh+whx0μ0uh=πh(xh)whN(0,σ2I).\\begin{aligned}\n \\min_{\\pi_{0}, \\dots, \\pi_{\\hor-1}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} (\\st_\\hi^\\top Q_\\hi \\st_\\hi) + \\act_\\hi^\\top R_\\hi \\act_\\hi \\right) + \\st_\\hor^\\top Q_\\hor \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, 
w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + w_\\hi \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}π0,,πH1minwhereE[(h=0H1(xhQhxh)+uhRhuh)+xHQHxH]xh+1=fh(xh,uh,wh)=Ahxh+Bhuh+whx0μ0uh=πh(xh)whN(0,σ2I).","enumerator":"2.36","key":"dwaY4SgTEl"}],"enumerator":"2.6","html_id":"time-dependent-lqr","key":"qJ387cOoYn"},{"type":"paragraph","position":{"start":{"line":743,"column":1},"end":{"line":745,"column":1}},"children":[{"type":"text","value":"The derivation of the optimal value functions and the optimal policy\nremains almost exactly the same, and we can modify the Riccati equation\naccordingly:","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"hDpvVZexem"}],"key":"dztOBFu4fJ"},{"type":"proof","kind":"definition","label":"riccati_time_dependent","identifier":"riccati_time_dependent","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Time-dependent Riccati Equation","position":{"start":{"line":747,"column":1},"end":{"line":747,"column":1}},"key":"ZgZxieS3jj"}],"key":"Ii0inyn9ul"},{"type":"math","value":"P_\\hi = Q_\\hi + A_\\hi^\\top P_{\\hi+1} A_\\hi - A_\\hi^\\top P_{\\hi+1} B_\\hi (R_\\hi + B_\\hi^\\top P_{\\hi+1} B_\\hi)^{-1} B_\\hi^\\top P_{\\hi+1} A_\\hi.","position":{"start":{"line":750,"column":1},"end":{"line":752,"column":1}},"html":"Ph=Qh+AhPh+1AhAhPh+1Bh(Rh+BhPh+1Bh)1BhPh+1Ah.P_\\hi = Q_\\hi + A_\\hi^\\top P_{\\hi+1} A_\\hi - A_\\hi^\\top P_{\\hi+1} B_\\hi (R_\\hi + B_\\hi^\\top P_{\\hi+1} B_\\hi)^{-1} B_\\hi^\\top P_{\\hi+1} A_\\hi.Ph=Qh+AhPh+1AhAhPh+1Bh(Rh+BhPh+1Bh)1BhPh+1Ah.","enumerator":"2.37","key":"UyfWtHRu41"},{"type":"paragraph","position":{"start":{"line":754,"column":1},"end":{"line":756,"column":1}},"children":[{"type":"text","value":"Note that this is just the time-homogeneous Riccati equation\n(","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"qMeop9ZQRy"},{"type":"crossReference","kind":"proof:definition","identifier":"riccati","label":"riccati","children":[{"type":"text","value":"Definition ","key":"V60qKqdC8s"},{"type":"text","value":"2.5","key":"vWUiSlSoWu"}],"template":"Definition %s","enumerator":"2.5","resolved":true,"html_id":"riccati","key":"b143eQHEkx"},{"type":"text","value":"), but with the time index added to each of the\nrelevant matrices.","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"gh6NxVmEQP"}],"key":"vqohMbnP5U"}],"enumerator":"2.7","html_id":"riccati-time-dependent","key":"cm6vXKdX3l"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Exercise","position":{"start":{"line":759,"column":1},"end":{"line":759,"column":1}},"key":"otrHOU98iZ"}],"key":"C5XfiMoabx"},{"type":"paragraph","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"Walk through the proof in ","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"w4Olf84Q8g"},{"type":"crossReference","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"Section ","key":"RChhrWJBHw"},{"type":"text","value":"2.4","key":"iqNcCYAJBe"}],"identifier":"optimal_lqr","label":"optimal_lqr","kind":"heading","template":"Section %s","enumerator":"2.4","resolved":true,"html_id":"optimal-lqr","key":"ohS7jf68Ce"},{"type":"text","value":" to verify that we can simply add 
","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"ZomQ5SblS6"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"html":"h\\hih","key":"DwUjkQPs6w"},{"type":"text","value":" for the time-dependent case.","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"V7PttU9YYe"}],"key":"pQy9d0xVvO"}],"key":"DM7YFP4t2q"},{"type":"paragraph","position":{"start":{"line":763,"column":1},"end":{"line":765,"column":1}},"children":[{"type":"text","value":"Additionally, by allowing the dynamics to vary across time, we gain the\nability to ","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"key":"JHMJVvUrE6"},{"type":"emphasis","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"children":[{"type":"text","value":"locally approximate","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"key":"IybYnSuav1"}],"key":"ctHJYM9esM"},{"type":"text","value":" nonlinear dynamics at each timestep.\nWe’ll discuss this later in the chapter.","position":{"start":{"line":763,"column":1},"end":{"line":763,"column":1}},"key":"HiJoblgH1S"}],"key":"vi4a5uUWgV"},{"type":"heading","depth":3,"position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"children":[{"type":"text","value":"More general quadratic cost functions","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"key":"mxTUpYoR5U"}],"identifier":"more-general-quadratic-cost-functions","label":"More general quadratic cost functions","html_id":"more-general-quadratic-cost-functions","implicit":true,"enumerator":"2.5.2","key":"QU99BEy1N2"},{"type":"paragraph","position":{"start":{"line":769,"column":1},"end":{"line":776,"column":1}},"children":[{"type":"text","value":"Our original cost function had only second-order terms with respect to\nthe state and action, incentivizing staying as close as possible to\n","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"lVYbaWyvhG"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star) = (0, 0)","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"(x,u)=(0,0)(\\st^\\star, \\act^\\star) = (0, 0)(x,u)=(0,0)","key":"vYBMihL3N3"},{"type":"text","value":". We can also consider more general\nquadratic cost functions that also have first-order terms and a constant\nterm. 
Combining this with time-dependent dynamics results in the\nfollowing expression, where we introduce a new matrix ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"UYhJuHHpYW"},{"type":"inlineMath","value":"M_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"MhM_\\hiMh","key":"WCyCkyiIiV"},{"type":"text","value":" for the\ncross term, linear coefficients ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"M44DQ85Pg0"},{"type":"inlineMath","value":"q_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"qhq_\\hiqh","key":"s0yYBEpvXd"},{"type":"text","value":" and ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"aGo4qvcSDY"},{"type":"inlineMath","value":"r_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"rhr_\\hirh","key":"l98hSwBHDc"},{"type":"text","value":" for the state and\naction respectively, and a constant term ","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"sGJnBAFkWy"},{"type":"inlineMath","value":"c_\\hi","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"html":"chc_\\hich","key":"y87jwOjxBG"},{"type":"text","value":":","position":{"start":{"line":769,"column":1},"end":{"line":769,"column":1}},"key":"tj3xdiCtkq"}],"key":"uXJQBGypOT"},{"type":"math","value":"c_\\hi(\\st_\\hi, \\act_\\hi) = ( \\st_\\hi^\\top Q_\\hi \\st_\\hi + \\st_\\hi^\\top M_\\hi \\act_\\hi + \\act_\\hi^\\top R_\\hi \\act_\\hi ) + (\\st_\\hi^\\top q_\\hi + \\act_\\hi^\\top r_\\hi) + c_\\hi.","label":"general_quadratic_cost","identifier":"general_quadratic_cost","html":"ch(xh,uh)=(xhQhxh+xhMhuh+uhRhuh)+(xhqh+uhrh)+ch.c_\\hi(\\st_\\hi, \\act_\\hi) = ( \\st_\\hi^\\top Q_\\hi \\st_\\hi + \\st_\\hi^\\top M_\\hi \\act_\\hi + \\act_\\hi^\\top R_\\hi \\act_\\hi ) + (\\st_\\hi^\\top q_\\hi + \\act_\\hi^\\top r_\\hi) + c_\\hi.ch(xh,uh)=(xhQhxh+xhMhuh+uhRhuh)+(xhqh+uhrh)+ch.","enumerator":"2.38","html_id":"general-quadratic-cost","key":"PAf4QUVR4g"},{"type":"paragraph","position":{"start":{"line":784,"column":1},"end":{"line":786,"column":1}},"children":[{"type":"text","value":"Similarly, we can also include a\nconstant term ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"BwyFcDXcqr"},{"type":"inlineMath","value":"v_\\hi \\in \\mathbb{R}^{n_\\st}","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"html":"vhRnxv_\\hi \\in \\mathbb{R}^{n_\\st}vhRnx","key":"lOo08uxA0j"},{"type":"text","value":" in the dynamics (note that this is\n","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"gPhVLSzvVX"},{"type":"emphasis","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"RxhZor9iZq"}],"key":"qp2eBwphux"},{"type":"text","value":" at each timestep, unlike the stochastic noise ","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"aJkFAhbQ7K"},{"type":"inlineMath","value":"w_\\hi","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"html":"whw_\\hiwh","key":"xEFuhkAywM"},{"type":"text","value":"):","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"pgubwuQjdG"}],"key":"PLNVg4IWc4"},{"type":"math","value":"\\st_{\\hi+1} = f_\\hi(\\st_\\hi, 
\\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + v_\\hi + w_\\hi.","position":{"start":{"line":789,"column":1},"end":{"line":791,"column":1}},"html":"xh+1=fh(xh,uh,wh)=Ahxh+Bhuh+vh+wh.\\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + v_\\hi + w_\\hi.xh+1=fh(xh,uh,wh)=Ahxh+Bhuh+vh+wh.","enumerator":"2.39","key":"LS0TVhdGmS"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"exercise","position":{"start":{"line":795,"column":1},"end":{"line":795,"column":1}},"key":"sloohH9wtN"}],"key":"WzE23AvrD3"},{"type":"paragraph","position":{"start":{"line":796,"column":1},"end":{"line":797,"column":1}},"children":[{"type":"text","value":"Derive the optimal solution. You will need to slightly modify the\nproof in ","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"key":"lcmyQLRxve"},{"type":"crossReference","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"children":[{"type":"text","value":"Section ","key":"QoghENNIVf"},{"type":"text","value":"2.4","key":"Gz4g28WbUa"}],"identifier":"optimal_lqr","label":"optimal_lqr","kind":"heading","template":"Section %s","enumerator":"2.4","resolved":true,"html_id":"optimal-lqr","key":"P22mT2C4UE"},{"type":"text","value":".","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"key":"PEdftVhjs2"}],"key":"mfVrg0oIsM"}],"key":"DF0SPVNzcZ"},{"type":"heading","depth":3,"position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"children":[{"type":"text","value":"Tracking a predefined trajectory","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"key":"QB0UGT5C8u"}],"identifier":"tracking-a-predefined-trajectory","label":"Tracking a predefined trajectory","html_id":"tracking-a-predefined-trajectory","implicit":true,"enumerator":"2.5.3","key":"biQYo7j2Q8"},{"type":"paragraph","position":{"start":{"line":802,"column":1},"end":{"line":807,"column":1}},"children":[{"type":"text","value":"Consider applying LQR to a task like autonomous driving, where the\ntarget state-action pair changes over time. We might want the vehicle to\nfollow a predefined ","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"yNmFBhM1ZG"},{"type":"emphasis","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"children":[{"type":"text","value":"trajectory","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"qH7yY96vvu"}],"key":"Axe9g5N0xA"},{"type":"text","value":" of states and actions\n","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"iI4ZuTOqa5"},{"type":"inlineMath","value":"(\\st_\\hi^\\star, \\act_\\hi^\\star)_{\\hi=0}^{\\hor-1}","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"html":"(xh,uh)h=0H1(\\st_\\hi^\\star, \\act_\\hi^\\star)_{\\hi=0}^{\\hor-1}(xh,uh)h=0H1","key":"NDRSqIpdXj"},{"type":"text","value":". 
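For reference, a small sketch of evaluating the general quadratic cost (2.38) and the affine dynamics with a constant offset (2.39); the matrices and vectors below are placeholders chosen only to make the shapes concrete.

```python
import numpy as np

def general_quadratic_cost(x, u, Q, M, R, q, r, c):
    """Evaluate c_h(x, u) = x^T Q x + x^T M u + u^T R u + x^T q + u^T r + c."""
    return float(x @ Q @ x + x @ M @ u + u @ R @ u + x @ q + u @ r + c)

def affine_dynamics(x, u, A, B, v, w=None):
    """Evaluate x_{h+1} = A x + B u + v (+ w in the noisy case)."""
    x_next = A @ x + B @ u + v
    return x_next if w is None else x_next + w

# Placeholder problem data: 2-dimensional state, 1-dimensional action.
Q, R, M = np.eye(2), np.eye(1), np.zeros((2, 1))
q, r, c = np.zeros(2), np.zeros(1), 0.0
A, B, v = np.eye(2), np.array([[0.0], [1.0]]), np.array([0.1, 0.0])

x, u = np.array([1.0, -0.5]), np.array([0.3])
print(general_quadratic_cost(x, u, Q, M, R, q, r, c))
print(affine_dynamics(x, u, A, B, v))
```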
To express this as a\ncontrol problem, we’ll need a corresponding time-dependent cost\nfunction:","position":{"start":{"line":802,"column":1},"end":{"line":802,"column":1}},"key":"BlaKfhtkVE"}],"key":"eAm2AKcRao"},{"type":"math","value":"c_\\hi(\\st_\\hi, \\act_\\hi) = (\\st_\\hi - \\st^\\star_\\hi)^\\top Q (\\st_\\hi - \\st^\\star_\\hi) + (\\act_\\hi - \\act^\\star_\\hi)^\\top R (\\act_\\hi - \\act^\\star_\\hi).","position":{"start":{"line":810,"column":1},"end":{"line":812,"column":1}},"html":"ch(xh,uh)=(xhxh)Q(xhxh)+(uhuh)R(uhuh).c_\\hi(\\st_\\hi, \\act_\\hi) = (\\st_\\hi - \\st^\\star_\\hi)^\\top Q (\\st_\\hi - \\st^\\star_\\hi) + (\\act_\\hi - \\act^\\star_\\hi)^\\top R (\\act_\\hi - \\act^\\star_\\hi).ch(xh,uh)=(xhxh)Q(xhxh)+(uhuh)R(uhuh).","enumerator":"2.40","key":"ukLq4QGAPE"},{"type":"paragraph","position":{"start":{"line":815,"column":1},"end":{"line":818,"column":1}},"children":[{"type":"text","value":"Note that this punishes states and actions that are far from the\nintended trajectory. By expanding out these multiplications, we can see\nthat this is actually a special case of the more general quadratic cost\nfunction above ","position":{"start":{"line":815,"column":1},"end":{"line":815,"column":1}},"key":"bP82kRbY22"},{"type":"crossReference","position":{"start":{"line":815,"column":1},"end":{"line":815,"column":1}},"children":[{"type":"text","value":"(","key":"QpFNNktzMf"},{"type":"text","value":"2.38","key":"XhSpvpjFy2"},{"type":"text","value":")","key":"gFXvFKVdGk"}],"identifier":"general_quadratic_cost","label":"general_quadratic_cost","kind":"equation","template":"(%s)","enumerator":"2.38","resolved":true,"html_id":"general-quadratic-cost","key":"VHX064dRfl"},{"type":"text","value":":","position":{"start":{"line":815,"column":1},"end":{"line":815,"column":1}},"key":"ooPoOG2N2j"}],"key":"nRM6REDo6i"},{"type":"math","value":"M_\\hi = 0, \\qquad q_\\hi = -2Q \\st^\\star_\\hi, \\qquad r_\\hi = -2R \\act^\\star_\\hi, \\qquad c_\\hi = (\\st^\\star_\\hi)^\\top Q (\\st^\\star_\\hi) + (\\act^\\star_\\hi)^\\top R (\\act^\\star_\\hi).","position":{"start":{"line":821,"column":1},"end":{"line":823,"column":1}},"html":"Mh=0,qh=2Qxh,rh=2Ruh,ch=(xh)Q(xh)+(uh)R(uh).M_\\hi = 0, \\qquad q_\\hi = -2Q \\st^\\star_\\hi, \\qquad r_\\hi = -2R \\act^\\star_\\hi, \\qquad c_\\hi = (\\st^\\star_\\hi)^\\top Q (\\st^\\star_\\hi) + (\\act^\\star_\\hi)^\\top R (\\act^\\star_\\hi).Mh=0,qh=2Qxh,rh=2Ruh,ch=(xh)Q(xh)+(uh)R(uh).","enumerator":"2.41","key":"x7qqJb4SC3"},{"type":"heading","depth":2,"position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"children":[{"type":"text","value":"Approximating nonlinear dynamics","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"ku43ajpmcr"}],"label":"approx_nonlinear","identifier":"approx_nonlinear","html_id":"approx-nonlinear","enumerator":"2.6","key":"VL5Wxp4J04"},{"type":"paragraph","position":{"start":{"line":830,"column":1},"end":{"line":835,"column":1}},"children":[{"type":"text","value":"The LQR algorithm solves for the optimal policy when the dynamics are\n","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"qbH23mM62M"},{"type":"emphasis","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"VJdU7chyKU"}],"key":"ysmPu7y7o3"},{"type":"text","value":" and the cost function is an 
","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"kv1Dx1n9cF"},{"type":"emphasis","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"children":[{"type":"text","value":"upward-curved quadratic","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"C4P3btJ7rP"}],"key":"kGhdOesTDK"},{"type":"text","value":". However,\nreal settings are rarely this simple! Let’s return to the CartPole\nexample from the start of the chapter\n(","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"TXLYm3fmuW"},{"type":"crossReference","kind":"proof:example","identifier":"cart_pole","label":"cart_pole","children":[{"type":"text","value":"Example ","key":"NN2DU9DuFr"},{"type":"text","value":"2.1","key":"ShR40JHIKC"}],"template":"Example %s","enumerator":"2.1","resolved":true,"html_id":"cart-pole","key":"qfGCBPRYmo"},{"type":"text","value":"). The dynamics (physics) aren’t linear. How\ncan we approximate this by an LQR problem?","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"H3S2JdM5FX"}],"key":"K5VnGSwz41"},{"type":"paragraph","position":{"start":{"line":837,"column":1},"end":{"line":840,"column":1}},"children":[{"type":"text","value":"Concretely, let’s consider a ","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"key":"gKQdnFDgGg"},{"type":"emphasis","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"children":[{"type":"text","value":"noise-free","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"key":"kVa7rKYvz5"}],"key":"MlZpgiR0Kx"},{"type":"text","value":" problem since, as we saw, the\nnoise doesn’t factor into the optimal policy. Let’s assume the dynamics\nand cost function are stationary, and ignore the terminal state for\nsimplicity:","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"key":"fXuPNJk0EG"}],"key":"sUVqH7tERZ"},{"type":"proof","kind":"definition","label":"nonlinear_control","identifier":"nonlinear_control","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Nonlinear control problem","position":{"start":{"line":842,"column":1},"end":{"line":842,"column":1}},"key":"yl2q8cw0V0"}],"key":"bKeq2ME7Zv"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E_{\\st_0} \\left[ \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\act_\\hi) \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\act_\\hi) \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & c(\\st, \\act) = d(\\st, \\st^\\star) + d(\\act, \\act^\\star).\n\\end{aligned}","position":{"start":{"line":847,"column":1},"end":{"line":855,"column":1}},"html":"minπ0,,πH1:SAEx0[h=0H1c(xh,uh)]wherexh+1=f(xh,uh)uh=πh(xh)x0μ0c(x,u)=d(x,x)+d(u,u).\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E_{\\st_0} \\left[ \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\act_\\hi) \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\act_\\hi) \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & c(\\st, \\act) = d(\\st, \\st^\\star) + d(\\act, 
\\act^\\star).\n\\end{aligned}π0,,πH1:SAminwhereEx0[h=0H1c(xh,uh)]xh+1=f(xh,uh)uh=πh(xh)x0μ0c(x,u)=d(x,x)+d(u,u).","enumerator":"2.42","key":"KZr94PG0ZT"},{"type":"paragraph","position":{"start":{"line":857,"column":1},"end":{"line":858,"column":1}},"children":[{"type":"text","value":"Here, ","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"wWbtVmBlMk"},{"type":"inlineMath","value":"d","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"html":"ddd","key":"DmrUsJpr1h"},{"type":"text","value":" denotes a function that measures the\n“distance” between its two arguments.","position":{"start":{"line":857,"column":1},"end":{"line":857,"column":1}},"key":"oYLaRw9qB1"}],"key":"yxRS4Cr1Jl"}],"enumerator":"2.8","html_id":"nonlinear-control","key":"udCOd4qXx1"},{"type":"paragraph","position":{"start":{"line":861,"column":1},"end":{"line":871,"column":1}},"children":[{"type":"text","value":"This is now only slightly simplified from the general optimal control\nproblem (see\n","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"tVJVcx0yKc"},{"type":"crossReference","kind":"proof:definition","identifier":"optimal_control","label":"optimal_control","children":[{"type":"text","value":"Definition ","key":"WY1Rq4VT30"},{"type":"text","value":"2.1","key":"Wja4zs14y9"}],"template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"hnX85Nom4v"},{"type":"text","value":"). Here, we don’t know an analytical form\nfor the dynamics ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"XlDQbtCJSK"},{"type":"inlineMath","value":"f","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"html":"fff","key":"mn8IwuBnDp"},{"type":"text","value":" or the cost function ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"TaAdV44j3F"},{"type":"inlineMath","value":"c","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"html":"ccc","key":"G3XG037Rre"},{"type":"text","value":", but we assume that we’re\nable to ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"e9lOg0VSBy"},{"type":"emphasis","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"children":[{"type":"text","value":"query/sample/simulate","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"H0nH976Fla"}],"key":"Ywz6MDHspQ"},{"type":"text","value":" them to get their values at a given\nstate and action. To clarify, consider the case where the dynamics are\ngiven by real world physics. We can’t (yet) write down an expression for\nthe dynamics that we can differentiate or integrate analytically.\nHowever, we can still ","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"ay4DP2EnpP"},{"type":"emphasis","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"children":[{"type":"text","value":"simulate","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"lyCLgu7A1a"}],"key":"Zzb2Cir5OO"},{"type":"text","value":" the dynamics and cost function by\nrunning a real-world experiment and measuring the resulting states and\ncosts. 
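To illustrate what query-only access to the dynamics and cost might look like, here is a hypothetical simulator for a damped pendulum. The physical constants are made up; the point is that we only ever call f and c, as if they were a physics engine or a real experiment, and never differentiate them analytically.

```python
import numpy as np

def f(s, a, dt=0.05):
    """Black-box dynamics: s = (angle, angular velocity), a = (torque,).

    Treated as something we can only *call*, not differentiate analytically.
    """
    theta, omega = s
    omega_next = omega + dt * (-9.8 * np.sin(theta) - 0.1 * omega + a[0])
    theta_next = theta + dt * omega_next
    return np.array([theta_next, omega_next])

def c(s, a, s_star=np.zeros(2), a_star=np.zeros(1)):
    """Black-box cost: squared distance to the goal state-action pair."""
    return float(np.sum((s - s_star) ** 2) + np.sum((a - a_star) ** 2))

# We can sample a trajectory and its cost by repeatedly querying f and c.
s = np.array([0.5, 0.0])
total_cost = 0.0
for _ in range(20):
    a = np.zeros(1)          # some policy; here, apply no torque
    total_cost += c(s, a)
    s = f(s, a)
```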
How can we adapt LQR to this more general nonlinear case?","position":{"start":{"line":861,"column":1},"end":{"line":861,"column":1}},"key":"geDpgFGno4"}],"key":"rXzezoqSFq"},{"type":"heading","depth":3,"position":{"start":{"line":873,"column":1},"end":{"line":873,"column":1}},"children":[{"type":"text","value":"Local linearization","position":{"start":{"line":873,"column":1},"end":{"line":873,"column":1}},"key":"lCUMqspQzw"}],"identifier":"local-linearization","label":"Local linearization","html_id":"local-linearization","implicit":true,"enumerator":"2.6.1","key":"JVRD8XrPE9"},{"type":"paragraph","position":{"start":{"line":875,"column":1},"end":{"line":883,"column":1}},"children":[{"type":"text","value":"How can we apply LQR when the dynamics are nonlinear or the cost\nfunction is more complex? We’ll exploit the useful fact that we can take\na function that’s ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"fM3XR8HdNF"},{"type":"emphasis","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"children":[{"type":"text","value":"locally continuous","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"ZkyJ3MfmcD"}],"key":"ATVyGUuvNx"},{"type":"text","value":" around ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"FR52oxCmVX"},{"type":"inlineMath","value":"(s^\\star, a^\\star)","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"(s,a)(s^\\star, a^\\star)(s,a)","key":"vkVAvdth0V"},{"type":"text","value":" and\napproximate it nearby with low-order polynomials (i.e. its Taylor\napproximation). In particular, as long as the dynamics ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"Iwtf82D42P"},{"type":"inlineMath","value":"f","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"fff","key":"gGqXIJMJ1W"},{"type":"text","value":" are\ndifferentiable around ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"c4OorvGfiO"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"aVBn4M7s8p"},{"type":"text","value":" and the cost function\n","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"CRr0A9tpH1"},{"type":"inlineMath","value":"c","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"ccc","key":"zTiGGH420A"},{"type":"text","value":" is twice differentiable at ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"Y3uUNkHtUV"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"paQ3f8xBXh"},{"type":"text","value":", we can take a\nlinear approximation of ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"rV42ErIhtk"},{"type":"inlineMath","value":"f","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"fff","key":"V00gBJexSD"},{"type":"text","value":" and a quadratic approximation of ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"Nw24FTkjYE"},{"type":"inlineMath","value":"c","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"html":"ccc","key":"BIqyW7jeB0"},{"type":"text","value":" 
to\nbring us back to the regime of LQR.","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"EpoNLG71tp"}],"key":"nmedcm40Wh"},{"type":"paragraph","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"children":[{"type":"text","value":"Linearizing the dynamics around ","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"key":"CNebNjIjlm"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"TnYs4VkV4n"},{"type":"text","value":" gives:","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"key":"LeQIzWXw2P"}],"key":"ul5sXVBJDR"},{"type":"math","value":"\\begin{gathered}\n f(\\st, \\act) \\approx f(\\st^\\star, \\act^\\star) + \\nabla_\\st f(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act f(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n (\\nabla_\\st f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\st_j}, \\quad i, j \\le n_\\st \\qquad (\\nabla_\\act f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\act_j}, \\quad i \\le n_\\st, j \\le n_\\act\n\\end{gathered}","position":{"start":{"line":888,"column":1},"end":{"line":893,"column":1}},"html":"f(x,u)f(x,u)+xf(x,u)(xx)+uf(x,u)(uu)(xf(x,u))ij=dfi(x,u)dxj,i,jnx(uf(x,u))ij=dfi(x,u)duj,inx,jnu\\begin{gathered}\n f(\\st, \\act) \\approx f(\\st^\\star, \\act^\\star) + \\nabla_\\st f(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act f(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n (\\nabla_\\st f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\st_j}, \\quad i, j \\le n_\\st \\qquad (\\nabla_\\act f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\act_j}, \\quad i \\le n_\\st, j \\le n_\\act\n\\end{gathered}f(x,u)f(x,u)+xf(x,u)(xx)+uf(x,u)(uu)(xf(x,u))ij=dxjdfi(x,u),i,jnx(uf(x,u))ij=dujdfi(x,u),inx,jnu","enumerator":"2.43","key":"wimQeGGZ1R"},{"type":"paragraph","position":{"start":{"line":895,"column":1},"end":{"line":896,"column":1}},"children":[{"type":"text","value":"and quadratizing the cost function around\n","position":{"start":{"line":895,"column":1},"end":{"line":895,"column":1}},"key":"rmHyGkgBaS"},{"type":"inlineMath","value":"(\\st^\\star, \\act^\\star)","position":{"start":{"line":895,"column":1},"end":{"line":895,"column":1}},"html":"(x,u)(\\st^\\star, \\act^\\star)(x,u)","key":"vlNNoWE1MF"},{"type":"text","value":" gives:","position":{"start":{"line":895,"column":1},"end":{"line":895,"column":1}},"key":"KFQpmHD3hu"}],"key":"EaVL03GTmZ"},{"type":"math","value":"\\begin{aligned}\n c(\\st, \\act) & \\approx c(\\st^\\star, \\act^\\star) \\quad \\text{constant term} \\\\\n & \\qquad + \\nabla_\\st c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act c(\\st^\\star, \\act^\\star) (a - \\act^\\star) \\quad \\text{linear terms} \\\\\n & \\left. 
\\begin{aligned}\n & \\qquad + \\frac{1}{2} (\\st - \\st^\\star)^\\top \\nabla_{\\st \\st} c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) \\\\\n & \\qquad + \\frac{1}{2} (\\act - \\act^\\star)^\\top \\nabla_{\\act \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n & \\qquad + (\\st - \\st^\\star)^\\top \\nabla_{\\st \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star)\n \\end{aligned} \\right\\} \\text{quadratic terms}\n\\end{aligned}","position":{"start":{"line":898,"column":1},"end":{"line":908,"column":1}},"html":"c(x,u)c(x,u)constant term+xc(x,u)(xx)+uc(x,u)(au)linear terms+12(xx)xxc(x,u)(xx)+12(uu)uuc(x,u)(uu)+(xx)xuc(x,u)(uu)}quadratic terms\\begin{aligned}\n c(\\st, \\act) & \\approx c(\\st^\\star, \\act^\\star) \\quad \\text{constant term} \\\\\n & \\qquad + \\nabla_\\st c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act c(\\st^\\star, \\act^\\star) (a - \\act^\\star) \\quad \\text{linear terms} \\\\\n & \\left. \\begin{aligned}\n & \\qquad + \\frac{1}{2} (\\st - \\st^\\star)^\\top \\nabla_{\\st \\st} c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) \\\\\n & \\qquad + \\frac{1}{2} (\\act - \\act^\\star)^\\top \\nabla_{\\act \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n & \\qquad + (\\st - \\st^\\star)^\\top \\nabla_{\\st \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star)\n \\end{aligned} \\right\\} \\text{quadratic terms}\n\\end{aligned}c(x,u)c(x,u)constant term+xc(x,u)(xx)+uc(x,u)(au)linear terms+21(xx)xxc(x,u)(xx)+21(uu)uuc(x,u)(uu)+(xx)xuc(x,u)(uu)quadratic terms","enumerator":"2.44","key":"nRy0HkKlp1"},{"type":"paragraph","position":{"start":{"line":910,"column":1},"end":{"line":910,"column":1}},"children":[{"type":"text","value":"where the gradients and Hessians are defined as","position":{"start":{"line":910,"column":1},"end":{"line":910,"column":1}},"key":"TjgdSmMi56"}],"key":"c6XFSWrguE"},{"type":"math","value":"\\begin{aligned}\n (\\nabla_\\st c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\st_i}, \\quad i \\le n_\\st\n & (\\nabla_\\act c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\act_i}, \\quad i \\le n_\\act \\\\\n (\\nabla_{\\st \\st} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\st_j}, \\quad i, j \\le n_\\st\n & (\\nabla_{\\act \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\act_i d \\act_j}, \\quad i, j \\le n_\\act \\\\\n (\\nabla_{\\st \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\act_j}. \\quad i \\le n_\\st, j \\le n_\\act\n\\end{aligned}","position":{"start":{"line":913,"column":1},"end":{"line":921,"column":1}},"html":"(xc(x,u))i=dc(x,u)dxi,inx(uc(x,u))i=dc(x,u)dui,inu(xxc(x,u))ij=d2c(x,u)dxidxj,i,jnx(uuc(x,u))ij=d2c(x,u)duiduj,i,jnu(xuc(x,u))ij=d2c(x,u)dxiduj.inx,jnu\\begin{aligned}\n (\\nabla_\\st c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\st_i}, \\quad i \\le n_\\st\n & (\\nabla_\\act c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\act_i}, \\quad i \\le n_\\act \\\\\n (\\nabla_{\\st \\st} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\st_j}, \\quad i, j \\le n_\\st\n & (\\nabla_{\\act \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\act_i d \\act_j}, \\quad i, j \\le n_\\act \\\\\n (\\nabla_{\\st \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\act_j}. 
\\quad i \\le n_\\st, j \\le n_\\act\n\\end{aligned}(xc(x,u))i(xxc(x,u))ij(xuc(x,u))ij=dxidc(x,u),inx=dxidxjd2c(x,u),i,jnx=dxidujd2c(x,u).inx,jnu(uc(x,u))i(uuc(x,u))ij=duidc(x,u),inu=duidujd2c(x,u),i,jnu","enumerator":"2.45","key":"TQq4x1LD0v"},{"type":"paragraph","position":{"start":{"line":925,"column":1},"end":{"line":928,"column":1}},"children":[{"type":"strong","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"m9vZqACB76"}],"key":"GWMEjGOI46"},{"type":"text","value":" Note that this cost can be expressed in the general\nquadratic form seen in\n","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"ffgqejL8Bg"},{"type":"crossReference","kind":"equation","identifier":"general_quadratic_cost","label":"general_quadratic_cost","children":[{"type":"text","value":"(","key":"ZVbLF8SSUZ"},{"type":"text","value":"2.38","key":"CQ9m9wWHbR"},{"type":"text","value":")","key":"EQF9xCvCR1"}],"template":"(%s)","enumerator":"2.38","resolved":true,"html_id":"general-quadratic-cost","key":"JwGV5ygeMK"},{"type":"text","value":". Derive the corresponding\nquantities ","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"M1EKmvKQaA"},{"type":"inlineMath","value":"Q, R, M, q, r, c","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"html":"Q,R,M,q,r,cQ, R, M, q, r, cQ,R,M,q,r,c","key":"HbX1kma97V"},{"type":"text","value":".","position":{"start":{"line":925,"column":1},"end":{"line":925,"column":1}},"key":"Rd9Wjmobor"}],"key":"K4uezpMsUN"},{"type":"heading","depth":3,"position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"children":[{"type":"text","value":"Finite differencing","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"key":"seLVZd6HjP"}],"identifier":"finite-differencing","label":"Finite differencing","html_id":"finite-differencing","implicit":true,"enumerator":"2.6.2","key":"owN3aIgjVz"},{"type":"paragraph","position":{"start":{"line":932,"column":1},"end":{"line":936,"column":1}},"children":[{"type":"text","value":"To calculate these gradients and Hessians in practice,\nwe use a method known as ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"dZPXI8ZPx2"},{"type":"strong","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"children":[{"type":"text","value":"finite differencing","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"cXfwHN2Fha"}],"key":"NB0nkK4X5t"},{"type":"text","value":" for numerically computing derivatives.\nNamely, we can simply use the limit definition of the derivative, and\nsee how the function changes as we add or subtract a tiny ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"Xy8aWabFrQ"},{"type":"text","value":"δ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"rdGddb3XPg"},{"type":"text","value":" to\nthe input.","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"ynZIBcFOpQ"}],"key":"NnHUhc7fnZ"},{"type":"math","value":"\\frac{d}{dx} f(x) = \\lim_{\\delta \\to 0} \\frac{f(x + \\delta) - f(x)}{\\delta}","position":{"start":{"line":939,"column":1},"end":{"line":941,"column":1}},"html":"ddxf(x)=limδ0f(x+δ)f(x)δ\\frac{d}{dx} f(x) = \\lim_{\\delta \\to 0} \\frac{f(x + \\delta) - 
f(x)}{\\delta}dxdf(x)=δ0limδf(x+δ)f(x)","enumerator":"2.46","key":"TM1FN3TdfD"},{"type":"paragraph","position":{"start":{"line":944,"column":1},"end":{"line":946,"column":1}},"children":[{"type":"text","value":"Note that this only requires us to be able to ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"McntYkyec0"},{"type":"emphasis","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"children":[{"type":"text","value":"query","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"bEkO15MjFZ"}],"key":"aOztcKFzmD"},{"type":"text","value":" the function, not\nto have an analytical expression for it, which is why it’s so useful in\npractice.","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"rrF8zt3fTy"}],"key":"aelEI2leWD"},{"type":"heading","depth":3,"position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"Local convexification","position":{"start":{"line":948,"column":1},"end":{"line":948,"column":1}},"key":"ACIXvuBcie"}],"identifier":"local-convexification","label":"Local convexification","html_id":"local-convexification","implicit":true,"enumerator":"2.6.3","key":"g350HfbiBN"},{"type":"paragraph","position":{"start":{"line":950,"column":1},"end":{"line":953,"column":1}},"children":[{"type":"text","value":"However, simply taking the second-order approximation of the cost\nfunction is insufficient, since for the LQR setup we required that the\n","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"svHDXBDDPD"},{"type":"inlineMath","value":"Q","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"html":"QQQ","key":"RcaFOuFq6M"},{"type":"text","value":" and ","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"iUEly4TE29"},{"type":"inlineMath","value":"R","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"html":"RRR","key":"np1AVzxXBV"},{"type":"text","value":" matrices were positive definite, i.e. 
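A minimal sketch of finite differencing in this setting: estimating the Jacobians of a black-box dynamics function with respect to the state and the action by perturbing one coordinate at a time. This uses the symmetric (central) difference rather than the one-sided limit in (2.46), and the step size `delta` is an arbitrary choice.

```python
import numpy as np

def jacobian_fd(f, x, delta=1e-5):
    """Finite-difference Jacobian: column j is (f(x + d e_j) - f(x - d e_j)) / (2 d)."""
    fx = f(x)
    J = np.zeros((fx.shape[0], x.shape[0]))
    for j in range(x.shape[0]):
        e = np.zeros_like(x)
        e[j] = delta
        J[:, j] = (f(x + e) - f(x - e)) / (2 * delta)
    return J

# Linearize hypothetical dynamics f(s, a) around (s_star, a_star):
#   A ~ gradient of f w.r.t. the state,  B ~ gradient of f w.r.t. the action.
def linearize(f, s_star, a_star):
    A = jacobian_fd(lambda s: f(s, a_star), s_star)
    B = jacobian_fd(lambda a: f(s_star, a), a_star)
    return A, B
```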
that all of their\neigenvalues were positive.","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"RTU6AKCksn"}],"key":"PZsPKbdyVg"},{"type":"paragraph","position":{"start":{"line":955,"column":1},"end":{"line":960,"column":1}},"children":[{"type":"text","value":"One way to naively ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"Jm6aXd7lAH"},{"type":"emphasis","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"children":[{"type":"text","value":"force","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"xKuYHGOXTI"}],"key":"m2etELYlsX"},{"type":"text","value":" some symmetric matrix ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"FbNSIOGqrB"},{"type":"inlineMath","value":"D","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"html":"DDD","key":"eGYMWbfDsB"},{"type":"text","value":" to be positive definite\nis to set any non-positive eigenvalues to some small positive value ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"j25qTuWQIZ"},{"type":"inlineMath","value":"\\varepsilon > 0","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"html":"ε>0\\varepsilon > 0ε>0","key":"LiFFZZLmPe"},{"type":"text","value":".\nRecall that any real symmetric matrix ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"Fd5bu4cgbW"},{"type":"inlineMath","value":"D \\in \\mathbb{R}^{n \\times n}","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"html":"DRn×nD \\in \\mathbb{R}^{n \\times n}DRn×n","key":"vJdmSXyUqw"},{"type":"text","value":" has an basis of eigenvectors ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"npwjFTEWWG"},{"type":"inlineMath","value":"u_1, \\dots, u_n","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"html":"u1,,unu_1, \\dots, u_nu1,,un","key":"qhgXYvOk3l"},{"type":"text","value":"\nwith corresponding eigenvalues ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"XQ7ETta69e"},{"type":"inlineMath","value":"\\lambda_1, \\dots, \\lambda_n","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"html":"λ1,,λn\\lambda_1, \\dots, \\lambda_nλ1,,λn","key":"ca2RYV5wA1"},{"type":"text","value":"\nsuch that ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"gPo98eWOvf"},{"type":"inlineMath","value":"D u_i = \\lambda_i u_i","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"html":"Dui=λiuiD u_i = \\lambda_i u_iDui=λiui","key":"bb1gTahcBY"},{"type":"text","value":".\nThen we can construct the positive definite approximation by","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"obUzCvpOn2"}],"key":"FDVE4LE6PE"},{"type":"math","value":"\\widetilde{D} = \\left( \\sum_{i=1, \\dots, n \\mid \\lambda_i > 0} \\lambda_i u_i u_i^\\top \\right) + \\varepsilon I.","position":{"start":{"line":962,"column":1},"end":{"line":964,"column":1}},"html":"D~=(i=1,,nλi>0λiuiui)+εI.\\widetilde{D} = \\left( \\sum_{i=1, \\dots, n \\mid \\lambda_i > 0} \\lambda_i u_i u_i^\\top \\right) + \\varepsilon 
I.D=i=1,,nλi>0λiuiui+εI.","enumerator":"2.47","key":"v37yxzWTuH"},{"type":"paragraph","position":{"start":{"line":968,"column":1},"end":{"line":969,"column":1}},"children":[{"type":"strong","position":{"start":{"line":968,"column":1},"end":{"line":968,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":968,"column":1},"end":{"line":968,"column":1}},"key":"Z1IaZYbGNo"}],"key":"nlZgDlGIZO"},{"type":"text","value":" Convince yourself that ","position":{"start":{"line":968,"column":1},"end":{"line":968,"column":1}},"key":"BjSFSiIpSO"},{"type":"inlineMath","value":"\\widetilde{D}","position":{"start":{"line":968,"column":1},"end":{"line":968,"column":1}},"html":"D~\\widetilde{D}D","key":"YLsMATTtGK"},{"type":"text","value":" is indeed positive\ndefinite.","position":{"start":{"line":968,"column":1},"end":{"line":968,"column":1}},"key":"O3Bo2WylGh"}],"key":"ZRHTWPm200"},{"type":"paragraph","position":{"start":{"line":971,"column":1},"end":{"line":977,"column":1}},"children":[{"type":"text","value":"Note that Hessian matrices are generally symmetric, so we can apply this\nprocess to ","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"key":"LqzPBz8XqS"},{"type":"inlineMath","value":"Q","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"html":"QQQ","key":"vsSU9gjJNX"},{"type":"text","value":" and ","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"key":"w1ZBD1Zvd1"},{"type":"inlineMath","value":"R","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"html":"RRR","key":"asywn6Etbp"},{"type":"text","value":" to obtain the positive definite approximations\n","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"key":"OVNwCHlOuB"},{"type":"inlineMath","value":"\\widetilde{Q}","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"html":"Q~\\widetilde{Q}Q","key":"PlMzWLiuxI"},{"type":"text","value":" and ","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"key":"NLRxmnkTPx"},{"type":"inlineMath","value":"\\widetilde{R}","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"html":"R~\\widetilde{R}R","key":"cLrRMmJw8N"},{"type":"text","value":".\nNow that we have an upward-curved\nquadratic approximation to the cost function, and a linear approximation\nto the state transitions, we can simply apply the time-homogenous LQR\nmethods from ","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"key":"VMcZDwdLlO"},{"type":"crossReference","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"children":[{"type":"text","value":"Section ","key":"JYlIr7ddz9"},{"type":"text","value":"2.4","key":"sxEyr1lHLY"}],"identifier":"optimal_lqr","label":"optimal_lqr","kind":"heading","template":"Section %s","enumerator":"2.4","resolved":true,"html_id":"optimal-lqr","key":"u4t5MXMxQW"},{"type":"text","value":".","position":{"start":{"line":971,"column":1},"end":{"line":971,"column":1}},"key":"EHpt5CjCl3"}],"key":"Zieih8Ucj7"},{"type":"paragraph","position":{"start":{"line":979,"column":1},"end":{"line":983,"column":1}},"children":[{"type":"text","value":"But what happens when we enter states far away from 
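A sketch of the local convexification step (2.47): keep only the positive-eigenvalue part of a symmetric Hessian and add εI so that the result is positive definite. The example matrix and the value of `eps` are arbitrary.

```python
import numpy as np

def make_positive_definite(D, eps=1e-6):
    """Positive definite approximation of a symmetric matrix, as in (2.47).

    Drops the non-positive eigendirections and adds eps * I.
    """
    eigvals, eigvecs = np.linalg.eigh(D)        # D is symmetric, so eigh applies
    clipped = np.where(eigvals > 0, eigvals, 0.0)
    return eigvecs @ np.diag(clipped) @ eigvecs.T + eps * np.eye(D.shape[0])

# Example: a symmetric matrix with one negative eigenvalue.
D = np.array([[2.0, 0.0],
              [0.0, -1.0]])
D_tilde = make_positive_definite(D)
print(np.linalg.eigvals(D_tilde))  # all strictly positive
```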
","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"jKIoXyQefL"},{"type":"inlineMath","value":"\\st^\\star","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"html":"x\\st^\\starx","key":"bubtkkAdYL"},{"type":"text","value":" or want\nto use actions far from ","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"mEEqyF5ie5"},{"type":"inlineMath","value":"\\act^\\star","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"html":"u\\act^\\staru","key":"CCJCUcUlQj"},{"type":"text","value":"? A Taylor approximation is only\naccurate in a ","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"bM4F8iP5QC"},{"type":"emphasis","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"children":[{"type":"text","value":"local","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"HrWquV1BhO"}],"key":"Uw9aiZ4u96"},{"type":"text","value":" region around the point of linearization, so the\nperformance of our LQR controller will degrade as we move further away.\nWe’ll see how to address this in the next section using the ","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"T5DtkJ7rYc"},{"type":"strong","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"children":[{"type":"text","value":"iterative LQR","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"yQQ75anKDt"}],"key":"F8i7JXPLyB"},{"type":"text","value":" algorithm.","position":{"start":{"line":979,"column":1},"end":{"line":979,"column":1}},"key":"T94cRaPhKp"}],"key":"nvXIHCyypF"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/log_taylor-41fd83609bdd9fa0d89b4a0510fdfb5a.png","alt":"Local linearization might only be accurate in a small region around the\npoint of linearization.","data":{"altTextIsAutoGenerated":true},"key":"NxNUETl8xD","urlSource":"shared/log_taylor.png","urlOptimized":"/build/log_taylor-41fd83609bdd9fa0d89b4a0510fdfb5a.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":988,"column":1},"end":{"line":989,"column":1}},"children":[{"type":"captionNumber","kind":"figure","label":"local_linearization","identifier":"local_linearization","html_id":"local-linearization","enumerator":"2.3","children":[{"type":"text","value":"Figure ","key":"rBxJmSwW9t"},{"type":"text","value":"2.3","key":"C5tPpM15hV"},{"type":"text","value":":","key":"paOvlasbUO"}],"template":"Figure %s:","key":"WvtUMsZYwE"},{"type":"text","value":"Local linearization might only be accurate in a small region around the\npoint of linearization.","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"xvNREjnCUo"}],"key":"OVDd135pvY"}],"key":"wV7GaVt6nx"}],"label":"local_linearization","identifier":"local_linearization","enumerator":"2.3","html_id":"local-linearization","key":"EJPXAOvBJ4"},{"type":"heading","depth":3,"position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"children":[{"type":"text","value":"Iterative LQR","position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"key":"iuC8QHMR37"}],"label":"iterative_lqr","identifier":"iterative_lqr","html_id":"iterative-lqr","enumerator":"2.6.4","key":"bopgxWjb5c"},{"type":"paragraph","position":{"start":{"line":995,"column":1},"end":{"line":999,"column":1}},"children":[{"type":"text","value":"To address these issues with local 
linearization, we’ll use an iterative\napproach, where we repeatedly linearize around different points to\ncreate a ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"y63r2qNK4o"},{"type":"emphasis","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"time-dependent","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"Lq23HAHmHu"}],"key":"YENSuU66E1"},{"type":"text","value":" approximation of the dynamics, and then solve\nthe resulting time-dependent LQR problem to obtain a better policy. This\nis known as ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"NvoJgVLOTd"},{"type":"strong","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"iterative LQR","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"wu3KJsF1Hj"}],"key":"OnTNtSgfLQ"},{"type":"text","value":" or ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"Wysx7TRAWm"},{"type":"strong","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"iLQR","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"GKna2IC0xN"}],"key":"GoN7spcL90"},{"type":"text","value":":","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"UckBmQrP5l"}],"key":"Dz96cskj86"},{"type":"proof","kind":"definition","label":"ilqr","identifier":"ilqr","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Iterative LQR","position":{"start":{"line":1001,"column":1},"end":{"line":1001,"column":1}},"key":"IcOycJ2MbD"}],"key":"RgRNGs1Sjr"},{"type":"paragraph","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"children":[{"type":"text","value":"For each iteration of the algorithm:","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"bAF5Ty7kpT"}],"key":"qpdI69OmEA"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":1006,"column":1},"end":{"line":1011,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":1006,"column":1},"end":{"line":1007,"column":1}},"children":[{"type":"text","value":"Form a time-dependent LQR problem around the current candidate\ntrajectory using local linearization.","position":{"start":{"line":1006,"column":1},"end":{"line":1006,"column":1}},"key":"r9rWis9wl2"}],"key":"vLWbytTHSb"},{"type":"listItem","spread":true,"position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"children":[{"type":"text","value":"Compute the optimal policy using ","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"key":"My5X8redIy"},{"type":"crossReference","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"children":[{"type":"text","value":"Section ","key":"fsMSM71ZXP"},{"type":"text","value":"2.5.1","key":"Y2nJ4FkKcp"}],"identifier":"time_dep_lqr","label":"time_dep_lqr","kind":"heading","template":"Section 
%s","enumerator":"2.5.1","resolved":true,"html_id":"time-dep-lqr","key":"XtP8XRIPMg"},{"type":"text","value":".","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"key":"nEBnc8Ck1R"}],"key":"nj5iynROM0"},{"type":"listItem","spread":true,"position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"children":[{"type":"text","value":"Generate a new series of actions using this policy.","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"Ur0fL8LOOf"}],"key":"bPYIInSh6A"},{"type":"listItem","spread":true,"position":{"start":{"line":1010,"column":1},"end":{"line":1011,"column":1}},"children":[{"type":"text","value":"Compute a better candidate trajectory by interpolating between the\ncurrent and proposed actions.","position":{"start":{"line":1010,"column":1},"end":{"line":1010,"column":1}},"key":"MSLnCUJJ2l"}],"key":"XiS8E9RLDt"}],"key":"zygHZ3NkOf"}],"enumerator":"2.9","html_id":"ilqr","key":"KEYXxNXJF0"},{"type":"paragraph","position":{"start":{"line":1014,"column":1},"end":{"line":1017,"column":1}},"children":[{"type":"text","value":"Now let’s go through the details of each step. We’ll use superscripts to\ndenote the iteration of the algorithm. We’ll also denote\n","position":{"start":{"line":1014,"column":1},"end":{"line":1014,"column":1}},"key":"ry6gbJQExS"},{"type":"inlineMath","value":"\\bar \\st_0 = \\E_{\\st_0 \\sim \\mu_0} [\\st_0]","position":{"start":{"line":1014,"column":1},"end":{"line":1014,"column":1}},"html":"xˉ0=Ex0μ0[x0]\\bar \\st_0 = \\E_{\\st_0 \\sim \\mu_0} [\\st_0]xˉ0=Ex0μ0[x0]","key":"k7l6mNAKZo"},{"type":"text","value":" as the expected initial\nstate.","position":{"start":{"line":1014,"column":1},"end":{"line":1014,"column":1}},"key":"mVhLj2abnZ"}],"key":"XIhpNUwzpa"},{"type":"paragraph","position":{"start":{"line":1019,"column":1},"end":{"line":1021,"column":1}},"children":[{"type":"text","value":"At iteration ","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"Ie4qrcpK5n"},{"type":"inlineMath","value":"i","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"html":"iii","key":"FAxDlafBNq"},{"type":"text","value":" of the algorithm, we begin with a ","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"wbOSMjWMqc"},{"type":"strong","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"children":[{"type":"text","value":"candidate","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"pMzq1bVamj"}],"key":"r7dNnH7qTE"},{"type":"text","value":"\ntrajectory\n","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"I6LSTqyx0r"},{"type":"inlineMath","value":"\\bar \\tau^i = (\\bar \\st^i_0, \\bar \\act^i_0, \\dots, \\bar \\st^i_{\\hor-1}, \\bar \\act^i_{\\hor-1})","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"html":"τˉi=(xˉ0i,uˉ0i,,xˉH1i,uˉH1i)\\bar \\tau^i = (\\bar \\st^i_0, \\bar \\act^i_0, \\dots, \\bar \\st^i_{\\hor-1}, \\bar \\act^i_{\\hor-1})τˉi=(xˉ0i,uˉ0i,,xˉH1i,uˉH1i)","key":"l3BewSnpft"},{"type":"text","value":".","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"key":"Zx4zd6hHV8"}],"key":"PQFDkqSytO"},{"type":"paragraph","position":{"start":{"line":1023,"column":1},"end":{"line":1026,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"children":[{"type":"text","value":"Step 1: Form a time-dependent 
LQR problem.","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"X5CQg609ZZ"}],"key":"yGnA67qw8Y"},{"type":"text","value":" At each timestep\n","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"D3zXXjs4tW"},{"type":"inlineMath","value":"\\hi \\in [\\hor]","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"html":"h[H]\\hi \\in [\\hor]h[H]","key":"ftFzBhYdfz"},{"type":"text","value":", we use the techniques from\n","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"YRM9Ubip1Y"},{"type":"crossReference","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"children":[{"type":"text","value":"Section ","key":"ZswHbbNYAg"},{"type":"text","value":"2.6","key":"qyhCDq9P40"}],"identifier":"approx_nonlinear","label":"approx_nonlinear","kind":"heading","template":"Section %s","enumerator":"2.6","resolved":true,"html_id":"approx-nonlinear","key":"FjDSl9q17j"},{"type":"text","value":" to linearize the dynamics and\nquadratize the cost function around ","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"vfZm4Swcjr"},{"type":"inlineMath","value":"(\\bar \\st^i_\\hi, \\bar \\act^i_\\hi)","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"html":"(xˉhi,uˉhi)(\\bar \\st^i_\\hi, \\bar \\act^i_\\hi)(xˉhi,uˉhi)","key":"uN3pYZ8dma"},{"type":"text","value":":","position":{"start":{"line":1023,"column":1},"end":{"line":1023,"column":1}},"key":"y2nxmORngw"}],"key":"ypQNzxNuSD"},{"type":"math","value":"\\begin{aligned}\n f_\\hi(\\st, \\act) & \\approx f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\nabla_{\\st } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\st - \\bar {\\st}^i_\\hi) + \\nabla_{\\act } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\act - \\bar {\\act}^i_\\hi) \\\\\n c_\\hi(\\st, \\act) & \\approx c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} \\begin{bmatrix}\n \\nabla_{\\st } c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\\\\\n \\nabla_{\\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix} \\\\\n & \\qquad + \\frac{1}{2} \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} \\begin{bmatrix}\n \\nabla_{\\st \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\st \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) \\\\\n \\nabla_{\\act \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\act \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix}\n \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi\\\\\n \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix}.\n\\end{aligned}","position":{"start":{"line":1029,"column":1},"end":{"line":1049,"column":1}},"html":"fh(x,u)f(xˉhi,uˉhi)+xf(xˉhi,uˉhi)(xxˉhi)+uf(xˉhi,uˉhi)(uuˉhi)ch(x,u)c(xˉhi,uˉhi)+[xxˉhiuuˉhi][xc(xˉhi,uˉhi)uc(xˉhi,uˉhi)]+12[xxˉhiuuˉhi][xxc(xˉhi,uˉhi)xuc(xˉhi,uˉhi)uxc(xˉhi,uˉhi)uuc(xˉhi,uˉhi)][xxˉhiuuˉhi].\\begin{aligned}\n f_\\hi(\\st, \\act) & \\approx f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\nabla_{\\st } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\st - \\bar {\\st}^i_\\hi) + \\nabla_{\\act } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\act - \\bar {\\act}^i_\\hi) \\\\\n c_\\hi(\\st, \\act) & \\approx c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} 
\\begin{bmatrix}\n \\nabla_{\\st } c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\\\\\n \\nabla_{\\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix} \\\\\n & \\qquad + \\frac{1}{2} \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} \\begin{bmatrix}\n \\nabla_{\\st \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\st \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) \\\\\n \\nabla_{\\act \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\act \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix}\n \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi\\\\\n \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix}.\n\\end{aligned}fh(x,u)ch(x,u)f(xˉhi,uˉhi)+xf(xˉhi,uˉhi)(xxˉhi)+uf(xˉhi,uˉhi)(uuˉhi)c(xˉhi,uˉhi)+[xxˉhiuuˉhi][xc(xˉhi,uˉhi)uc(xˉhi,uˉhi)]+21[xxˉhiuuˉhi][xxc(xˉhi,uˉhi)uxc(xˉhi,uˉhi)xuc(xˉhi,uˉhi)uuc(xˉhi,uˉhi)][xxˉhiuuˉhi].","enumerator":"2.48","key":"zDzxBfVBB8"},{"type":"paragraph","position":{"start":{"line":1053,"column":1},"end":{"line":1056,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"children":[{"type":"text","value":"Step 2: Compute the optimal policy.","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"key":"RlAjWGosyi"}],"key":"I1LQdKDpkl"},{"type":"text","value":" We can now solve the\ntime-dependent LQR problem using the Riccati equation from\n","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"key":"jBKEoqtuiT"},{"type":"crossReference","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"children":[{"type":"text","value":"Section ","key":"rsw7Gz786n"},{"type":"text","value":"2.5.1","key":"M8zFa1urZ5"}],"identifier":"time_dep_lqr","label":"time_dep_lqr","kind":"heading","template":"Section %s","enumerator":"2.5.1","resolved":true,"html_id":"time-dep-lqr","key":"kWKc87Kbwz"},{"type":"text","value":" to compute the optimal policy\n","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"key":"hENaUM4UAy"},{"type":"inlineMath","value":"\\pi^i_0, \\dots, \\pi^i_{\\hor-1}","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"html":"π0i,,πH1i\\pi^i_0, \\dots, \\pi^i_{\\hor-1}π0i,,πH1i","key":"fsCY1RewkU"},{"type":"text","value":".","position":{"start":{"line":1053,"column":1},"end":{"line":1053,"column":1}},"key":"RCRvfi05Vl"}],"key":"tcwyrkJdbn"},{"type":"paragraph","position":{"start":{"line":1058,"column":1},"end":{"line":1059,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1058,"column":1},"end":{"line":1058,"column":1}},"children":[{"type":"text","value":"Step 3: Generate a new series of actions.","position":{"start":{"line":1058,"column":1},"end":{"line":1058,"column":1}},"key":"jqlJNI1E9C"}],"key":"fNrDFWuW0u"},{"type":"text","value":" We can then generate a new\nsample trajectory by taking actions according to this optimal policy:","position":{"start":{"line":1058,"column":1},"end":{"line":1058,"column":1}},"key":"Dk5w9wJ1Vk"}],"key":"tpoeYtdbq4"},{"type":"math","value":"\\bar \\st^{i+1}_0 = \\bar \\st_0, \\qquad \\widetilde \\act_\\hi = \\pi^i_\\hi(\\bar \\st^{i+1}_\\hi), \\qquad \\bar \\st^{i+1}_{\\hi+1} = f(\\bar \\st^{i+1}_\\hi, \\widetilde \\act_\\hi).","position":{"start":{"line":1062,"column":1},"end":{"line":1064,"column":1}},"html":"xˉ0i+1=xˉ0,u~h=πhi(xˉhi+1),xˉh+1i+1=f(xˉhi+1,u~h).\\bar \\st^{i+1}_0 = \\bar \\st_0, \\qquad \\widetilde \\act_\\hi = 
\\pi^i_\\hi(\\bar \\st^{i+1}_\\hi), \\qquad \\bar \\st^{i+1}_{\\hi+1} = f(\\bar \\st^{i+1}_\\hi, \\widetilde \\act_\\hi).xˉ0i+1=xˉ0,uh=πhi(xˉhi+1),xˉh+1i+1=f(xˉhi+1,uh).","enumerator":"2.49","key":"JfAWZjYxM2"},{"type":"paragraph","position":{"start":{"line":1067,"column":1},"end":{"line":1068,"column":1}},"children":[{"type":"text","value":"Note that the states are sampled according to the ","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"cn0rLSFoHW"},{"type":"emphasis","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"children":[{"type":"text","value":"true","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"pxU3HKqyCh"}],"key":"VDyiyxgFgm"},{"type":"text","value":" dynamics, which\nwe assume we have query access to.","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"HBlslT9pgY"}],"key":"dOpbsy0rki"},{"type":"paragraph","position":{"start":{"line":1070,"column":1},"end":{"line":1077,"column":1}},"children":[{"type":"strong","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"children":[{"type":"text","value":"Step 4: Compute a better candidate trajectory.","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"vpVBn3bvBm"}],"key":"AMoqfbszGz"},{"type":"text","value":", Note that we’ve\ndenoted these actions as ","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"RdtGnOguXU"},{"type":"inlineMath","value":"\\widetilde \\act_\\hi","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"html":"u~h\\widetilde \\act_\\hiuh","key":"Ifiu1moozR"},{"type":"text","value":" and aren’t directly using\nthem for the next iteration ","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"ZciOUNJB1i"},{"type":"inlineMath","value":"\\bar \\act^{i+1}_\\hi","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"html":"uˉhi+1\\bar \\act^{i+1}_\\hiuˉhi+1","key":"UDE3PvFy72"},{"type":"text","value":". Rather, we want to\n","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"NKV7iiWyc3"},{"type":"emphasis","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"children":[{"type":"text","value":"interpolate","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"HC0kCfmqJq"}],"key":"oFEOE7n5bn"},{"type":"text","value":" between them and the actions from the previous iteration\n","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"V3AhGgaW3x"},{"type":"inlineMath","value":"\\bar \\act^i_0, \\dots, \\bar \\act^i_{\\hor-1}","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"html":"uˉ0i,,uˉH1i\\bar \\act^i_0, \\dots, \\bar \\act^i_{\\hor-1}uˉ0i,,uˉH1i","key":"hSbFrIMklK"},{"type":"text","value":". This is so that the cost\nwill ","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"hPn7DQ25Tv"},{"type":"emphasis","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"children":[{"type":"text","value":"increase monotonically,","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"jrL0ruSySx"}],"key":"Ct1gmTvTjX"},{"type":"text","value":" since if the new policy turns out to\nactually be worse, we can stay closer to the previous trajectory. 
(Can\nyou think of an intuitive example where this might happen?)","position":{"start":{"line":1070,"column":1},"end":{"line":1070,"column":1}},"key":"X7o7aaBQ4N"}],"key":"KYBGFkOaKM"},{"type":"paragraph","position":{"start":{"line":1079,"column":1},"end":{"line":1082,"column":1}},"children":[{"type":"text","value":"Formally, we want to find ","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"key":"Q2jRJPF1lc"},{"type":"inlineMath","value":"\\alpha \\in [0, 1]","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"html":"α[0,1]\\alpha \\in [0, 1]α[0,1]","key":"qCQKaPe5DC"},{"type":"text","value":" to generate the next\niteration of actions\n","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"key":"nyu1b6aq7o"},{"type":"inlineMath","value":"\\bar \\act^{i+1}_0, \\dots, \\bar \\act^{i+1}_{\\hor-1}","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"html":"uˉ0i+1,,uˉH1i+1\\bar \\act^{i+1}_0, \\dots, \\bar \\act^{i+1}_{\\hor-1}uˉ0i+1,,uˉH1i+1","key":"HVI0USxyGt"},{"type":"text","value":" such that the cost\nis minimized:","position":{"start":{"line":1079,"column":1},"end":{"line":1079,"column":1}},"key":"zdSyBD2ZFY"}],"key":"wrJ9mc6t3o"},{"type":"math","value":"\\begin{aligned}\n \\min_{\\alpha \\in [0, 1]} \\quad & \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n & \\bar \\act^{i+1}_\\hi = \\alpha \\bar \\act^i_\\hi + (1-\\alpha) \\widetilde \\act_\\hi \\\\\n & \\st_0 = \\bar \\st_0.\n\\end{aligned}","position":{"start":{"line":1084,"column":1},"end":{"line":1091,"column":1}},"html":"minα[0,1]h=0H1c(xh,uˉhi+1)wherexh+1=f(xh,uˉhi+1)uˉhi+1=αuˉhi+(1α)u~hx0=xˉ0.\\begin{aligned}\n \\min_{\\alpha \\in [0, 1]} \\quad & \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n & \\bar \\act^{i+1}_\\hi = \\alpha \\bar \\act^i_\\hi + (1-\\alpha) \\widetilde \\act_\\hi \\\\\n & \\st_0 = \\bar \\st_0.\n\\end{aligned}α[0,1]minwhereh=0H1c(xh,uˉhi+1)xh+1=f(xh,uˉhi+1)uˉhi+1=αuˉhi+(1α)uhx0=xˉ0.","enumerator":"2.50","key":"CJrKzqFZkh"},{"type":"paragraph","position":{"start":{"line":1093,"column":1},"end":{"line":1095,"column":1}},"children":[{"type":"text","value":"Note that this optimizes over the closed interval\n","position":{"start":{"line":1093,"column":1},"end":{"line":1093,"column":1}},"key":"qRP2YpKsbN"},{"type":"inlineMath","value":"[0, 1]","position":{"start":{"line":1093,"column":1},"end":{"line":1093,"column":1}},"html":"[0,1][0, 1][0,1]","key":"UkfmQfTrrj"},{"type":"text","value":", so by the Extreme Value Theorem, it’s guaranteed to have a\nglobal maximum.","position":{"start":{"line":1093,"column":1},"end":{"line":1093,"column":1}},"key":"I4UMS1poGs"}],"key":"rAydOxdWvK"},{"type":"paragraph","position":{"start":{"line":1097,"column":1},"end":{"line":1101,"column":1}},"children":[{"type":"text","value":"The final output of this algorithm is a policy ","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"key":"CJIKxLdOlw"},{"type":"inlineMath","value":"\\pi^{n_\\text{steps}}","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"html":"πnsteps\\pi^{n_\\text{steps}}πnsteps","key":"Wl0lisq6d4"},{"type":"text","value":"\nderived after 
","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"key":"XyxnOrZAQV"},{"type":"inlineMath","value":"n_\\text{steps}","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"html":"nstepsn_\\text{steps}nsteps","key":"hwHc8JJEgT"},{"type":"text","value":" of the algorithm. Though the proof is\nsomewhat complex, one can show that for many nonlinear control problems,\nthis solution converges to a locally optimal solution (in the policy\nspace).","position":{"start":{"line":1097,"column":1},"end":{"line":1097,"column":1}},"key":"GQ6aBRrkBo"}],"key":"VAk324wiMm"},{"type":"heading","depth":2,"position":{"start":{"line":1103,"column":1},"end":{"line":1103,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":1103,"column":1},"end":{"line":1103,"column":1}},"key":"THzK5htJV7"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"2.7","key":"sUAfms46dp"},{"type":"paragraph","position":{"start":{"line":1105,"column":1},"end":{"line":1112,"column":1}},"children":[{"type":"text","value":"This chapter introduced some approaches to solving different variants of\nthe optimal control problem\n","position":{"start":{"line":1105,"column":1},"end":{"line":1105,"column":1}},"key":"ish2Dm61Pe"},{"type":"crossReference","kind":"proof:definition","identifier":"optimal_control","label":"optimal_control","children":[{"type":"text","value":"Definition ","key":"YYw8YQ04Bi"},{"type":"text","value":"2.1","key":"BOkfwH15Ji"}],"template":"Definition %s","enumerator":"2.1","resolved":true,"html_id":"optimal-control","key":"mY8YKya9ti"},{"type":"text","value":". We began with the simple case of linear\ndynamics and an upward-curved quadratic cost. This model is called the\nLQR and we solved for the optimal policy using dynamic programming. We\nthen extended these results to the more general nonlinear case via local\nlinearization. We finally saw the iterative LQR algorithm for solving\nnonlinear control problems.","position":{"start":{"line":1105,"column":1},"end":{"line":1105,"column":1}},"key":"ycaco5zYU3"}],"key":"w06OGJJcds"}],"key":"ovZgA9L75q"}],"key":"bovAezcTGA"},"references":{"cite":{"order":[],"data":{}}},"footer":{"navigation":{"prev":{"title":"1 Markov Decision Processes","url":"/mdps","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"3 Multi-Armed Bandits","url":"/bandits","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/exploration.html b/exploration.html index b1445fe..62149a6 100644 --- a/exploration.html +++ b/exploration.html @@ -1,4 +1,4 @@ -9 Exploration in MDPs - CS/STAT 184: Introduction to Reinforcement Learning

9 Exploration in MDPs

9.1 Introduction

One of the key challenges of reinforcement learning is the exploration-exploitation tradeoff. Should we exploit actions we know will give high reward, or should we explore different actions to discover potentially better strategies? An algorithm that doesn’t explore effectively might easily overfit to certain areas of the state space and fail to generalize once it enters a region it hasn’t yet seen. The algorithms we saw in the chapter on fitted DP (5 Fitted Dynamic Programming Algorithms) suffer from this issue.

In 3 Multi-Armed Bandits, where the state never changes so all we care about are the actions, we saw algorithms like the upper confidence bound (UCB) algorithm (Section 3.6) and Thompson sampling (Section 3.7) that incentivize the learner to explore arms that it is uncertain about. In this chapter, we will see how to generalize these ideas to the MDP setting.

9.1.1 Sparse reward

Exploration is especially crucial in sparse reward problems where reward doesn’t come until after many steps, and algorithms which do not systematically explore new states may fail to learn anything meaningful (within a reasonable amount of time).

For example, policy gradient algorithms require the gradient to be nonzero in order to learn. If we never observe any reward, the gradient will always be zero, and the policy will never change or improve.
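
To make this concrete, here is a minimal sketch (not from the notes) of a REINFORCE-style gradient estimate \sum_h \nabla_\theta \log \pi_\theta(a_h \mid s_h) \cdot R(\tau): if the return R(\tau) is zero for every trajectory we collect, the estimate is exactly zero no matter what the individual gradient terms are.

```python
import numpy as np

def reinforce_gradient(grad_log_probs, total_return):
    """REINFORCE estimate: (sum of grad-log-probs along the trajectory) scaled by the return."""
    return total_return * np.sum(grad_log_probs, axis=0)

# Hypothetical sparse-reward trajectory: the rewarding state is never reached, so R(tau) = 0
# and the gradient estimate vanishes regardless of the gradient-of-log-probability terms.
rng = np.random.default_rng(0)
grad_log_probs = rng.normal(size=(10, 4))  # 10 steps, 4 policy parameters (illustrative)
print(reinforce_gradient(grad_log_probs, total_return=0.0))  # prints a vector of zeros
```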

9.1.2 Exploration in deterministic MDPs

Let us address the exploration problem in a deterministic MDP where taking action a in state s always leads to the state P(s, a) \in \mathcal{S}. In this simple setting, there will be no “automatic” exploration due to randomness, so our strategy must actively explore new states. One simple strategy is to visit every possible state-action pair to learn the entire MDP. Then, once the MDP is known, we can use DP to solve for the optimal policy. (This should remind you of the explore-then-commit algorithm from Section 3.4.)
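
Below is a minimal sketch of this explore-then-exploit strategy. It assumes query access to the true deterministic dynamics through a hypothetical step(s, a) -> (reward, next_state) function, and it ignores episode boundaries by simply re-walking known transitions to reach an unseen pair; the names and interface are illustrative, not from the notes.

```python
from collections import deque
from itertools import product

def explore_then_exploit(step, states, actions, s0, horizon):
    """Visit every (s, a) pair of a deterministic MDP, then plan by dynamic programming.

    `step(s, a) -> (reward, next_state)` is assumed query access to the true dynamics.
    Assumes every state is reachable from s0 via the transitions (otherwise this loops forever).
    """
    K = {}  # known transitions: (s, a) -> (reward, next_state)
    while len(K) < len(states) * len(actions):
        # Breadth-first search over *known* transitions for a state with an unseen action.
        parent, queue, seen, target = {s0: None}, deque([s0]), {s0}, None
        while queue and target is None:
            s = queue.popleft()
            for a in actions:
                if (s, a) not in K:
                    target = (s, a)
                    break
                _, s_next = K[(s, a)]
                if s_next not in seen:
                    seen.add(s_next)
                    parent[s_next] = (s, a)
                    queue.append(s_next)
        # Re-trace the path from s0 to the target state, then take the unseen action.
        path, s = [], target[0]
        while parent[s] is not None:
            s_prev, a_prev = parent[s]
            path.append((s_prev, a_prev))
            s = s_prev
        for s, a in reversed(path):
            step(s, a)                    # walk along already-known transitions
        K[target] = step(*target)         # observe the new (reward, next_state)

    # The MDP is now fully known: back up the optimal finite-horizon policy by DP.
    V = {s: 0.0 for s in states}
    policy = []
    for _ in range(horizon):
        Q = {(s, a): K[(s, a)][0] + V[K[(s, a)][1]] for s, a in product(states, actions)}
        policy.insert(0, {s: max(actions, key=lambda a, s=s: Q[(s, a)]) for s in states})
        V = {s: max(Q[(s, a)] for a in actions) for s in states}
    return policy
```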

9.2 Treating an unknown MDP as a MAB

We also explored the exploration-exploitation tradeoff in 3 Multi-Armed Bandits. Recall that in the MAB setting, we have K arms, each of which has an unknown reward distribution, and we want to learn which of the arms is optimal, i.e. has the highest mean reward.

One algorithm that struck a good balance between exploration and exploitation was the upper confidence bound (UCB) algorithm (Section 3.6): for each arm, we construct a confidence interval for its true mean reward, and then choose the arm with the highest upper confidence bound. In summary,

k_{t+1} \gets \arg\max_{k \in [K]} \frac{R^{k}_t}{N^{k}_t} + \sqrt{\frac{\ln(2t/\delta)}{2 N^{k}_t}}

where N_t^k indicates the number of times arm k has been pulled up until time t, R_t^k indicates the total reward obtained by pulling arm k up until time t, and \delta > 0 controls the width of the confidence interval. How might we extend UCB to the MDP case?
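
As a refresher, here is a small sketch of this arm-selection rule; the array-based interface is an illustrative assumption.

```python
import numpy as np

def ucb_choice(N, R, t, delta):
    """Choose the next arm: empirical mean plus a confidence-width bonus.

    N[k] = number of pulls of arm k so far, R[k] = total reward from arm k so far.
    Unpulled arms get an infinite bonus so that they are tried first.
    """
    N, R = np.asarray(N, float), np.asarray(R, float)
    with np.errstate(divide="ignore", invalid="ignore"):
        mean = np.where(N > 0, R / N, 0.0)
        width = np.where(N > 0, np.sqrt(np.log(2 * t / delta) / (2 * N)), np.inf)
    return int(np.argmax(mean + width))

print(ucb_choice(N=[4, 5, 1], R=[2.0, 3.5, 0.0], t=10, delta=0.05))  # picks the least-pulled arm, index 2
```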

Let us formally describe an unknown MDP as an MAB problem. In an unknown MDP, we want to learn which policy is optimal. So if we want to apply MAB techniques to solving an MDP, it makes sense to think of arms as policies. There are K = (|\mathcal{A}|^{|\mathcal{S}|})^\hor deterministic policies in a finite MDP. Then, “pulling” arm π corresponds to using π to act through a trajectory in the MDP, and observing the total reward.

Recall that UCB incurs regret \tilde{O}(\sqrt{TK}), where K is the number of arms. Treating each of the K = |\mathcal{A}|^{|\mathcal{S}| \hor} deterministic policies as an arm, this gives regret

\tilde{O}\left(\sqrt{T \cdot |\mathcal{A}|^{|\mathcal{S}| \hor}}\right) \qquad (9.4)

This scales exponentially in |\mathcal{S}| and \hor, which quickly becomes intractable. Notably, this method doesn’t consider the information that we gain across different policies. We can illustrate this with the following example:

9.3 UCB-VI

The approach above is inefficient: we shouldn’t need to consider all |\mathcal{A}|^{|\mathcal{S}| H} deterministic policies to achieve low regret. Rather, all we need to describe the optimal policy is Q^\star, which has H |\mathcal{S}||\mathcal{A}| entries to be learned. Can we borrow ideas from UCB to reduce the regret to this order (i.e. polynomial in |\mathcal{S}|, |\mathcal{A}|, and H)?

One way to frame the UCB algorithm is that, when choosing arms, we optimize over a proxy reward that is the sum of the estimated mean reward and an exploration term. In the UCB-VI algorithm, we will extend this idea to the case of an unknown MDP \mathcal{M}^{?} by modelling a proxy MDP \tilde{\mathcal{M}} with a reward function that encourages exploration. Then, we will use DP to solve for the optimal policy in \tilde{\mathcal{M}}.

Assumptions: For simplicity, here we assume the reward function of \mathcal{M}^{?} is known, so we only need to model the state transitions, though the rewards can be modelled similarly. We will also consider the more general case of a time-varying MDP, where the transition and reward functions can change over time. We take the convention that P_\hi is the distribution of s_{h+1} \mid s_{h}, a_{h} and r_\hi is applied to s_\hi, a_\hi.

At a high level, the UCB-VI algorithm can be described as follows:

  1. Modelling: Use previous data to model the transitions \hat{P}_0, \dots, \hat{P}_{H-1}.

  2. Reward bonus: Design a reward bonus b_\hi(s, a) \in \mathbb{R} to encourage exploration, analogous to the UCB term.

  3. Optimistic planning: Use DP to compute the optimal policy \hat \pi_\hi(s) in the modelled MDP

\tilde{\mathcal{M}} = (\mathcal{S}, \mathcal{A}, \{ \hat{P}_\hi \}_{h \in [H]}, \{ r_\hi + b_\hi \}_{h \in [H]}, H).
  4. Execution: Use \hat \pi_\hi(s) to collect a new trajectory, and repeat.

We detail each of these steps below. The full definition follows in (9.16).

9.3.1 Modelling the transitions

We seek to approximate P_\hi(s_{h+1} \mid s_\hi, a_\hi) = \frac{\pr(s_\hi, a_\hi, s_{h+1})}{\pr(s_\hi, a_\hi)}. We can estimate these using their sample probabilities from the dataset. That is, define

\begin{aligned} N_\hi^t(s, a, s') & := \sum_{i=0}^{t-1} \ind{ (s_\hi^i, a_\hi^i, s_{h+1}^i) = (s, a, s') } \\ N_\hi^t(s, a) & := \sum_{i=0}^{t-1} \ind{ (s_\hi^i, a_\hi^i) = (s, a) } \end{aligned}

Then we can model

\hat{P}_\hi^t(s' \mid s, a) = \frac{N_\hi^t(s, a, s')}{N_\hi^t(s, a)}.
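
For concreteness, here is a short sketch of these counts and estimates, assuming each trajectory is stored as a length-H list of integer-indexed (s, a, s') triples (a data layout chosen purely for illustration):

```python
import numpy as np

def estimate_transitions(trajectories, S, A, H):
    """Empirical transition model P_hat[h, s, a, s'] = N_h(s, a, s') / N_h(s, a).

    `trajectories` is a list of length-H lists of (s, a, s_next) index triples.
    """
    counts = np.zeros((H, S, A, S))                      # N_h^t(s, a, s')
    for traj in trajectories:
        for h, (s, a, s_next) in enumerate(traj):
            counts[h, s, a, s_next] += 1
    N_sa = counts.sum(axis=-1, keepdims=True)            # N_h^t(s, a)
    # Unvisited (h, s, a) default to uniform here; this is an arbitrary choice for the
    # sketch, and the exploration bonus below makes such pairs attractive regardless.
    P_hat = np.divide(counts, N_sa, out=np.full_like(counts, 1.0 / S), where=N_sa > 0)
    return P_hat, N_sa.squeeze(-1)
```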

9.3.2 Reward bonus

To motivate the reward bonus term b_\hi^t(s, a), recall how we designed the reward bonus term for UCB:

  1. We used Hoeffding’s inequality to bound, with high probability, how far the sample mean \hat \mu_t^k deviated from the true mean \mu^k.

  2. By inverting this inequality, we obtained a (1-\delta)-confidence interval for the true mean, centered at our estimate.

  3. To make this bound uniform across all timesteps t \in [T], we applied the union bound, dividing δ by a factor of T.

We’d like to do the same for UCB-VI, and construct the bonus term such that V^\star_\hi(s) \le \hat{V}_\hi^t(s) with high probability. However, our construction will be more complex than the MAB case, since \hat{V}_\hi^t(s) depends on the bonus b_\hi^t(s, a) implicitly via DP. We claim that the bonus term that gives the proper bound is

b_\hi^t(s, a) = 2 H \sqrt{\frac{\log( |\mathcal{S}||\mathcal{A}|H T/\delta )}{N_\hi^t(s, a)}}.

We will only provide a heuristic sketch of the proof; see Agarwal et al. (2022) (Section 7.3) for a full proof.
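
To make steps 2 and 3 concrete, here is a hedged sketch that combines the transition estimates from step 1, this reward bonus, and backward DP in the optimistic MDP \tilde{\mathcal{M}}. It assumes the tabular arrays from the sketch above and treats unvisited pairs as having one visit, a simplification for illustration rather than part of the algorithm's specification.

```python
import numpy as np

def ucbvi_plan(P_hat, r, N_sa, T, delta):
    """Optimistic planning: add the exploration bonus to the reward, then backward DP.

    P_hat: (H, S, A, S) estimated transitions, r: (H, S, A) known rewards,
    N_sa: (H, S, A) visit counts.  Returns the greedy policy pi[h, s].
    """
    H, S, A, _ = P_hat.shape
    bonus = 2 * H * np.sqrt(np.log(S * A * H * T / delta) / np.maximum(N_sa, 1))
    V = np.zeros(S)                           # value-to-go at step H
    pi = np.zeros((H, S), dtype=int)
    for h in reversed(range(H)):
        Q = r[h] + bonus[h] + P_hat[h] @ V    # (S, A) optimistic Bellman backup
        pi[h] = Q.argmax(axis=1)
        V = Q.max(axis=1)
    return pi
```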

Putting these pieces together, one can show that UCB-VI achieves expected regret \E[\text{Regret}_T] = \tilde{O}(\sqrt{H^4 |\mathcal{S}||\mathcal{A}| T}). Comparing this to the UCB regret bound \tilde{O}(\sqrt{T K}), where K is the number of arms of the MAB, we see that we’ve reduced the number of effective arms from |\mathcal{A}|^{|\mathcal{S}|\hor} (in (9.4)) to H^4 |\mathcal{S}||\mathcal{A}|, which is indeed polynomial in |\mathcal{S}|, |\mathcal{A}|, and H, as desired. This is also roughly the number of episodes it takes to achieve constant-order average regret:

\frac{1}{T} \E[\text{Regret}_T] = \tilde{O}\left(\sqrt{\frac{H^4 |\mathcal{S}||\mathcal{A}|}{T}}\right)

Note that the time-dependent transition matrix has H |\mathcal{S}|^2 |\mathcal{A}| entries. Assuming H \ll |\mathcal{S}|, this shows that it’s possible to achieve low regret, and achieve a near-optimal policy, while only understanding a 1/|\mathcal{S}| fraction of the world’s dynamics.

9.4 Linear MDPs

A polynomial dependency on |\mathcal{S}| and |\mathcal{A}| is manageable when the state and action spaces are small. But for large or continuous state and action spaces, even this polynomial factor will become intractable. Can we find algorithms that don’t depend on |\mathcal{S}| or |\mathcal{A}| at all, effectively reducing the dimensionality of the MDP? In this section, we’ll explore linear MDPs: an example of a parameterized MDP where the rewards and state transitions depend only on some parameter space of dimension d that is independent of |\mathcal{S}| or |\mathcal{A}|. Concretely, in a linear MDP we assume a known feature mapping \phi(s, a) \in \mathbb{R}^d such that the transitions and rewards are linear in the features: P_\hi(\cdot \mid s, a) = \mu_\hi^\star \phi(s, a) and r_\hi(s, a) = \phi(s, a)^\top \theta_\hi^\star for some unknown \mu_\hi^\star \in \mathbb{R}^{|\mathcal{S}| \times d} and \theta_\hi^\star \in \mathbb{R}^d.

9.4.1 Planning in a linear MDP

It turns out that Q^\star_\hi is also linear with respect to this feature mapping. We can prove this by simply computing it using DP. We initialize V_{H}^\star(s) = 0 for all s. Then we iterate:

\begin{aligned} Q^\star_\hi(s, a) & = r_\hi(s, a) + \E_{s' \sim P_\hi(\cdot \mid s, a)} [V^\star_{h+1}(s')] \\ & = \phi(s, a)^\top \theta_\hi^\star + (\mu_\hi^\star \phi(s, a))^\top V^\star_{h+1} \\ & = \phi(s, a)^\top \underbrace{( \theta_\hi^\star + (\mu_\hi^\star)^\top V^\star_{h+1})}_{w_\hi} \\ V^\star_\hi(s) & = \max_a Q^\star_\hi(s, a) \\ \pi^\star_\hi(s) & = \arg\max_a Q^\star_\hi(s, a) \end{aligned}
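
In code, this backward recursion only ever manipulates the d-dimensional vectors w_h. The sketch below assumes a small tabular problem where the feature map fits in an (S, A, d) array; the array names are illustrative, not from the notes.

```python
import numpy as np

def plan_linear_mdp(Phi, theta, mu, H):
    """Backward DP in a linear MDP: Q*_h(s, a) = phi(s, a)^T w_h with w_h = theta_h + mu_h^T V*_{h+1}.

    Phi: (S, A, d) features, theta: (H, d) reward parameters, mu: (H, S, d) transition parameters.
    """
    S, A, d = Phi.shape
    V = np.zeros(S)                        # V*_H = 0
    ws, policy = [], []
    for h in reversed(range(H)):
        w = theta[h] + mu[h].T @ V         # (d,)
        Q = Phi @ w                        # (S, A)
        ws.insert(0, w)
        policy.insert(0, Q.argmax(axis=1)) # pi*_h(s) = argmax_a Q*_h(s, a)
        V = Q.max(axis=1)
    return ws, policy
```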

9.4.2 UCB-VI in a linear MDP

9.4.2.1 Modelling the transitions

This linear assumption on the MDP will also allow us to model the unknown dynamics P^?_\hi(s' \mid s, a) with techniques from supervised learning (SL). Recall that SL is useful for estimating conditional expectations by minimizing mean squared error. We can rephrase the estimation of P^?_\hi(s' \mid s, a) as a least-squares problem as follows: write \delta_s to denote a one-hot vector in \mathbb{R}^{|\mathcal{S}|}, with a 1 in the s-th entry and 0 everywhere else. Note that

\E_{s' \sim P_h(\cdot \mid s, a)} [\delta_{s'}] = P_h(\cdot \mid s, a) = \mu_h^\star \phi(s, a).

Furthermore, since the expectation here is linear with respect to \phi(s, a), we can directly apply least-squares multi-target linear regression to construct the estimate

\hat \mu = \arg\min_{\mu \in \mathbb{R}^{|\mathcal{S}| \times d}} \sum_{i=0}^{t-1} \|\mu \phi(s_h^i, a_h^i) - \delta_{s_{h+1}^i} \|_2^2.

This has a well-known closed-form solution:

\begin{aligned} \hat \mu^\top & = (A_h^t)^{-1} \sum_{i=0}^{t-1} \phi(s_h^i, a_h^i) \delta_{s_{h+1}^i}^\top \\ \text{where} \quad A_h^t & = \sum_{i=0}^{t-1} \phi(s_h^i, a_h^i) \phi(s_h^i, a_h^i)^\top + \lambda I \end{aligned}

where we include a \lambda I term to ensure that the matrix A^t_h is invertible. (This can also be derived by adding a \lambda \|\mu\|_{\text{F}}^2 regularization term to the objective.) We can directly plug in this estimate into \hat{P}^t_h(\cdot \mid s, a) = \hat \mu^t_h \phi(s, a).
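
Here is a brief sketch of that regularized least-squares estimate, assuming the step-h data is collected as an array of feature vectors and an array of next-state indices (an assumed data layout, not from the notes):

```python
import numpy as np

def estimate_mu(phis, next_states, S, lam=1.0):
    """Ridge-regularized multi-target least squares: mu_hat^T = A^{-1} sum_i phi_i delta_i^T.

    phis: (t, d) array of phi(s_h^i, a_h^i); next_states: (t,) array of s_{h+1}^i indices.
    Returns mu_hat of shape (S, d) and the regularized matrix A of shape (d, d).
    """
    t, d = phis.shape
    A = phis.T @ phis + lam * np.eye(d)              # A_h^t, kept invertible by the ridge term
    targets = np.eye(S)[next_states]                 # (t, S) one-hot vectors delta_{s'}
    mu_hat = np.linalg.solve(A, phis.T @ targets).T  # (S, d)
    return mu_hat, A
```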

9.4.2.2 Reward bonus

Now, to design the reward bonus, we can’t apply Hoeffding’s inequality anymore, since the terms no longer involve sample means of bounded random variables; instead, we’re incorporating information across different states and actions. Rather, we can construct an upper bound using Chebyshev’s inequality in the same way we did for the LinUCB algorithm in the MAB setting (Section 3.8.1):

b^t_\hi(s, a) = \beta \sqrt{\phi(s, a)^\top (A^t_h)^{-1} \phi(s, a)}, \quad \beta = \tilde O(d \hor).

Note that this isn’t explicitly inversely proportional to N_h^t(s, a) as in the original UCB-VI bonus term (9.8). Rather, it is inversely proportional to the amount that the direction \phi(s, a) has been explored in the history. That is, if A_h^t has a large component in the direction \phi(s, a), implying that this direction is well explored, then the bonus term will be small, and vice versa.
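
A sketch of this bonus, reusing the matrix A_h^t from the estimate above; here β is left as a tunable constant standing in for the \tilde O(d \hor) choice in the text.

```python
import numpy as np

def elliptical_bonus(phi, A, beta):
    """Exploration bonus beta * sqrt(phi^T A^{-1} phi): large for poorly-explored feature directions."""
    return beta * float(np.sqrt(phi @ np.linalg.solve(A, phi)))

# Example usage with hypothetical inputs: a single feature vector phi of dimension d
# and the regularized matrix A returned by estimate_mu above.
# b = elliptical_bonus(phi, A, beta=10.0)
```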

We can now plug in these transition estimates and reward bonuses into the UCB-VI algorithm (9.16).

9.4.2.3 Performance

\ No newline at end of file diff --git a/exploration.json b/exploration.json index f0aa90c..03b0fff 100644 --- a/exploration.json +++ b/exploration.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"16601dd72e8b5e5b5a3530b6022d894d109f606501a1e0863d8a727655c4c852","slug":"exploration","location":"/exploration.md","dependencies":[],"frontmatter":{"title":"9 Exploration in MDPs","numbering":{"all":{"enabled":true},"enumerator":{"template":"9.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/sparse_reward_mdp-d4beda7e57ed42a0bbe96cfa6c5ecbbe.png","thumbnailOptimized":"/build/sparse_reward_mdp-d4beda7e57ed42a0bbe96cfa6c5ecbbe.webp","exports":[{"format":"md","filename":"exploration.md","url":"/build/exploration-81ded2f1b068acb6df548cb9ef312d11.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"cH1jkOw0WH"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"9.1","key":"bOzGH7REVR"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"One of the key challenges of reinforcement learning is the ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"cLszJ5Mbni"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"exploration-exploitation tradeoff","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"uIKruxy1vc"}],"key":"fDSPPWyY5z"},{"type":"text","value":". Should we ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"eUfUIboulU"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"exploit","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"VTAhZDrzpZ"}],"key":"HjabhB2wGM"},{"type":"text","value":" actions we know will give high reward, or should we ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"YQ41mVLhpo"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"explore","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"XK2No94NhR"}],"key":"SXeTqdeJHi"},{"type":"text","value":" different actions to discover potentially better strategies? 
An algorithm that doesn’t explore effectively might easily ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"g4Ws9oRlJH"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"overfit","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"aOUxGQpC9M"}],"key":"s20Taby0hA"},{"type":"text","value":" to certain areas of the state space, and fail to generalize once they enter a region they haven’t yet seen. The algorithms we saw in the chapter on fitted DP ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"JiP7aOQaMF"},{"type":"link","url":"/fitted-dp","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"5 Fitted Dynamic Programming Algorithms","key":"Ne4P5MnaPC"}],"urlSource":"./fitted_dp.md","dataUrl":"/fitted-dp.json","internal":true,"protocol":"file","key":"euOp5LQWwu"},{"type":"text","value":" suffer from this issue.","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"KtQrvy2FLT"}],"key":"MefQyL6x9c"},{"type":"paragraph","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"In ","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"VWz5BNKTFI"},{"type":"link","url":"/bandits","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"3 Multi-Armed Bandits","key":"vusuRI6Isf"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"XndvOh8BIS"},{"type":"text","value":", where the state never changes so all we care about are the actions, we saw algorithms like ","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"QU28URusya"},{"type":"crossReference","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"Section ","key":"M7snO3Frg0"},{"type":"text","value":"3.6","key":"zqL6HNtPW0"}],"identifier":"ucb","label":"ucb","kind":"heading","template":"Section %s","enumerator":"3.6","resolved":true,"html_id":"ucb","remote":true,"url":"/bandits","dataUrl":"/bandits.json","key":"x547R8FhLx"},{"type":"text","value":" and ","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"NsmHsd6nYh"},{"type":"crossReference","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"Thompson sampling","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"sLP1OMBR72"}],"identifier":"thompson_sampling","label":"thompson_sampling","kind":"heading","template":"Section %s","enumerator":"3.7","resolved":true,"html_id":"thompson-sampling","remote":true,"url":"/bandits","dataUrl":"/bandits.json","key":"yzRmW1biDH"},{"type":"text","value":" that incentivize the learner to explore arms that it is uncertain about. 
In this chapter, we will see how to generalize these ideas to the MDP setting.","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"GpVyXe41pQ"}],"key":"EZI0BNDMyD"},{"type":"proof","kind":"definition","label":"per_episode_regret","identifier":"per_episode_regret","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Per-episode regret","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"EUGUrWqOhl"}],"key":"OjRDfRWovk"},{"type":"paragraph","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"To quantify the performance of a learning algorithm, we will consider its per-episode regret over ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"iiBRSX6DbZ"},{"type":"inlineMath","value":"T","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"html":"TTT","key":"VXDi4Bd87O"},{"type":"text","value":" timesteps/episodes:","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"rmMKmh7MZ8"}],"key":"W6S8Taknqd"},{"type":"math","value":"\\text{Regret}_T = \\E\\left[ \\sum_{t=0}^{T-1} V^\\star_0(s_0) - V^{\\pi^t}_0(s_0) \\right]","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"html":"RegretT=E[t=0T1V0(s0)V0πt(s0)]\\text{Regret}_T = \\E\\left[ \\sum_{t=0}^{T-1} V^\\star_0(s_0) - V^{\\pi^t}_0(s_0) \\right]RegretT=E[t=0T1V0(s0)V0πt(s0)]","enumerator":"9.1","key":"v992lSJm0Y"},{"type":"paragraph","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"XxqAGsRqYL"},{"type":"inlineMath","value":"\\pi^t","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"html":"πt\\pi^tπt","key":"YtWyTtai5D"},{"type":"text","value":" is the policy generated by the algorithm at the ","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"syrlbo3PZf"},{"type":"inlineMath","value":"t","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"html":"ttt","key":"Ux3gJiXN7h"},{"type":"text","value":"th iteration.","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"zeKcRVr7hl"}],"key":"dhO7YeqylQ"}],"enumerator":"9.1","html_id":"per-episode-regret","key":"OIwdLzReu6"},{"type":"heading","depth":3,"position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"Sparse reward","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"MCKQK7ivFh"}],"identifier":"sparse-reward","label":"Sparse reward","html_id":"sparse-reward","implicit":true,"enumerator":"9.1.1","key":"Spqkj1975M"},{"type":"paragraph","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"children":[{"type":"text","value":"Exploration is especially crucial in ","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"JtUPZFlf2C"},{"type":"strong","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"children":[{"type":"text","value":"sparse reward","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"LubaXLsR2v"}],"key":"Zs70qqqS8h"},{"type":"text","value":" problems where reward doesn’t come until after many steps, and algorithms which do not 
","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"aO6Pl7jCIH"},{"type":"emphasis","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"children":[{"type":"text","value":"systematically","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"Tj675xvnX2"}],"key":"gSm3TdN1J5"},{"type":"text","value":" explore new states may fail to learn anything meaningful (within a reasonable amount of time).","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"qvKrsNR8nj"}],"key":"BzLwCIZvAd"},{"type":"paragraph","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"children":[{"type":"text","value":"For example, policy gradient algorithms require the gradient to be nonzero in order to learn. If we never observe any reward, the gradient will always be zero, and the policy will never change or improve.","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"PJ1XrBKOkd"}],"key":"vjuZlXkhsM"},{"type":"proof","kind":"example","label":"sparse_reward_mdp","identifier":"sparse_reward_mdp","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Sparse Reward MDP","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"GM0qKJbkjQ"}],"key":"kJf48KypRv"},{"type":"paragraph","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"children":[{"type":"text","value":"Here’s a simple example of an MDP with sparse reward:","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"key":"bZzS9Bjr0g"}],"key":"lph0m5omIE"},{"type":"image","url":"/build/sparse_reward_mdp-d4beda7e57ed42a0bbe96cfa6c5ecbbe.png","alt":"image","position":{"start":{"line":45,"column":1},"end":{"line":45,"column":1}},"key":"PV7HLfv84o","urlSource":"shared/sparse_reward_mdp.png","urlOptimized":"/build/sparse_reward_mdp-d4beda7e57ed42a0bbe96cfa6c5ecbbe.webp"},{"type":"paragraph","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"children":[{"type":"text","value":"There are ","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"DCsnoh8DY4"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"html":"S|\\mathcal{S}|S","key":"PVI3j3JygS"},{"type":"text","value":" states. The agent starts in the leftmost state. In every state, there are three possible actions, two of which move the agent left and one which moves the agent right. 
The reward function assigns ","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"K9arWGyqQF"},{"type":"inlineMath","value":"r=1","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"html":"r=1r=1r=1","key":"nmR3HhRPcq"},{"type":"text","value":" to the rightmost cell.","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"N3lD8MEj3S"}],"key":"xt4J7X49w6"}],"enumerator":"9.1","html_id":"sparse-reward-mdp","key":"FJfDsg0mRz"},{"type":"heading","depth":3,"position":{"start":{"line":50,"column":1},"end":{"line":50,"column":1}},"children":[{"type":"text","value":"Exploration in deterministic MDPs","position":{"start":{"line":50,"column":1},"end":{"line":50,"column":1}},"key":"yvlqJLKcjS"}],"identifier":"exploration-in-deterministic-mdps","label":"Exploration in deterministic MDPs","html_id":"exploration-in-deterministic-mdps","implicit":true,"enumerator":"9.1.2","key":"R1MdG4pIB2"},{"type":"paragraph","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"children":[{"type":"text","value":"Let us address the exploration problem in a ","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"UeBoDhmIcE"},{"type":"emphasis","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"ZsDxnLqwn1"}],"key":"jCCp6NDkVe"},{"type":"text","value":" MDP where taking action ","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"N2sOeKHL9r"},{"type":"inlineMath","value":"a","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"html":"aaa","key":"KXSFCCIhc1"},{"type":"text","value":" in state ","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"K9O8MCn7Ex"},{"type":"inlineMath","value":"s","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"html":"sss","key":"Ruu2859WVO"},{"type":"text","value":" always leads to the state ","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"VkaLfPJIHD"},{"type":"inlineMath","value":"P(s, a) \\in \\mathcal{S}","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"html":"P(s,a)SP(s, a) \\in \\mathcal{S}P(s,a)S","key":"T5i3itoEAJ"},{"type":"text","value":". In this simple setting, there will be no “automatic” exploration due to randomness, so our strategy must actively explore new states. One simple strategy is to visit every possible state-action pair to learn the entire MDP. Then, once the MDP is known, we can use DP to solve for the optimal policy. 
(This should remind you of the ","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"bGa7SPVMGK"},{"type":"crossReference","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"children":[{"type":"text","value":"Section ","key":"iGjiynqupo"},{"type":"text","value":"3.4","key":"JX4dht2tgt"}],"identifier":"etc","label":"etc","kind":"heading","template":"Section %s","enumerator":"3.4","resolved":true,"html_id":"etc","remote":true,"url":"/bandits","dataUrl":"/bandits.json","key":"uftMdDHLw6"},{"type":"text","value":" algorithm.)","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"A7gtasH81f"}],"key":"YkGb5k4MEh"},{"type":"proof","kind":"definition","label":"explore_then_exploit","identifier":"explore_then_exploit","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Explore-then-exploit (for deterministic MDPs)","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"UheKXx5lgo"}],"key":"D8cd4rGc5s"},{"type":"paragraph","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"text","value":"We’ll keep a set ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"qEQ6C3LjVW"},{"type":"inlineMath","value":"K","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"KKK","key":"BOTClSe5fz"},{"type":"text","value":" of all the ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"F5bjsU4rrG"},{"type":"inlineMath","value":"(s, a, r, s')","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"(s,a,r,s)(s, a, r, s')(s,a,r,s)","key":"NPVPyDqkk6"},{"type":"text","value":" pairs we’ve observed. Each episode, we’ll choose an unseen state-action pair for which the reward and the next state are unknown, and take the shortest path there. We assume that every state can be reached from the initial state within a single episode.","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"NjeKgNugED"}],"key":"hr4OKNg9xk"},{"type":"comment","value":" :::{algorithmic}\n$K \\gets \\emptyset$ Using our known transitions $K$, compute the shortest path $\\tilde \\pi$ to $(s, a)$ Execute $\\tilde \\pi$ to visit $(s, a)$ and observe $r = r(s, a), s' = P(s, a)$ $K \\gets K \\cup \\{ (s, a, r, s') \\}$ Compute the optimal policy $\\pi^\\star$ in the MDP $K$ (e.g. using policy iteration). $\\pi^\\star$.\n::: ","key":"l9znnoPezy"},{"type":"paragraph","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"The shortest path computation can be implemented using DP. 
We leave this as an exercise.","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"YFMpEBgckm"}],"key":"tju5SMM193"}],"enumerator":"9.2","html_id":"explore-then-exploit","key":"uQFNNds3zU"},{"type":"proof","kind":"theorem","label":"explore_then_exploit_performance","identifier":"explore_then_exploit_performance","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Performance of explore-then-exploit","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"key":"WBOCt0P4kI"}],"key":"dncpXlDjut"},{"type":"paragraph","position":{"start":{"line":69,"column":1},"end":{"line":70,"column":1}},"children":[{"type":"text","value":"As long as every state can be reached from ","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"tTm8DwfJp4"},{"type":"inlineMath","value":"s_0","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"html":"s0s_0s0","key":"Ug4oDKu7ne"},{"type":"text","value":" within a single episode, i.e. ","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"NneVMPDX8Q"},{"type":"inlineMath","value":"|\\mathcal{S}| \\le \\hor","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"html":"SH|\\mathcal{S}| \\le \\horSH","key":"XejHUUsYVr"},{"type":"text","value":", this will eventually be able to explore all ","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"ds9X5jIgL5"},{"type":"inlineMath","value":"|\\mathcal{S}| |\\mathcal{A}|","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"html":"SA|\\mathcal{S}| |\\mathcal{A}|S∣∣A","key":"PoVR9I1npp"},{"type":"text","value":" state-action pairs, adding one new transition per episode. 
We know it will take at most ","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"We9YnjYJxx"},{"type":"inlineMath","value":"|\\mathcal{S}| |\\mathcal{A}|","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"html":"SA|\\mathcal{S}| |\\mathcal{A}|S∣∣A","key":"bmJmNzzPht"},{"type":"text","value":" iterations to explore the entire MDP, after which ","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"ibFYoMXo38"},{"type":"inlineMath","value":"\\pi^t = \\pi^\\star","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"html":"πt=π\\pi^t = \\pi^\\starπt=π","key":"qrqnQCmQEA"},{"type":"text","value":", incurring no additional regret.\nFor each ","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"JH0w6EMH0A"},{"type":"inlineMath","value":"\\pi^t","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"html":"πt\\pi^tπt","key":"osEM56Um7o"},{"type":"text","value":" up until then, corresponding to the shortest-path policies ","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"c18LkkqQcV"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"html":"π~\\tilde \\piπ~","key":"sJyxivklO5"},{"type":"text","value":", the value of policy ","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"xoACRhxwYG"},{"type":"inlineMath","value":"\\pi^t","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"html":"πt\\pi^tπt","key":"WX6HvokvMq"},{"type":"text","value":" will differ from that of ","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"tHuh4PeU3Y"},{"type":"inlineMath","value":"\\pi^\\star","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"html":"π\\pi^\\starπ","key":"RHjdd1U4fo"},{"type":"text","value":" by at most ","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"UOvvqPsWoY"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"html":"H\\horH","key":"TMawbmXr7T"},{"type":"text","value":", since the policies will differ by at most ","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"eqISqfBja3"},{"type":"text","value":"1","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"QMgE1dtcn8"},{"type":"text","value":" reward at each timestep. 
So,","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"key":"yjDw3ZqR9r"}],"key":"v1wGLSeB8H"},{"type":"math","value":"\\sum_{t=0}^{T-1} V^\\star_0 - V_0^{\\pi^t} \\le |\\mathcal{S}||\\mathcal{A}| \\hor.","position":{"start":{"line":72,"column":1},"end":{"line":72,"column":1}},"html":"t=0T1V0V0πtSAH.\\sum_{t=0}^{T-1} V^\\star_0 - V_0^{\\pi^t} \\le |\\mathcal{S}||\\mathcal{A}| \\hor.t=0T1V0V0πtS∣∣AH.","enumerator":"9.2","key":"iiCS9qDECY"},{"type":"paragraph","position":{"start":{"line":74,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"text","value":"(Note that this MDP and algorithm are deterministic, so the regret is not random.)","position":{"start":{"line":74,"column":1},"end":{"line":74,"column":1}},"key":"j7oPvNmOJQ"}],"key":"kWFMRmcqUp"}],"enumerator":"9.1","html_id":"explore-then-exploit-performance","key":"gfCiVjLBsZ"},{"type":"heading","depth":2,"position":{"start":{"line":78,"column":1},"end":{"line":78,"column":1}},"children":[{"type":"text","value":"Treating an unknown MDP as a MAB","position":{"start":{"line":78,"column":1},"end":{"line":78,"column":1}},"key":"sEt86lAIh2"}],"label":"mdp_mab","identifier":"mdp_mab","html_id":"mdp-mab","enumerator":"9.2","key":"Ba8cVjM0ij"},{"type":"paragraph","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"We also explored the exploration-exploitation tradeoff in ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"wSbBUcnkXa"},{"type":"link","url":"/bandits","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"3 Multi-Armed Bandits","key":"HAiR0cIXEn"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"IkvCrmzo3H"},{"type":"text","value":". Recall tthat in the MAB setting, we have ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"KptkHhdSSe"},{"type":"inlineMath","value":"K","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"html":"KKK","key":"uG9BAMXrwC"},{"type":"text","value":" arms, each of which has an unknown reward distribution, and we want to learn which of the arms is ","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"DgqgrcuGzd"},{"type":"emphasis","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"optimal","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"utkPW4Uzoq"}],"key":"hQr589LpBa"},{"type":"text","value":", i.e. 
has the highest mean reward.","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"iKi9jKwCJ6"}],"key":"V0xNS9zYzK"},{"type":"paragraph","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"text","value":"One algorithm that struck a good balance between exploration and exploitation was the ","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"G2LeNtFLHr"},{"type":"strong","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"text","value":"upper confidence bound","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"Qzy2CeEOJM"}],"key":"ZW1QvhFH1I"},{"type":"text","value":" algorithm ","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"CY0NMMFp94"},{"type":"crossReference","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"text","value":"Section ","key":"tYRPwHUhQ7"},{"type":"text","value":"3.6","key":"Fw0gp6qoPv"}],"identifier":"ucb","label":"ucb","kind":"heading","template":"Section %s","enumerator":"3.6","resolved":true,"html_id":"ucb","remote":true,"url":"/bandits","dataUrl":"/bandits.json","key":"Z3ThQQ162p"},{"type":"text","value":": For each arm, we construct a ","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"EK0Y5M997y"},{"type":"emphasis","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"text","value":"confidence interval","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"JlDHufJqVt"}],"key":"ovOfACOES9"},{"type":"text","value":" for its true mean reward, and then choose the arm with the highest upper confidence bound. 
In summary,","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"s9q9HQrwaO"}],"key":"rKbPknrxV6"},{"type":"math","value":"k_{t+1} \\gets \\arg\\max_{k \\in [K]} \\frac{R^{k}_t}{N^{k}_t} + \\sqrt{\\frac{\\ln(2t/\\delta)}{2 N^{k}_t}}","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"html":"kt+1argmaxk[K]RtkNtk+ln(2t/δ)2Ntkk_{t+1} \\gets \\arg\\max_{k \\in [K]} \\frac{R^{k}_t}{N^{k}_t} + \\sqrt{\\frac{\\ln(2t/\\delta)}{2 N^{k}_t}}kt+1argk[K]maxNtkRtk+2Ntkln(2t/δ)","enumerator":"9.3","key":"a1qqjC0nsv"},{"type":"paragraph","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"raSOyEf3eM"},{"type":"inlineMath","value":"N_t^k","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"html":"NtkN_t^kNtk","key":"rhAeVBDNFG"},{"type":"text","value":" indicates the number of times arm ","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"K7OkqOdAOq"},{"type":"inlineMath","value":"k","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"html":"kkk","key":"nQTKkV10Ht"},{"type":"text","value":" has been pulled up until time ","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"qR8H1XR267"},{"type":"inlineMath","value":"t","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"html":"ttt","key":"hxAMt6I9W5"},{"type":"text","value":", ","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"unbUgQe7qR"},{"type":"inlineMath","value":"R_t^k","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"html":"RtkR_t^kRtk","key":"UBFsNGgSxp"},{"type":"text","value":" indicates the total reward obtained by pulling arm ","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"leeVa8CA83"},{"type":"inlineMath","value":"k","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"html":"kkk","key":"jqjaBNpquc"},{"type":"text","value":" up until time ","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"gxyf8OrNqM"},{"type":"inlineMath","value":"t","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"html":"ttt","key":"UUqLPFhdIQ"},{"type":"text","value":", and ","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"vPjRuaL8Rr"},{"type":"inlineMath","value":"\\delta > 0","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"html":"δ>0\\delta > 0δ>0","key":"w9BUwd7RWK"},{"type":"text","value":" controls the width of the confidence interval. How might we extend UCB to the MDP case?","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"sG80D9DNcD"}],"key":"qgs5Rs4y0n"},{"type":"paragraph","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"text","value":"Let us formally describe an unknown MDP as an MAB problem. In an unknown MDP, we want to learn which ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"sPpyPALfMX"},{"type":"emphasis","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"text","value":"policy","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"eyxktNBbFG"}],"key":"J6yoOMM2Le"},{"type":"text","value":" is optimal. 
So if we want to apply MAB techniques to solving an MDP, it makes sense to think of ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"zL3myIEw7V"},{"type":"emphasis","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"text","value":"arms","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"VnEJT9mvhD"}],"key":"EPWpo47IOa"},{"type":"text","value":" as ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"Ka7WJjZXO1"},{"type":"emphasis","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"text","value":"policies","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"kvwFjXvQUf"}],"key":"fVnkVT1CJE"},{"type":"text","value":". There are ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"nSxFyOGVrk"},{"type":"inlineMath","value":"K = (|\\mathcal{A}|^{|\\mathcal{S}|})^\\hor","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"K=(AS)HK = (|\\mathcal{A}|^{|\\mathcal{S}|})^\\horK=(AS)H","key":"lETw0bGX9p"},{"type":"text","value":" deterministic policies in a finite MDP. Then, “pulling” arm ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"DhcrPGe7rl"},{"type":"text","value":"π","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"EoQt6mvBOZ"},{"type":"text","value":" corresponds to using ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"CGXciGtK2T"},{"type":"text","value":"π","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"JEEHC0qyc9"},{"type":"text","value":" to act through a trajectory in the MDP, and observing the total reward.","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"YXraGXxzz6"}],"key":"zdTum7Qu7V"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"PPRR9IZn9V"}],"key":"prlrwaR5VR"},{"type":"paragraph","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"children":[{"type":"text","value":"Which quantity that we have seen so far equals the mean reward from arm ","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"oek8t3Lx94"},{"type":"text","value":"π","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"sL8naLYVsX"},{"type":"text","value":"?","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"iQ1aKAUB58"}],"key":"Nrew7mGL7m"}],"key":"RP0aKmg2ZD"},{"type":"paragraph","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"Recall that UCB incurs regret ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"KyRAzzIFwz"},{"type":"inlineMath","value":"\\tilde{O}(\\sqrt{TK})","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"O~(TK)\\tilde{O}(\\sqrt{TK})O~(TK)","key":"guidKPYCQk"},{"type":"text","value":", where ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"oFbQQ4lzds"},{"type":"inlineMath","value":"T","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"TTT","key":"TybQ152iL7"},{"type":"text","value":" is the number of pulls and 
","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"E7wtlfkTKM"},{"type":"inlineMath","value":"K","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"KKK","key":"miFBFGBS5J"},{"type":"text","value":" is the number of arms. So in the MDP-as-MAB problem, using UCB for ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"vBemf1fU7w"},{"type":"inlineMath","value":"T","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"TTT","key":"mELteOejYn"},{"type":"text","value":" episodes would achieve regret","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"LsYhCImUHe"}],"key":"ZlmHcVpJUj"},{"type":"math","value":"\\tilde{O}(\\sqrt{|\\mathcal{A}|^{|\\mathcal{S}|\\hor} T})","label":"mdp_as_mab","identifier":"mdp_as_mab","html":"O~(ASHT)\\tilde{O}(\\sqrt{|\\mathcal{A}|^{|\\mathcal{S}|\\hor} T})O~(ASHT)","enumerator":"9.4","html_id":"mdp-as-mab","key":"h6YkJJt9VT"},{"type":"paragraph","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"This scales ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"WuTN5rT0hK"},{"type":"emphasis","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"exponentially","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"D6e0VrTMMt"}],"key":"wIxAZyk8vI"},{"type":"text","value":" in ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"yQgpleT6ye"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"html":"S|\\mathcal{S}|S","key":"Vq5qF4ilaP"},{"type":"text","value":" and ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"PoLi19SzVG"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"html":"H\\horH","key":"Ryw78rPV8P"},{"type":"text","value":", which quickly becomes intractable. Notably, this method doesn’t consider the information that we gain across different policies. We can illustrate this with the following example:","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"btC3RD90Ny"}],"key":"J4LQgiAakV"},{"type":"proof","kind":"example","label":"ineffective_mdp","identifier":"ineffective_mdp","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Treating an MDP as a MAB","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"LKk0dPenpU"}],"key":"vuLjPxydBg"},{"type":"paragraph","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"text","value":"Consider a “coin MDP” with two states “heads” and “tails”, two actions “Y” and “N”, and a time horizon of ","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"R6LmIWKg2j"},{"type":"inlineMath","value":"\\hor=2","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"html":"H=2\\hor=2H=2","key":"EdkLk8pAj2"},{"type":"text","value":". The state transition flips the coin, and doesn’t depend on the action. 
The reward only depends on the action: Taking action Y gives reward ","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"hDmhEHIttR"},{"type":"text","value":"1","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"qJRUtZmc3k"},{"type":"text","value":", and taking action N gives reward ","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"NJP403nV6O"},{"type":"text","value":"0","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"c96QcNxCn5"},{"type":"text","value":".","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"gLD5hyxM8F"}],"key":"oY1OTzPTXg"},{"type":"paragraph","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"Suppose we collect data from the two constant policies ","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"HEe4Bu8W42"},{"type":"inlineMath","value":"\\pi_{\\text{Y}}(s) = \\text{Y}","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"html":"πY(s)=Y\\pi_{\\text{Y}}(s) = \\text{Y}πY(s)=Y","key":"BvHPpRDTpG"},{"type":"text","value":" and ","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"Mh7cLPwJYI"},{"type":"inlineMath","value":"\\pi_{\\text{N}}(s) = \\text{N}","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"html":"πN(s)=N\\pi_{\\text{N}}(s) = \\text{N}πN(s)=N","key":"f71a6QpSKh"},{"type":"text","value":". Now we want to learn about the policy ","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"a3HSM28unL"},{"type":"inlineMath","value":"\\tilde{\\pi}","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"html":"π~\\tilde{\\pi}π~","key":"oy3qkGkJ9W"},{"type":"text","value":" that takes action Y and then N. Do we need to collect data from ","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"l3xXVAhMNW"},{"type":"inlineMath","value":"\\tilde{\\pi}","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"html":"π~\\tilde{\\pi}π~","key":"zIQe6fihUW"},{"type":"text","value":" to evaluate it? No: Since the reward only depends on the action, we can infer its value from our data on the policies ","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"Lgt5PIhn8r"},{"type":"inlineMath","value":"\\pi_{\\text{Y}}","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"html":"πY\\pi_{\\text{Y}}πY","key":"J1sJo6BJfU"},{"type":"text","value":" and ","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"sSCdOeMXMm"},{"type":"inlineMath","value":"\\pi_{\\text{N}}","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"html":"πN\\pi_{\\text{N}}πN","key":"FyfUad64oN"},{"type":"text","value":". 
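As a quick numerical sanity check (the logged data below is made up, with a little noise added so that the averaging is not entirely trivial):

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical logged data from the two constant policies in the coin MDP:
# H = 2 steps per episode; action Y pays 1 and action N pays 0, plus noise.
def run_constant_policy(action, num_episodes=500, horizon=2):
    base = {"Y": 1.0, "N": 0.0}[action]
    return rng.normal(loc=base, scale=0.05, size=(num_episodes, horizon))

data_Y = run_constant_policy("Y")
data_N = run_constant_policy("N")

# Estimate the mean reward of each *action* from the logged trajectories.
r_hat = {"Y": data_Y.mean(), "N": data_N.mean()}

# The alternating policy (Y at step 0, N at step 1) can be evaluated directly,
# since the reward depends only on the action taken:
value_alternating = r_hat["Y"] + r_hat["N"]
print(round(value_alternating, 3))  # close to 1.0, with no new rollouts
```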
However, if we treat the MDP as a bandit in which ","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"XLrnANmpfj"},{"type":"inlineMath","value":"\\tilde{\\pi}","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"html":"π~\\tilde{\\pi}π~","key":"SbiDBpWJAh"},{"type":"text","value":" is a new, unknown arm, we ignore the known correlation between the action and the reward.","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"dh1oUSgWff"}],"key":"MHvlTbejzD"}],"enumerator":"9.2","html_id":"ineffective-mdp","key":"x7KiLM6ECZ"},{"type":"heading","depth":2,"position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"children":[{"type":"text","value":"UCB-VI","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"f9wNcCyyre"}],"identifier":"ucb-vi","label":"UCB-VI","html_id":"ucb-vi","implicit":true,"enumerator":"9.3","key":"hyHdNAGzuC"},{"type":"paragraph","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"children":[{"type":"text","value":"The approach above is inefficient: We shouldn’t need to consider all ","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"dSt6Jjik0E"},{"type":"inlineMath","value":"|\\mathcal{A}|^{|\\mathcal{S}| H}","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"html":"ASH|\\mathcal{A}|^{|\\mathcal{S}| H}ASH","key":"EnXnDj4LYe"},{"type":"text","value":" deterministic policies to achieve low regret. Rather, all we need to describe the optimal policy is ","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"A1RyaNHLGZ"},{"type":"inlineMath","value":"Q^\\star","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"html":"QQ^\\starQ","key":"eB29Y5QVtB"},{"type":"text","value":", which has ","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"y6FODz2rJ6"},{"type":"inlineMath","value":"H |\\mathcal{S}||\\mathcal{A}|","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"html":"HSAH |\\mathcal{S}||\\mathcal{A}|HS∣∣A","key":"SqPl2njrQU"},{"type":"text","value":" entries to be learned. Can we borrow ideas from UCB to reduce the regret to this order (i.e. 
polynomial in ","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"n1YMokdIy9"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"html":"S|\\mathcal{S}|S","key":"gm61ixvb7k"},{"type":"text","value":", ","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"RHPZPfKxBJ"},{"type":"inlineMath","value":"|\\mathcal{A}|","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"html":"A|\\mathcal{A}|A","key":"UUkri2pWFn"},{"type":"text","value":", and ","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"g1LcaOmfAF"},{"type":"inlineMath","value":"H","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"html":"HHH","key":"gHNf5ygQcL"},{"type":"text","value":")?","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"Zp4CvYEB4b"}],"key":"Afjxdc7bzj"},{"type":"paragraph","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"children":[{"type":"text","value":"One way to frame the UCB algorithm is that, when choosing arms, we optimize over a ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"Ab8XETzyVq"},{"type":"emphasis","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"children":[{"type":"text","value":"proxy reward","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"AkrUtpK0Qw"}],"key":"dLqkXMH9BB"},{"type":"text","value":" that is the sum of the estimated mean reward and an exploration term. In the ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"QleApmNN8H"},{"type":"strong","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"children":[{"type":"text","value":"UCB-VI","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"Xoqh35cGUe"}],"key":"OwtTWLciMr"},{"type":"text","value":" algorithm, we will extend this idea to the case of an unknown MDP ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"efe0azUlyA"},{"type":"inlineMath","value":"\\mathcal{M}^{?}","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"html":"M?\\mathcal{M}^{?}M?","key":"ywC2Zv9EB2"},{"type":"text","value":" by modelling a proxy MDP ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"WJ3wDlL05T"},{"type":"inlineMath","value":"\\tilde{\\mathcal{M}}","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"html":"M~\\tilde{\\mathcal{M}}M~","key":"UE5tpFk0PN"},{"type":"text","value":" with a reward function that encourages exploration. 
Then, we will use DP to solve for the optimal policy in ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"IfiFJBRpJR"},{"type":"inlineMath","value":"\\tilde{\\mathcal{M}}","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"html":"M~\\tilde{\\mathcal{M}}M~","key":"puKoPexkCf"},{"type":"text","value":".","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"O0r6pTOuQy"}],"key":"jm5uHKMH1a"},{"type":"paragraph","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"strong","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"Assumptions:","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"DvyYCbx6hX"}],"key":"eRpb0aDnLO"},{"type":"text","value":" For simplicity, here we assume the reward function of ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"AEI7y6ZPCC"},{"type":"inlineMath","value":"\\mathcal{M}^{?}","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"html":"M?\\mathcal{M}^{?}M?","key":"OQQKm3MZ9E"},{"type":"text","value":" is known, so we only need to model the state transitions, though the rewards can be modelled similarly. We will also consider the more general case of a ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"tNJWHPw9JU"},{"type":"strong","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"time-varying","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"onUxkygYut"}],"key":"WmgTLNKVJ0"},{"type":"text","value":" MDP, where the transition and reward functions can change over time. 
We take the convention that ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"MQqyv1UUIR"},{"type":"inlineMath","value":"P_\\hi","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"html":"PhP_\\hiPh","key":"rqUWNU83Yz"},{"type":"text","value":" is the distribution of ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"oIAr1RCkKs"},{"type":"inlineMath","value":"s_{h+1} \\mid s_{h}, a_{h}","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"html":"sh+1sh,ahs_{h+1} \\mid s_{h}, a_{h}sh+1sh,ah","key":"ZRDUG0MvSE"},{"type":"text","value":" and ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"CWXWbeYoIE"},{"type":"inlineMath","value":"r_\\hi","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"html":"rhr_\\hirh","key":"yp2DSOaKs4"},{"type":"text","value":" is applied to ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"FtUuA9SWBV"},{"type":"inlineMath","value":"s_\\hi, a_\\hi","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"html":"sh,ahs_\\hi, a_\\hish,ah","key":"p47esduFRJ"},{"type":"text","value":".","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"AeXbpKCddS"}],"key":"rJo3OT3RCc"},{"type":"paragraph","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"children":[{"type":"text","value":"At a high level, the UCB-VI algorithm can be described as follows:","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"Edf4VsGBMU"}],"key":"hQcxJpTiBw"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":122,"column":1},"end":{"line":127,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":122,"column":1},"end":{"line":123,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"children":[{"type":"strong","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"children":[{"type":"text","value":"Modelling:","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"iidvSlsCX9"}],"key":"atpcD9Qfbi"},{"type":"text","value":" Use previous data to model the transitions ","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"W7Ecb3B7fu"},{"type":"inlineMath","value":"\\hat{P}_0, \\dots, \\hat{P}_{H-1}","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"html":"P^0,,P^H1\\hat{P}_0, \\dots, \\hat{P}_{H-1}P^0,,P^H1","key":"pGHqMhbLAY"},{"type":"text","value":".","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"MAb0WUvWIv"}],"key":"Q7GwY0qpKU"}],"key":"spN0CPbpb1"},{"type":"listItem","spread":true,"position":{"start":{"line":124,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":124,"column":1},"end":{"line":124,"column":1}},"children":[{"type":"strong","position":{"start":{"line":124,"column":1},"end":{"line":124,"column":1}},"children":[{"type":"text","value":"Reward bonus:","position":{"start":{"line":124,"column":1},"end":{"line":124,"column":1}},"key":"Y8jtOJLR9n"}],"key":"x7Rj9jA3zR"},{"type":"text","value":" Design a reward bonus ","position":{"start":{"line":124,"column":1},"end":{"line":124,"column":1}},"key":"diEDmnkr9a"},{"type":"inlineMath","value":"b_\\hi(s, a) \\in 
\\mathbb{R}","position":{"start":{"line":124,"column":1},"end":{"line":124,"column":1}},"html":"bh(s,a)Rb_\\hi(s, a) \\in \\mathbb{R}bh(s,a)R","key":"qh9fw6m3UH"},{"type":"text","value":" to encourage exploration, analogous to the UCB term.","position":{"start":{"line":124,"column":1},"end":{"line":124,"column":1}},"key":"jojXMkBCzO"}],"key":"dTQvVAt36O"}],"key":"MX38zMniPP"},{"type":"listItem","spread":true,"position":{"start":{"line":126,"column":1},"end":{"line":127,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":126,"column":1},"end":{"line":126,"column":1}},"children":[{"type":"strong","position":{"start":{"line":126,"column":1},"end":{"line":126,"column":1}},"children":[{"type":"text","value":"Optimistic planning:","position":{"start":{"line":126,"column":1},"end":{"line":126,"column":1}},"key":"nUf2OxwzV5"}],"key":"jVBwkrini9"},{"type":"text","value":" Use DP to compute the optimal policy ","position":{"start":{"line":126,"column":1},"end":{"line":126,"column":1}},"key":"W6J2qR7IDf"},{"type":"inlineMath","value":"\\hat \\pi_\\hi(s)","position":{"start":{"line":126,"column":1},"end":{"line":126,"column":1}},"html":"π^h(s)\\hat \\pi_\\hi(s)π^h(s)","key":"G5qG2xfTqd"},{"type":"text","value":" in the modelled MDP","position":{"start":{"line":126,"column":1},"end":{"line":126,"column":1}},"key":"Gey0YbsccT"}],"key":"GYs2mvLo3a"}],"key":"eFWJFDXnMd"}],"key":"MmrIBqXsSk"},{"type":"math","value":"\\tilde{\\mathcal{M}} = (\\mathcal{S}, \\mathcal{A}, \\{ \\hat{P}_\\hi \\}_{h \\in [H]}, \\{ r_\\hi + b_\\hi \\}_{h \\in [H]}, H).","position":{"start":{"line":128,"column":1},"end":{"line":128,"column":1}},"html":"M~=(S,A,{P^h}h[H],{rh+bh}h[H],H).\\tilde{\\mathcal{M}} = (\\mathcal{S}, \\mathcal{A}, \\{ \\hat{P}_\\hi \\}_{h \\in [H]}, \\{ r_\\hi + b_\\hi \\}_{h \\in [H]}, H).M~=(S,A,{P^h}h[H],{rh+bh}h[H],H).","enumerator":"9.5","key":"thF5VzSec4"},{"type":"list","ordered":true,"start":4,"spread":false,"position":{"start":{"line":130,"column":1},"end":{"line":131,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":130,"column":1},"end":{"line":131,"column":1}},"children":[{"type":"strong","position":{"start":{"line":130,"column":1},"end":{"line":130,"column":1}},"children":[{"type":"text","value":"Execution:","position":{"start":{"line":130,"column":1},"end":{"line":130,"column":1}},"key":"vupGEliipC"}],"key":"Myh5AQPBLE"},{"type":"text","value":" Use ","position":{"start":{"line":130,"column":1},"end":{"line":130,"column":1}},"key":"AgeuiWPb2y"},{"type":"inlineMath","value":"\\hat \\pi_\\hi(s)","position":{"start":{"line":130,"column":1},"end":{"line":130,"column":1}},"html":"π^h(s)\\hat \\pi_\\hi(s)π^h(s)","key":"RxXhxLdlRX"},{"type":"text","value":" to collect a new trajectory, and repeat.","position":{"start":{"line":130,"column":1},"end":{"line":130,"column":1}},"key":"mUSUQiwGUu"}],"key":"onIqmYguvB"}],"key":"z7TqYVWAmm"},{"type":"paragraph","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"We detail each of these steps below. 
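As a preview of how these pieces fit together, the optimistic reward of the proxy MDP can be assembled in a few lines; the array shapes below are hypothetical, and the particular bonus formula is the one derived later in this section.

```python
import numpy as np

# Hypothetical toy sizes: 4 states, 2 actions, horizon 5, T = 100 episodes.
S, A, H, T, delta = 4, 2, 5, 100, 0.1
rng = np.random.default_rng(0)

r = rng.uniform(size=(H, S, A))               # known reward function
N = 1 + rng.integers(0, 20, size=(H, S, A))   # visit counts N_h(s, a) so far

# Reward bonus: large wherever (s, a) has rarely been visited at step h.
b = 2 * H * np.sqrt(np.log(S * A * H * T / delta) / N)

# The proxy MDP keeps the estimated transitions but plans with r + b, so the
# optimistic planner is drawn toward poorly explored parts of the state space.
r_tilde = r + b
print(r_tilde.shape)  # (H, S, A): one optimistic reward table per timestep
```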
The full definition follows in ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"nwIRGQD86R"},{"type":"crossReference","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"(","key":"Re2750oaB9"},{"type":"text","value":"9.16","key":"W2CYYpfSNx"},{"type":"text","value":")","key":"HuiS1aF14y"}],"identifier":"ucb-vi-alg","label":"ucb-vi-alg","kind":"equation","template":"(%s)","enumerator":"9.16","resolved":true,"html_id":"ucb-vi-alg","key":"zuJpwAMuIx"},{"type":"text","value":".","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"P53fp6F09h"}],"key":"pJDL33yEv8"},{"type":"heading","depth":3,"position":{"start":{"line":134,"column":1},"end":{"line":134,"column":1}},"children":[{"type":"text","value":"Modelling the transitions","position":{"start":{"line":134,"column":1},"end":{"line":134,"column":1}},"key":"lwnBXk1z0x"}],"identifier":"modelling-the-transitions","label":"Modelling the transitions","html_id":"modelling-the-transitions","implicit":true,"enumerator":"9.3.1","key":"KnlFZO7s3c"},{"type":"paragraph","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"children":[{"type":"text","value":"We seek to approximate ","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"key":"feQsnFvJcZ"},{"type":"inlineMath","value":"P_\\hi(s_{h+1} \\mid s_\\hi, a_\\hi) = \\frac{\\pr(s_\\hi, a_\\hi, s_{h+1})}{\\pr(s_\\hi, a_\\hi)}","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"html":"Ph(sh+1sh,ah)=P(sh,ah,sh+1)P(sh,ah)P_\\hi(s_{h+1} \\mid s_\\hi, a_\\hi) = \\frac{\\pr(s_\\hi, a_\\hi, s_{h+1})}{\\pr(s_\\hi, a_\\hi)}Ph(sh+1sh,ah)=P(sh,ah)P(sh,ah,sh+1)","key":"ZnRmLrbxVH"},{"type":"text","value":". We can estimate these using their sample probabilities from the dataset. 
That is, define","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"key":"t8FAXiKa0Q"}],"key":"ymJXGhyhHo"},{"type":"math","value":"\\begin{aligned}\n N_\\hi^t(s, a, s') & := \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i, s_{h+1}^i) = (s, a, s') } \\\\\n N_\\hi^t(s, a) & := \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i) = (s, a) } \\\\\n\\end{aligned}","position":{"start":{"line":138,"column":1},"end":{"line":141,"column":1}},"html":"Nht(s,a,s):=i=0t11{(shi,ahi,sh+1i)=(s,a,s)}Nht(s,a):=i=0t11{(shi,ahi)=(s,a)}\\begin{aligned}\n N_\\hi^t(s, a, s') & := \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i, s_{h+1}^i) = (s, a, s') } \\\\\n N_\\hi^t(s, a) & := \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i) = (s, a) } \\\\\n\\end{aligned}Nht(s,a,s)Nht(s,a):=i=0t11{(shi,ahi,sh+1i)=(s,a,s)}:=i=0t11{(shi,ahi)=(s,a)}","enumerator":"9.6","key":"gQrMbQEsEc"},{"type":"paragraph","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"children":[{"type":"text","value":"Then we can model","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"key":"heMHHIy9d2"}],"key":"pu1ng8PlHy"},{"type":"math","value":"\\hat{P}_\\hi^t(s' \\mid s, a) = \\frac{N_\\hi^t(s, a, s')}{N_\\hi^t(s, a)}.","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"P^ht(ss,a)=Nht(s,a,s)Nht(s,a).\\hat{P}_\\hi^t(s' \\mid s, a) = \\frac{N_\\hi^t(s, a, s')}{N_\\hi^t(s, a)}.P^ht(ss,a)=Nht(s,a)Nht(s,a,s).","enumerator":"9.7","key":"nLYYkc0EBQ"},{"type":"proof","kind":"remark","enumerated":true,"children":[{"type":"paragraph","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"text","value":"Note that this is also a fairly naive, nonparametric estimator that doesn’t assume any underlying structure of the MDP. 
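As a sketch, the counts and the resulting estimator can be computed directly from logged trajectories; the data below is made up, and the uniform fallback for unvisited pairs is merely one convention.

```python
import numpy as np

# Count-based transition estimator on made-up logged data. Each trajectory is
# a list of (s_h, a_h, s_{h+1}) triples, one per timestep h.
S, A, H = 3, 2, 4
rng = np.random.default_rng(0)
trajectories = [
    [(int(rng.integers(S)), int(rng.integers(A)), int(rng.integers(S))) for _ in range(H)]
    for _ in range(200)
]

N_sas = np.zeros((H, S, A, S))  # N_h^t(s, a, s')
N_sa = np.zeros((H, S, A))      # N_h^t(s, a)
for traj in trajectories:
    for h, (s, a, s_next) in enumerate(traj):
        N_sas[h, s, a, s_next] += 1
        N_sa[h, s, a] += 1

# Empirical transition model; unvisited (s, a) pairs fall back to uniform.
P_hat = np.where(
    N_sa[..., None] > 0,
    N_sas / np.maximum(N_sa[..., None], 1),
    1.0 / S,
)
assert np.allclose(P_hat.sum(axis=-1), 1.0)
```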
We’ll see how to incorporate assumptions about the MDP in the following section.","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"f8UWv10pSP"}],"key":"CJzv4RfGRL"}],"enumerator":"9.1","key":"fzGcKwMm4E"},{"type":"heading","depth":3,"position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"Reward bonus","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"fplI6627ct"}],"identifier":"reward-bonus","label":"Reward bonus","html_id":"reward-bonus","implicit":true,"enumerator":"9.3.2","key":"xq1nQb5Qub"},{"type":"paragraph","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"text","value":"To motivate the reward bonus term ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"KIxwr67osI"},{"type":"inlineMath","value":"b_\\hi^t(s, a)","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"bht(s,a)b_\\hi^t(s, a)bht(s,a)","key":"w44LAZd2ex"},{"type":"text","value":", recall how we designed the reward bonus term for UCB:","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"lWW9At2MpE"}],"key":"xspldpzwlU"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":155,"column":1},"end":{"line":160,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":155,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"text","value":"We used Hoeffding’s inequality to bound, with high probability, how far the sample mean ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"Gp0tVQ6upb"},{"type":"inlineMath","value":"\\hat \\mu_t^k","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"μ^tk\\hat \\mu_t^kμ^tk","key":"WBGLPP1F95"},{"type":"text","value":" deviated from the true mean ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"Xon9rYymto"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"μk\\mu^kμk","key":"UDHGsZ7h1x"},{"type":"text","value":".","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"jyJhWxJFkq"}],"key":"Wr4qq4GXOz"}],"key":"g7NhaJNmYF"},{"type":"listItem","spread":true,"position":{"start":{"line":157,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"text","value":"By inverting this inequality, we obtained a ","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"key":"WJYie03O1y"},{"type":"inlineMath","value":"(1-\\delta)","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"html":"(1δ)(1-\\delta)(1δ)","key":"WuIQMqwu8y"},{"type":"text","value":"-confidence interval for the true mean, centered at our estimate.","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"key":"paFcPuwdvJ"}],"key":"KT4YX0xXJV"}],"key":"wZtYIdPRXr"},{"type":"listItem","spread":true,"position":{"start":{"line":159,"column":1},"end":{"line":160,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"To make this bound 
","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"Ag7psfiMt0"},{"type":"emphasis","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"uniform","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"Ai3og05UF7"}],"key":"cuipyQRW5j"},{"type":"text","value":" across all timesteps ","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"wFQnqyDFZ8"},{"type":"inlineMath","value":"t \\in [T]","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"html":"t[T]t \\in [T]t[T]","key":"L0RDNqABDz"},{"type":"text","value":", we applied the union bound and multiplied ","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"l2Ykno5Rsp"},{"type":"text","value":"δ","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"MfDtcQQJQZ"},{"type":"text","value":" by a factor of ","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"tnxi722nyw"},{"type":"inlineMath","value":"T","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"html":"TTT","key":"fx9ayQl9gb"},{"type":"text","value":".","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"Os6UihCPAF"}],"key":"r00o2Z1xot"}],"key":"YhhF2KcEr7"}],"key":"Z2lNcNN63q"},{"type":"paragraph","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"We’d like to do the same for UCB-VI, and construct the bonus term such that ","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"b8Y1A7aEX5"},{"type":"inlineMath","value":"V^\\star_\\hi(s) \\le \\hat{V}_\\hi^t(s)","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"html":"Vh(s)V^ht(s)V^\\star_\\hi(s) \\le \\hat{V}_\\hi^t(s)Vh(s)V^ht(s)","key":"cTORbB6iNn"},{"type":"text","value":" with high probability. However, our construction will be more complex than the MAB case, since ","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"K1zGD8hjog"},{"type":"inlineMath","value":"\\hat{V}_\\hi^t(s)","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"html":"V^ht(s)\\hat{V}_\\hi^t(s)V^ht(s)","key":"Vx5Ll7zXTG"},{"type":"text","value":" depends on the bonus ","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"tao5oqwrgS"},{"type":"inlineMath","value":"b_\\hi^t(s, a)","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"html":"bht(s,a)b_\\hi^t(s, a)bht(s,a)","key":"RMExIRyjGL"},{"type":"text","value":" implicitly via DP. 
We claim that the bonus term that gives the proper bound is","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"BOHS3FeONe"}],"key":"u7swTphE2u"},{"type":"math","value":"b_\\hi^t(s, a) = 2 H \\sqrt{\\frac{\\log( |\\mathcal{S}||\\mathcal{A}|H T/\\delta )}{N_\\hi^t(s, a)}}.","position":{"start":{"line":163,"column":1},"end":{"line":164,"column":1}},"identifier":"eq:ucb_vi_bonus","label":"eq:ucb_vi_bonus","html_id":"eq-ucb-vi-bonus","html":"bht(s,a)=2Hlog(SAHT/δ)Nht(s,a).b_\\hi^t(s, a) = 2 H \\sqrt{\\frac{\\log( |\\mathcal{S}||\\mathcal{A}|H T/\\delta )}{N_\\hi^t(s, a)}}.bht(s,a)=2HNht(s,a)log(S∣∣AHT/δ).","enumerator":"9.8","key":"S7S7a02JcG"},{"type":"paragraph","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"children":[{"type":"text","value":"We will only provide a heuristic sketch of the proof; see ","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"key":"yRIu34uNVk"},{"type":"cite","kind":"narrative","label":"agarwal_reinforcement_2022","identifier":"agarwal_reinforcement_2022","children":[{"type":"text","value":"Agarwal ","key":"WK4TQnZjQU"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"VcHsCJ5cEc"}],"key":"Q0IJFFUFoF"},{"type":"text","value":" (2022)","key":"rrMmvG0FVx"}],"enumerator":"1","key":"rRDl0e9h6G"},{"type":"text","value":" (Section 7.3) for a full proof.","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"key":"hbhJ5YgVhc"}],"key":"KfnHlErJDr"},{"type":"proof","kind":"remark","label":"ucb_vi_bonus","identifier":"ucb_vi_bonus","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"UCB-VI reward bonus construction","position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"key":"YGRa2iwirx"}],"key":"UiUOZCkI9O"},{"type":"paragraph","position":{"start":{"line":171,"column":1},"end":{"line":171,"column":1}},"children":[{"type":"text","value":"We aim to show that, with high probability,","position":{"start":{"line":171,"column":1},"end":{"line":171,"column":1}},"key":"fP8n0nksxa"}],"key":"IRFS3VYraX"},{"type":"math","value":"V_\\hi^\\star(s) \\le \\hat{V}_\\hi^t(s) \\quad \\forall t \\in [T], h \\in [H], s \\in \\mathcal{S}.","position":{"start":{"line":173,"column":1},"end":{"line":173,"column":1}},"html":"Vh(s)V^ht(s)t[T],h[H],sS.V_\\hi^\\star(s) \\le \\hat{V}_\\hi^t(s) \\quad \\forall t \\in [T], h \\in [H], s \\in \\mathcal{S}.Vh(s)V^ht(s)t[T],h[H],sS.","enumerator":"9.9","key":"gKVN0hr4pH"},{"type":"paragraph","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"children":[{"type":"text","value":"We’ll do this by bounding the error incurred at each step of DP. 
Recall that DP solves for ","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"key":"SsbcrzAill"},{"type":"inlineMath","value":"\\hat{V}_\\hi^t(s)","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"html":"V^ht(s)\\hat{V}_\\hi^t(s)V^ht(s)","key":"K93JgP61PW"},{"type":"text","value":" recursively as follows:","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"key":"qXLgL8Vnx3"}],"key":"OlQC90JwXi"},{"type":"math","value":"\\hat{V}_\\hi^t(s) = \\max_{a \\in \\mathcal{A}} \\left[ \\tilde r^t_\\hi(s, a) + \\E_{s' \\sim \\hat{P}_\\hi^t(\\cdot \\mid s, a)} \\left[ \\hat{V}_{h+1}^t(s') \\right] \\right]","position":{"start":{"line":177,"column":1},"end":{"line":177,"column":1}},"html":"V^ht(s)=maxaA[r~ht(s,a)+EsP^ht(s,a)[V^h+1t(s)]]\\hat{V}_\\hi^t(s) = \\max_{a \\in \\mathcal{A}} \\left[ \\tilde r^t_\\hi(s, a) + \\E_{s' \\sim \\hat{P}_\\hi^t(\\cdot \\mid s, a)} \\left[ \\hat{V}_{h+1}^t(s') \\right] \\right]V^ht(s)=aAmax[r~ht(s,a)+EsP^ht(s,a)[V^h+1t(s)]]","enumerator":"9.10","key":"agSTcdoNRR"},{"type":"paragraph","position":{"start":{"line":179,"column":1},"end":{"line":179,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":179,"column":1},"end":{"line":179,"column":1}},"key":"ROZSwrBBU9"},{"type":"inlineMath","value":"\\tilde r^t_\\hi(s, a) = r_\\hi(s, a) + b_\\hi^t(s, a)","position":{"start":{"line":179,"column":1},"end":{"line":179,"column":1}},"html":"r~ht(s,a)=rh(s,a)+bht(s,a)\\tilde r^t_\\hi(s, a) = r_\\hi(s, a) + b_\\hi^t(s, a)r~ht(s,a)=rh(s,a)+bht(s,a)","key":"e9mZl83Ruv"},{"type":"text","value":" is the reward function of our modelled MDP ","position":{"start":{"line":179,"column":1},"end":{"line":179,"column":1}},"key":"IwsaFjzLO7"},{"type":"inlineMath","value":"\\tilde{\\mathcal{M}}^t","position":{"start":{"line":179,"column":1},"end":{"line":179,"column":1}},"html":"M~t\\tilde{\\mathcal{M}}^tM~t","key":"ryxyO9HzAO"},{"type":"text","value":". On the other hand, we know that ","position":{"start":{"line":179,"column":1},"end":{"line":179,"column":1}},"key":"myXDHfX9UO"},{"type":"inlineMath","value":"V^\\star","position":{"start":{"line":179,"column":1},"end":{"line":179,"column":1}},"html":"VV^\\starV","key":"aicI897GcB"},{"type":"text","value":" must satisfy","position":{"start":{"line":179,"column":1},"end":{"line":179,"column":1}},"key":"UWs4GAkSba"}],"key":"ha87RPUxhh"},{"type":"math","value":"V^\\star_\\hi(s) = \\max_{a \\in \\mathcal{A}} \\left[ \\tilde r^t_\\hi(s, a) + \\E_{s' \\sim P^?_\\hi(\\cdot \\mid s, a)} [V^\\star_{\\hi+1}(s')] \\right]","position":{"start":{"line":181,"column":1},"end":{"line":181,"column":1}},"html":"Vh(s)=maxaA[r~ht(s,a)+EsPh?(s,a)[Vh+1(s)]]V^\\star_\\hi(s) = \\max_{a \\in \\mathcal{A}} \\left[ \\tilde r^t_\\hi(s, a) + \\E_{s' \\sim P^?_\\hi(\\cdot \\mid s, a)} [V^\\star_{\\hi+1}(s')] \\right]Vh(s)=aAmax[r~ht(s,a)+EsPh?(s,a)[Vh+1(s)]]","enumerator":"9.11","key":"Bcc1q7pRQ2"},{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"so it suffices to bound the difference between the two inner expectations. 
There are two sources of error:","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"RXP3ItnxFt"}],"key":"t9WO9ekEZ1"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":185,"column":1},"end":{"line":188,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":185,"column":1},"end":{"line":186,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"The value functions ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"Ee29RVMj2e"},{"type":"inlineMath","value":"\\hat{V}^t_{h+1}","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"html":"V^h+1t\\hat{V}^t_{h+1}V^h+1t","key":"iEFxzBsppZ"},{"type":"text","value":" v.s. ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"qT4U5Hoik9"},{"type":"inlineMath","value":"V^\\star_{h+1}","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"html":"Vh+1V^\\star_{h+1}Vh+1","key":"JwZd7k6bJn"}],"key":"VV0BvMZvz9"}],"key":"eGslLPTMn7"},{"type":"listItem","spread":true,"position":{"start":{"line":187,"column":1},"end":{"line":188,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":187,"column":1},"end":{"line":187,"column":1}},"children":[{"type":"text","value":"The transition probabilities ","position":{"start":{"line":187,"column":1},"end":{"line":187,"column":1}},"key":"sQ7FuJ2YAd"},{"type":"inlineMath","value":"\\hat{P}_\\hi^t","position":{"start":{"line":187,"column":1},"end":{"line":187,"column":1}},"html":"P^ht\\hat{P}_\\hi^tP^ht","key":"Bj9ZgtgfdM"},{"type":"text","value":" v.s. ","position":{"start":{"line":187,"column":1},"end":{"line":187,"column":1}},"key":"lv18Y85kgI"},{"type":"inlineMath","value":"P^?_\\hi","position":{"start":{"line":187,"column":1},"end":{"line":187,"column":1}},"html":"Ph?P^?_\\hiPh?","key":"MirDrsfeji"},{"type":"text","value":".","position":{"start":{"line":187,"column":1},"end":{"line":187,"column":1}},"key":"MuxLw6ss5i"}],"key":"GbM78GZ34J"}],"key":"sc6LqIJ3kX"}],"key":"JzNPMIoGE0"},{"type":"paragraph","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"We can bound these individually, and then combine them by the triangle inequality. For the former, we can simply bound the difference by ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"Ya4eAdm7tZ"},{"type":"inlineMath","value":"H","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"html":"HHH","key":"OIcYi9ccyB"},{"type":"text","value":", assuming that the rewards are within ","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"US56D8Okr2"},{"type":"inlineMath","value":"[0, 1]","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"html":"[0,1][0, 1][0,1]","key":"K9lY3gvpJJ"},{"type":"text","value":". Now, all that is left is to bound the error from the transition probabilities:","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"aa1sAIZToC"}],"key":"tedUdBz6TD"},{"type":"math","value":"\\text{error} = \\left| \\E_{s' \\sim \\hat{P}_\\hi^t(\\cdot \\mid s, a)} \\left[ V^\\star_{h+1}(s') \\right] - \\E_{s' \\sim P^?_\\hi(\\cdot \\mid s, a)} \\left[ V^\\star_{h+1}(s') \\right]. 
\\right|","label":"err","identifier":"err","html":"error=EsP^ht(s,a)[Vh+1(s)]EsPh?(s,a)[Vh+1(s)].\\text{error} = \\left| \\E_{s' \\sim \\hat{P}_\\hi^t(\\cdot \\mid s, a)} \\left[ V^\\star_{h+1}(s') \\right] - \\E_{s' \\sim P^?_\\hi(\\cdot \\mid s, a)} \\left[ V^\\star_{h+1}(s') \\right]. \\right|error=EsP^ht(s,a)[Vh+1(s)]EsPh?(s,a)[Vh+1(s)].","enumerator":"9.12","html_id":"err","key":"Sh9lBFBTqJ"},{"type":"paragraph","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"children":[{"type":"text","value":"Let us bound this term for a fixed ","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"key":"I5AM04dyaz"},{"type":"inlineMath","value":"s, a, h, t","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"html":"s,a,h,ts, a, h, ts,a,h,t","key":"PgmPxaaQta"},{"type":"text","value":". (Later we can make this uniform across ","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"key":"DG7cPsDxpG"},{"type":"inlineMath","value":"s, a, h, t","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"html":"s,a,h,ts, a, h, ts,a,h,t","key":"O0XYenefko"},{"type":"text","value":" using the union bound.) Note that expanding out the definition of ","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"key":"mnuR0cV31e"},{"type":"inlineMath","value":"\\hat{P}_\\hi^t","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"html":"P^ht\\hat{P}_\\hi^tP^ht","key":"ti4eKzRNnD"},{"type":"text","value":" gives","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"key":"MUU8Z8mPNE"}],"key":"epXchb8VKp"},{"type":"math","value":"\\begin{aligned}\n \\E_{s' \\sim \\hat{P}_\\hi^t(\\cdot \\mid s, a)} \\left[ V^\\star_{h+1}(s') \\right] & = \\sum_{s' \\in \\mathcal{S}} \\frac{N^t_\\hi(s, a, s')}{N^t_\\hi(s, a)} V^\\star_{h+1}(s') \\\\\n & = \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\sum_{s' \\in \\mathcal{S}} \\ind{ (s_\\hi^i, a_\\hi^i, s_{h+1}^i) = (s, a, s') } V^\\star_{h+1}(s') \\\\\n & = \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\underbrace{\\ind{ (s_\\hi^i, a_\\hi^i) = (s, a) } V^\\star_{h+1}(s_{h+1}^i)}_{X^i}\n\\end{aligned}","position":{"start":{"line":199,"column":1},"end":{"line":203,"column":1}},"html":"EsP^ht(s,a)[Vh+1(s)]=sSNht(s,a,s)Nht(s,a)Vh+1(s)=1Nht(s,a)i=0t1sS1{(shi,ahi,sh+1i)=(s,a,s)}Vh+1(s)=1Nht(s,a)i=0t11{(shi,ahi)=(s,a)}Vh+1(sh+1i)Xi\\begin{aligned}\n \\E_{s' \\sim \\hat{P}_\\hi^t(\\cdot \\mid s, a)} \\left[ V^\\star_{h+1}(s') \\right] & = \\sum_{s' \\in \\mathcal{S}} \\frac{N^t_\\hi(s, a, s')}{N^t_\\hi(s, a)} V^\\star_{h+1}(s') \\\\\n & = \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\sum_{s' \\in \\mathcal{S}} \\ind{ (s_\\hi^i, a_\\hi^i, s_{h+1}^i) = (s, a, s') } V^\\star_{h+1}(s') \\\\\n & = \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\underbrace{\\ind{ (s_\\hi^i, a_\\hi^i) = (s, a) } V^\\star_{h+1}(s_{h+1}^i)}_{X^i}\n\\end{aligned}EsP^ht(s,a)[Vh+1(s)]=sSNht(s,a)Nht(s,a,s)Vh+1(s)=Nht(s,a)1i=0t1sS1{(shi,ahi,sh+1i)=(s,a,s)}Vh+1(s)=Nht(s,a)1i=0t1Xi1{(shi,ahi)=(s,a)}Vh+1(sh+1i)","enumerator":"9.13","key":"vU8Nuqiv3Z"},{"type":"paragraph","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"children":[{"type":"text","value":"since the terms where ","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"P56TE5Ab36"},{"type":"inlineMath","value":"s' \\neq s_{h+1}^i","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"html":"ssh+1is' \\neq 
s_{h+1}^is=sh+1i","key":"LUTpPFwLzN"},{"type":"text","value":" vanish.","position":{"start":{"line":205,"column":1},"end":{"line":205,"column":1}},"key":"oftRqbcbDJ"}],"key":"UYx5SlYvso"},{"type":"paragraph","position":{"start":{"line":207,"column":1},"end":{"line":207,"column":1}},"children":[{"type":"text","value":"Now, in order to apply Hoeffding’s inequality, we would like to express the second term in ","position":{"start":{"line":207,"column":1},"end":{"line":207,"column":1}},"key":"XlYaQ8SeOa"},{"type":"crossReference","kind":"equation","identifier":"err","label":"err","children":[{"type":"text","value":"(","key":"KfidD6MJvd"},{"type":"text","value":"9.12","key":"jOal1iKPUs"},{"type":"text","value":")","key":"BD3e2HFS6u"}],"template":"(%s)","enumerator":"9.12","resolved":true,"html_id":"err","key":"nLLOY37YW3"},{"type":"text","value":" as a sum over ","position":{"start":{"line":207,"column":1},"end":{"line":207,"column":1}},"key":"Gbor671dfb"},{"type":"inlineMath","value":"t","position":{"start":{"line":207,"column":1},"end":{"line":207,"column":1}},"html":"ttt","key":"x001CeG8Ye"},{"type":"text","value":" random variables as well. We will do this by redundantly averaging over all desired trajectories (i.e. where we visit state ","position":{"start":{"line":207,"column":1},"end":{"line":207,"column":1}},"key":"GuQW2bczIq"},{"type":"inlineMath","value":"s","position":{"start":{"line":207,"column":1},"end":{"line":207,"column":1}},"html":"sss","key":"TmbmytIqus"},{"type":"text","value":" and action ","position":{"start":{"line":207,"column":1},"end":{"line":207,"column":1}},"key":"A3U1UQHaLF"},{"type":"inlineMath","value":"a","position":{"start":{"line":207,"column":1},"end":{"line":207,"column":1}},"html":"aaa","key":"XO2IKdOc8P"},{"type":"text","value":" at time ","position":{"start":{"line":207,"column":1},"end":{"line":207,"column":1}},"key":"Cg1Go0TkPc"},{"type":"inlineMath","value":"h","position":{"start":{"line":207,"column":1},"end":{"line":207,"column":1}},"html":"hhh","key":"kaDqMetM9X"},{"type":"text","value":"):","position":{"start":{"line":207,"column":1},"end":{"line":207,"column":1}},"key":"uXKRF0Apy1"}],"key":"B2ChE0RSEg"},{"type":"math","value":"\\begin{aligned}\n \\E_{s' \\sim P^?_\\hi(\\cdot \\mid s, a)} \\left[ V^\\star_{h+1}(s') \\right]\n & = \\sum_{s' \\in \\mathcal{S}} P^?_\\hi(s' \\mid s, a) V^\\star_{h+1}(s') \\\\\n & = \\sum_{s' \\in \\mathcal{S}} \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i) = (s, a) } P^?_\\hi(s' \\mid s, a) V^\\star_{h+1}(s') \\\\\n & = \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\E_{s_{h+1}^i \\sim P^?_{h}(\\cdot \\mid s_\\hi^i, a_\\hi^i)} X^i.\n\\end{aligned}","position":{"start":{"line":209,"column":1},"end":{"line":215,"column":1}},"html":"EsPh?(s,a)[Vh+1(s)]=sSPh?(ss,a)Vh+1(s)=sS1Nht(s,a)i=0t11{(shi,ahi)=(s,a)}Ph?(ss,a)Vh+1(s)=1Nht(s,a)i=0t1Esh+1iPh?(shi,ahi)Xi.\\begin{aligned}\n \\E_{s' \\sim P^?_\\hi(\\cdot \\mid s, a)} \\left[ V^\\star_{h+1}(s') \\right]\n & = \\sum_{s' \\in \\mathcal{S}} P^?_\\hi(s' \\mid s, a) V^\\star_{h+1}(s') \\\\\n & = \\sum_{s' \\in \\mathcal{S}} \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i) = (s, a) } P^?_\\hi(s' \\mid s, a) V^\\star_{h+1}(s') \\\\\n & = \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\E_{s_{h+1}^i \\sim P^?_{h}(\\cdot \\mid s_\\hi^i, a_\\hi^i)} 
X^i.\n\\end{aligned}EsPh?(s,a)[Vh+1(s)]=sSPh?(ss,a)Vh+1(s)=sSNht(s,a)1i=0t11{(shi,ahi)=(s,a)}Ph?(ss,a)Vh+1(s)=Nht(s,a)1i=0t1Esh+1iPh?(shi,ahi)Xi.","enumerator":"9.14","key":"oeJGc2eNnw"},{"type":"paragraph","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"children":[{"type":"text","value":"Now we can apply Hoeffding’s inequality to ","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"oVOaaKyqfZ"},{"type":"inlineMath","value":"X^i - \\E_{s_{h+1}^i \\sim P^?_{h}(\\cdot \\mid s_\\hi^i, a_\\hi^i)} X^i","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"html":"XiEsh+1iPh?(shi,ahi)XiX^i - \\E_{s_{h+1}^i \\sim P^?_{h}(\\cdot \\mid s_\\hi^i, a_\\hi^i)} X^iXiEsh+1iPh?(shi,ahi)Xi","key":"JbKrB3gVGi"},{"type":"text","value":", which is bounded by ","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"HDLIkEMpt6"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"html":"H\\horH","key":"pkxNgUmqEL"},{"type":"text","value":", to obtain that, with probability at least ","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"n8rbsDDaSX"},{"type":"inlineMath","value":"1-\\delta","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"html":"1δ1-\\delta1δ","key":"oWt6bjj78O"},{"type":"text","value":",","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"jEh2NxdKdA"}],"key":"JIT8ZOewsS"},{"type":"math","value":"\\text{error} = \\left| \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\left(X^i - \\E_{s_{h+1}^i \\sim P^?_{h}(\\cdot \\mid s_\\hi^i, a_\\hi^i)} X^i \\right) \\right| \\le 2 H \\sqrt{\\frac{\\ln(1/\\delta)}{N_\\hi^t(s, a)}}.","position":{"start":{"line":219,"column":1},"end":{"line":221,"column":1}},"html":"error=1Nht(s,a)i=0t1(XiEsh+1iPh?(shi,ahi)Xi)2Hln(1/δ)Nht(s,a).\\text{error} = \\left| \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\left(X^i - \\E_{s_{h+1}^i \\sim P^?_{h}(\\cdot \\mid s_\\hi^i, a_\\hi^i)} X^i \\right) \\right| \\le 2 H \\sqrt{\\frac{\\ln(1/\\delta)}{N_\\hi^t(s, a)}}.error=Nht(s,a)1i=0t1(XiEsh+1iPh?(shi,ahi)Xi)2HNht(s,a)ln(1/δ).","enumerator":"9.15","key":"V3ZH4Moodu"},{"type":"paragraph","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"children":[{"type":"text","value":"Applying a union bound over all ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"uepkH79RVt"},{"type":"inlineMath","value":"s \\in \\mathcal{S}, a \\in \\mathcal{A}, t \\in [T], h \\in [H]","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"sS,aA,t[T],h[H]s \\in \\mathcal{S}, a \\in \\mathcal{A}, t \\in [T], h \\in [H]sS,aA,t[T],h[H]","key":"EXEBSOXGyj"},{"type":"text","value":" gives the ","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"S1RchitBoN"},{"type":"inlineMath","value":"b_\\hi^t(s, a)","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"bht(s,a)b_\\hi^t(s, a)bht(s,a)","key":"xFKW5aYaeB"},{"type":"text","value":" term 
above.","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"syIIAYDbbU"}],"key":"mNjStVxDHh"}],"enumerator":"9.2","html_id":"ucb-vi-bonus","key":"L7RSkMwZ47"},{"type":"heading","depth":3,"position":{"start":{"line":226,"column":1},"end":{"line":226,"column":1}},"children":[{"type":"text","value":"Definition","position":{"start":{"line":226,"column":1},"end":{"line":226,"column":1}},"key":"C0xiuSejoM"}],"identifier":"definition","label":"Definition","html_id":"definition","implicit":true,"enumerator":"9.3.3","key":"QaJl8FI48a"},{"type":"paragraph","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"children":[{"type":"text","value":"Putting these parts together, we can define the algorithm as follows:","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"key":"etlYhdvcTe"}],"key":"SCZjaRPGsd"},{"type":"math","value":"3 + 1 = 4","label":"ucb-vi-alg","identifier":"ucb-vi-alg","html":"3+1=43 + 1 = 43+1=4","enumerator":"9.16","html_id":"ucb-vi-alg","key":"gMoDGrolvf"},{"type":"comment","value":" TODO :::{algorithmic}\n$N_\\hi(s, a, s') \\gets \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i, s_{h+1}^i) = (s, a, s') }$ $N_\\hi(s, a) \\gets \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i) = (s, a) }$ $\\hat P_\\hi \\gets \\frac{N_\\hi(s, a, s')}{N_\\hi(s, a)}$ $b_\\hi(s, a) \\gets 2 H \\sqrt{\\frac{\\log( |\\mathcal{S}||\\mathcal{A}|H T/\\delta )}{N_\\hi(s, a)}}$ $\\tilde{\\mathcal{M}} \\gets (\\mathcal{S}, \\mathcal{A}, \\{ \\hat{P}_\\hi \\}_{h \\in [H-1]}, \\{ r_\\hi + b_\\hi \\}_{h \\in [H-1]}, H)$ $\\hat \\pi \\gets \\text{VI}(\\tilde{\\mathcal{M}})$ Use $\\hat \\pi_h(s)$ to collect a new trajectory $(s^t_\\hi, a^t_\\hi, s^t_{\\hi+1})_{\\hi \\in [\\hor]}$\n::: ","key":"ZLsIfQvPd1"},{"type":"heading","depth":3,"position":{"start":{"line":240,"column":1},"end":{"line":240,"column":1}},"children":[{"type":"text","value":"Performance of UCB-VI","position":{"start":{"line":240,"column":1},"end":{"line":240,"column":1}},"key":"kJ2Y4A26pS"}],"identifier":"performance-of-ucb-vi","label":"Performance of UCB-VI","html_id":"performance-of-ucb-vi","implicit":true,"enumerator":"9.3.4","key":"zzbcDfQVLv"},{"type":"paragraph","position":{"start":{"line":242,"column":1},"end":{"line":242,"column":1}},"children":[{"type":"text","value":"How exactly does UCB-VI strike a good balance between exploration and exploitation? In UCB for MABs, the bonus exploration term is simple to interpret: It encourages the learner to take actions with a high exploration term. Here, the policy depends on the bonus term indirectly: The policy is obtained by planning in an MDP where the bonus term is added to the reward function. 
Note that the bonuses ","position":{"start":{"line":242,"column":1},"end":{"line":242,"column":1}},"key":"pmNlpzAG4z"},{"type":"emphasis","position":{"start":{"line":242,"column":1},"end":{"line":242,"column":1}},"children":[{"type":"text","value":"propagate backwards","position":{"start":{"line":242,"column":1},"end":{"line":242,"column":1}},"key":"qx6bEW4D4m"}],"key":"H8QJ55Oee8"},{"type":"text","value":" in DP, effectively enabling the learner to ","position":{"start":{"line":242,"column":1},"end":{"line":242,"column":1}},"key":"KYnnvtfW4f"},{"type":"emphasis","position":{"start":{"line":242,"column":1},"end":{"line":242,"column":1}},"children":[{"type":"text","value":"plan to explore","position":{"start":{"line":242,"column":1},"end":{"line":242,"column":1}},"key":"eSSEhpVKCb"}],"key":"XZsLCvhPRE"},{"type":"text","value":" unknown states. This effect takes some further interpretation.","position":{"start":{"line":242,"column":1},"end":{"line":242,"column":1}},"key":"yBEhEtOdZn"}],"key":"FzQuh7l0iL"},{"type":"paragraph","position":{"start":{"line":244,"column":1},"end":{"line":244,"column":1}},"children":[{"type":"text","value":"Recall we constructed ","position":{"start":{"line":244,"column":1},"end":{"line":244,"column":1}},"key":"Hl2PswnbhZ"},{"type":"inlineMath","value":"b^t_\\hi","position":{"start":{"line":244,"column":1},"end":{"line":244,"column":1}},"html":"bhtb^t_\\hibht","key":"jnUhijKd3X"},{"type":"text","value":" so that, with high probability, ","position":{"start":{"line":244,"column":1},"end":{"line":244,"column":1}},"key":"CdmLSMXRFO"},{"type":"inlineMath","value":"V^\\star_\\hi(s) \\le \\hat{V}_\\hi^t(s)","position":{"start":{"line":244,"column":1},"end":{"line":244,"column":1}},"html":"Vh(s)V^ht(s)V^\\star_\\hi(s) \\le \\hat{V}_\\hi^t(s)Vh(s)V^ht(s)","key":"QSjmmA4C4i"},{"type":"text","value":" and so","position":{"start":{"line":244,"column":1},"end":{"line":244,"column":1}},"key":"xLbdWHbqMU"}],"key":"pabevrrqPO"},{"type":"math","value":"V^\\star_\\hi(s) - V^{\\pi^t}_\\hi(s) \\le \\hat{V}_\\hi^t(s) - V^{\\pi^t}_\\hi(s).","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"html":"Vh(s)Vhπt(s)V^ht(s)Vhπt(s).V^\\star_\\hi(s) - V^{\\pi^t}_\\hi(s) \\le \\hat{V}_\\hi^t(s) - V^{\\pi^t}_\\hi(s).Vh(s)Vhπt(s)V^ht(s)Vhπt(s).","enumerator":"9.17","key":"ptQPawhw8v"},{"type":"paragraph","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"That is, the l.h.s. measures how suboptimal policy ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"OlgY0H9PSv"},{"type":"inlineMath","value":"\\pi^t","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"πt\\pi^tπt","key":"xZHvU82j1G"},{"type":"text","value":" is in the true environment, while the r.h.s. 
is the difference in the policy’s value when acting in the modelled MDP ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"YNWcen2ywA"},{"type":"inlineMath","value":"\\tilde{\\mathcal{M}}^t","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"M~t\\tilde{\\mathcal{M}}^tM~t","key":"Nor00f6q2E"},{"type":"text","value":" instead of the true one ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"qzbk4um4ri"},{"type":"inlineMath","value":"\\mathcal{M}^{?}","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"M?\\mathcal{M}^{?}M?","key":"wc2m7VCxB5"},{"type":"text","value":".","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"FnQnTr5CFh"}],"key":"UdXKM4I2eN"},{"type":"paragraph","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"children":[{"type":"text","value":"If the r.h.s. is ","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"key":"QmJNn26B2x"},{"type":"emphasis","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"children":[{"type":"text","value":"small","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"key":"pTpISpkHLM"}],"key":"GHzGr0Sors"},{"type":"text","value":", this implies that the l.h.s. difference is also small, i.e. that ","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"key":"S9HsXzAFbu"},{"type":"inlineMath","value":"\\pi^t","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"html":"πt\\pi^tπt","key":"K6XZLEFvJt"},{"type":"text","value":" is ","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"key":"d7r0kmsP0O"},{"type":"emphasis","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"children":[{"type":"text","value":"exploiting","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"key":"tYZZSMMVjC"}],"key":"NSJNjKSVuZ"},{"type":"text","value":" actions that are giving high reward.","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"key":"heD1KKeC53"}],"key":"L895yVaUBx"},{"type":"paragraph","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"children":[{"type":"text","value":"If the r.h.s. 
is ","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"key":"AVZYbBg96w"},{"type":"emphasis","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"children":[{"type":"text","value":"large","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"key":"rWcUfSQpi8"}],"key":"jqlZKoearY"},{"type":"text","value":", then we have overestimated the value: ","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"key":"Aq4UBGaSEU"},{"type":"inlineMath","value":"\\pi^t","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"html":"πt\\pi^tπt","key":"FPeqhxtEoE"},{"type":"text","value":", the optimal policy of ","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"key":"mL0FvhlsC6"},{"type":"inlineMath","value":"\\tilde{\\mathcal{M}}^t","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"html":"M~t\\tilde{\\mathcal{M}}^tM~t","key":"b73Tmgm0Fk"},{"type":"text","value":", does not perform well in the true environment ","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"key":"U43rT863Jg"},{"type":"inlineMath","value":"\\mathcal{M}^{?}","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"html":"M?\\mathcal{M}^{?}M?","key":"uShlSFDh2Z"},{"type":"text","value":". This indicates that one of the ","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"key":"FBr5A199C7"},{"type":"inlineMath","value":"b_h^t(s, a)","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"html":"bht(s,a)b_h^t(s, a)bht(s,a)","key":"f8nqMIK16Y"},{"type":"text","value":" terms must be large, or some ","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"key":"kP2LnKhJwy"},{"type":"inlineMath","value":"\\hat P^t_\\hi(\\cdot \\mid s, a)","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"html":"P^ht(s,a)\\hat P^t_\\hi(\\cdot \\mid s, a)P^ht(s,a)","key":"cKCuweRxNT"},{"type":"text","value":" must be inaccurate, indicating a state-action pair with a low visit count ","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"key":"jz2MZAFxJK"},{"type":"inlineMath","value":"N^t_\\hi(s, a)","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"html":"Nht(s,a)N^t_\\hi(s, a)Nht(s,a)","key":"YjsqmClHYS"},{"type":"text","value":" that the learner was encouraged to explore.","position":{"start":{"line":252,"column":1},"end":{"line":252,"column":1}},"key":"TBN5Dkqz9m"}],"key":"agUnN1XcTo"},{"type":"paragraph","position":{"start":{"line":254,"column":1},"end":{"line":254,"column":1}},"children":[{"type":"text","value":"It turns out that UCB-VI achieves a per-episode regret of","position":{"start":{"line":254,"column":1},"end":{"line":254,"column":1}},"key":"BxzCuYKYaq"}],"key":"BYKfXwOPQ8"},{"type":"proof","kind":"theorem","label":"ucb_vi_regret","identifier":"ucb_vi_regret","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"UCB-VI regret","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"SoIq1fwDYb"}],"key":"kAvYo8TCZH"},{"type":"math","value":"\\E \\left[ \\sum_{t=0}^{T-1} \\left(V^\\star_0(s_0) - V^{\\pi^t}_0(s_0) \\right) \\right] = \\tilde{O}(H^2 \\sqrt{|\\mathcal{S}| |\\mathcal{A}| T})","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"E[t=0T1(V0(s0)V0πt(s0))]=O~(H2SAT)\\E \\left[ \\sum_{t=0}^{T-1} 
\\left(V^\\star_0(s_0) - V^{\\pi^t}_0(s_0) \\right) \\right] = \\tilde{O}(H^2 \\sqrt{|\\mathcal{S}| |\\mathcal{A}| T})E[t=0T1(V0(s0)V0πt(s0))]=O~(H2S∣∣AT)","enumerator":"9.18","key":"J5SlPyr8Mx"}],"enumerator":"9.2","html_id":"ucb-vi-regret","key":"dq15oCedgs"},{"type":"paragraph","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"children":[{"type":"text","value":"Comparing this to the UCB regret bound ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"Vqis2iUZ8d"},{"type":"inlineMath","value":"\\tilde{O}(\\sqrt{T K})","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"O~(TK)\\tilde{O}(\\sqrt{T K})O~(TK)","key":"vXU1R4BxTW"},{"type":"text","value":", where ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"OLvDeRREUp"},{"type":"inlineMath","value":"K","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"KKK","key":"HRWYqgEEQX"},{"type":"text","value":" is the number of arms of the MAB, we see that we’ve reduced the number of effective arms from ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"UzLRpiwXmq"},{"type":"inlineMath","value":"|\\mathcal{A}|^{|\\mathcal{S}|\\hor}","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"ASH|\\mathcal{A}|^{|\\mathcal{S}|\\hor}ASH","key":"L8gQdrSD5w"},{"type":"text","value":" (in ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"mYS2C43Qa9"},{"type":"crossReference","kind":"equation","identifier":"mdp_as_mab","label":"mdp_as_mab","children":[{"type":"text","value":"(","key":"IMLybX7DiT"},{"type":"text","value":"9.4","key":"j71MgGwjyP"},{"type":"text","value":")","key":"OOkWoeh7tF"}],"template":"(%s)","enumerator":"9.4","resolved":true,"html_id":"mdp-as-mab","key":"bCKZGtiC7Q"},{"type":"text","value":") to ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"PDkVnqjaLX"},{"type":"inlineMath","value":"H^4 |\\mathcal{S}||\\mathcal{A}|","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"H4SAH^4 |\\mathcal{S}||\\mathcal{A}|H4S∣∣A","key":"mY4q6PbTab"},{"type":"text","value":", which is indeed polynomial in ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"IoAXJtGM7m"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"S|\\mathcal{S}|S","key":"uuV2kJriAq"},{"type":"text","value":", ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"lZ842NMPEe"},{"type":"inlineMath","value":"|\\mathcal{A}|","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"A|\\mathcal{A}|A","key":"NyQr1ny3T4"},{"type":"text","value":", and ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"O5ujSvUYJM"},{"type":"inlineMath","value":"H","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"HHH","key":"urm31QXGL5"},{"type":"text","value":", as desired. 
This is also roughly the number of episodes it takes to achieve constant-order average regret:","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"v4gCEcWZQg"}],"key":"P2lljAcvcQ"},{"type":"math","value":"\\frac{1}{T} \\E[\\text{Regret}_T] = \\tilde{O}\\left(\\sqrt{\\frac{H^4 |\\mathcal{S}||\\mathcal{A}|}{T}}\\right)","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"html":"1TE[RegretT]=O~(H4SAT)\\frac{1}{T} \\E[\\text{Regret}_T] = \\tilde{O}\\left(\\sqrt{\\frac{H^4 |\\mathcal{S}||\\mathcal{A}|}{T}}\\right)T1E[RegretT]=O~(TH4S∣∣A)","enumerator":"9.19","key":"BVvGro4vXx"},{"type":"paragraph","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"children":[{"type":"text","value":"Note that the time-dependent transition matrix has ","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"key":"V9OQXsVH7v"},{"type":"inlineMath","value":"H |\\mathcal{S}|^2 |\\mathcal{A}|","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"html":"HS2AH |\\mathcal{S}|^2 |\\mathcal{A}|HS2A","key":"AAq7srOdnB"},{"type":"text","value":" entries. Assuming ","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"key":"yqNGsEh75x"},{"type":"inlineMath","value":"H \\ll |\\mathcal{S}|","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"html":"HSH \\ll |\\mathcal{S}|HS","key":"YoUAV04ZMW"},{"type":"text","value":", this shows that it’s possible to achieve low regret, and achieve a near-optimal policy, while only understanding a ","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"key":"HexOXdNPgf"},{"type":"inlineMath","value":"1/|\\mathcal{S}|","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"html":"1/S1/|\\mathcal{S}|1/∣S","key":"Au2gfpEoTR"},{"type":"text","value":" fraction of the world’s dynamics.","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"key":"ZmSDRnyFEc"}],"key":"zBgWE2dhvW"},{"type":"heading","depth":2,"position":{"start":{"line":268,"column":1},"end":{"line":268,"column":1}},"children":[{"type":"text","value":"Linear MDPs","position":{"start":{"line":268,"column":1},"end":{"line":268,"column":1}},"key":"dUCkobnY95"}],"identifier":"linear-mdps","label":"Linear MDPs","html_id":"linear-mdps","implicit":true,"enumerator":"9.4","key":"q5qeXLl45R"},{"type":"paragraph","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"children":[{"type":"text","value":"A polynomial dependency on ","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"uxX9SNVkGB"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"html":"S|\\mathcal{S}|S","key":"ugJOnmWHUj"},{"type":"text","value":" and ","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"VVlo4f9mRM"},{"type":"inlineMath","value":"|\\mathcal{A}|","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"html":"A|\\mathcal{A}|A","key":"dLCBEFzeuO"},{"type":"text","value":" is manageable when the state and action spaces are small. But for large or continuous state and action spaces, even this polynomial factor will become intractable. 
Can we find algorithms that don’t depend on ","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"qQmKQ0tSwX"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"html":"S|\\mathcal{S}|S","key":"MmQzjXjAl1"},{"type":"text","value":" or ","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"koShOUYSPS"},{"type":"inlineMath","value":"|\\mathcal{A}|","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"html":"A|\\mathcal{A}|A","key":"x2Vl9chKJr"},{"type":"text","value":" at all, effectively reducing the dimensionality of the MDP? In this section, we’ll explore ","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"Q1Pm3BWO9S"},{"type":"strong","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"children":[{"type":"text","value":"linear MDPs","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"Efs27D2DG9"}],"key":"XxbLNfkznl"},{"type":"text","value":": an example of a ","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"eQhOhXmqvF"},{"type":"emphasis","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"children":[{"type":"text","value":"parameterized","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"wJeZE4QrbS"}],"key":"riS45V8bV1"},{"type":"text","value":" MDP where the rewards and state transitions depend only on some parameter space of dimension ","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"X3hl8R2mBp"},{"type":"inlineMath","value":"d","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"html":"ddd","key":"tz0w9EFWOU"},{"type":"text","value":" that is independent from ","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"x8ab9b2YJc"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"html":"S|\\mathcal{S}|S","key":"zKeYJRHbg2"},{"type":"text","value":" or ","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"ktfybnbNIn"},{"type":"inlineMath","value":"|\\mathcal{A}|","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"html":"A|\\mathcal{A}|A","key":"ydf0o714EQ"},{"type":"text","value":".","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"UM1xFIyrJB"}],"key":"TgPZpm1sok"},{"type":"proof","kind":"definition","label":"linear_mdp","identifier":"linear_mdp","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Linear MDP","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"BLY3xOO8CR"}],"key":"o42T2kWiPa"},{"type":"paragraph","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"We assume that the transition probabilities and rewards are ","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"BlQtbiGVRC"},{"type":"emphasis","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"k08YyTrIRS"}],"key":"lsrMcvnKwU"},{"type":"text","value":" in some feature 
vector","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"bLR21sdoqK"}],"key":"uDOUBM43ix"},{"type":"paragraph","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"children":[{"type":"inlineMath","value":"\\phi(s, a) \\in \\mathbb{R}^d","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"html":"ϕ(s,a)Rd\\phi(s, a) \\in \\mathbb{R}^dϕ(s,a)Rd","key":"abVBrAsV0u"},{"type":"text","value":":","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"key":"KHDNV722jm"}],"key":"WDaTJbH9ed"},{"type":"math","value":"\\begin{aligned}\n P_\\hi(s' \\mid s, a) & = \\phi(s, a)^\\top \\mu^\\star_\\hi(s') \\\\\n r_\\hi(s, a) & = \\phi(s, a)^\\top \\theta_\\hi^\\star\n\\end{aligned}","position":{"start":{"line":279,"column":1},"end":{"line":282,"column":1}},"html":"Ph(ss,a)=ϕ(s,a)μh(s)rh(s,a)=ϕ(s,a)θh\\begin{aligned}\n P_\\hi(s' \\mid s, a) & = \\phi(s, a)^\\top \\mu^\\star_\\hi(s') \\\\\n r_\\hi(s, a) & = \\phi(s, a)^\\top \\theta_\\hi^\\star\n\\end{aligned}Ph(ss,a)rh(s,a)=ϕ(s,a)μh(s)=ϕ(s,a)θh","enumerator":"9.20","key":"XSw26SarDm"},{"type":"paragraph","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"children":[{"type":"text","value":"Note that we can also think of ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"zBo6DmIUNA"},{"type":"inlineMath","value":"P_\\hi(\\cdot \\mid s, a) = \\mu_\\hi^\\star","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"Ph(s,a)=μhP_\\hi(\\cdot \\mid s, a) = \\mu_\\hi^\\starPh(s,a)=μh","key":"hV87hiknRW"},{"type":"text","value":" as an ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"LCueJR4Rt3"},{"type":"inlineMath","value":"|\\mathcal{S}| \\times d","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"S×d|\\mathcal{S}| \\times dS×d","key":"TxfI4pWsE5"},{"type":"text","value":" matrix, and think of ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"YYbX2A840U"},{"type":"inlineMath","value":"\\mu^\\star_\\hi(s')","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"μh(s)\\mu^\\star_\\hi(s')μh(s)","key":"WmShntsB36"},{"type":"text","value":" as indexing into the ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"ORTMkXNEB5"},{"type":"inlineMath","value":"s'","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"ss's","key":"kfTOvlhq43"},{"type":"text","value":"-th row of this matrix (treating it as a column vector). 
Thinking of ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"upAHumg50Z"},{"type":"inlineMath","value":"V^\\star_{\\hi+1}","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"Vh+1V^\\star_{\\hi+1}Vh+1","key":"slwf00C6Cx"},{"type":"text","value":" as an ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"VbPwz9TaqN"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"S|\\mathcal{S}|S","key":"MoBGIKAj4T"},{"type":"text","value":"-dimensional vector, this allows us to write","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"TjPSuJyqtb"}],"key":"GzI1oo7KSQ"},{"type":"math","value":"\\E_{s' \\sim P_\\hi(\\cdot \\mid s, a)}[V^\\star_{\\hi+1}(s)] = (\\mu^\\star_\\hi \\phi(s, a))^\\top V^\\star_{\\hi+1}.","position":{"start":{"line":286,"column":1},"end":{"line":286,"column":1}},"html":"EsPh(s,a)[Vh+1(s)]=(μhϕ(s,a))Vh+1.\\E_{s' \\sim P_\\hi(\\cdot \\mid s, a)}[V^\\star_{\\hi+1}(s)] = (\\mu^\\star_\\hi \\phi(s, a))^\\top V^\\star_{\\hi+1}.EsPh(s,a)[Vh+1(s)]=(μhϕ(s,a))Vh+1.","enumerator":"9.21","key":"ZDX17jWua0"},{"type":"paragraph","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"T54UBlZ6Wg"},{"type":"text","value":"ϕ","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"IqWXfUArdL"},{"type":"text","value":" feature mapping can be designed to capture interactions between the state ","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"GhGQqefkQ5"},{"type":"inlineMath","value":"s","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"html":"sss","key":"Bkd6UB0kfu"},{"type":"text","value":" and action ","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"VDhtaoLkit"},{"type":"inlineMath","value":"a","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"html":"aaa","key":"ML3DIVR1g6"},{"type":"text","value":". 
In this book, we’ll assume that the feature map ","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"GA9t03dhyG"},{"type":"inlineMath","value":"\\phi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}^d","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"html":"ϕ:S×ARd\\phi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}^dϕ:S×ARd","key":"gFe2rB3rOs"},{"type":"text","value":" and the reward function (described by ","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"I5vMmFUXZ6"},{"type":"inlineMath","value":"\\theta_\\hi^\\star","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"html":"θh\\theta_\\hi^\\starθh","key":"ihr7Z5KDkT"},{"type":"text","value":") are known to the learner.","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"jxF8HkOeJT"}],"key":"W9b8urlhik"}],"enumerator":"9.3","html_id":"linear-mdp","key":"zSJH6ifB1S"},{"type":"heading","depth":3,"position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"children":[{"type":"text","value":"Planning in a linear MDP","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"tldSteOaT6"}],"identifier":"planning-in-a-linear-mdp","label":"Planning in a linear MDP","html_id":"planning-in-a-linear-mdp","implicit":true,"enumerator":"9.4.1","key":"mVnvwn6xoN"},{"type":"paragraph","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"children":[{"type":"text","value":"It turns out that ","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"d6CLVdoKJK"},{"type":"inlineMath","value":"Q^\\star_\\hi","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"html":"QhQ^\\star_\\hiQh","key":"g9KlPwn7XV"},{"type":"text","value":" is also linear with respect to this feature mapping. We can prove this by simply computing it using DP. We initialize ","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"vlUby2kwRc"},{"type":"inlineMath","value":"V_{H}^\\star(s) = 0 \\forall s","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"html":"VH(s)=0sV_{H}^\\star(s) = 0 \\forall sVH(s)=0∀s","key":"Qzbh4xelt3"},{"type":"text","value":". 
Then we iterate:","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"ALrQWVhW0k"}],"key":"VgZTJM3PAc"},{"type":"math","value":"\\begin{aligned}\n Q^\\star_\\hi(s, a) & = r_\\hi(s, a) + \\E_{s' \\sim P_\\hi(\\cdot \\mid s, a)} [V^\\star_{h+1}(s')] \\\\\n & = \\phi(s, a)^\\top \\theta_\\hi^\\star + (\\mu_\\hi^\\star \\phi(s, a))^\\top V^\\star_{h+1} \\\\\n & = \\phi(s, a)^\\top \\underbrace{( \\theta_\\hi^\\star + (\\mu_\\hi^\\star)^\\top V^\\star_{h+1})}_{w_\\hi} \\\\\n V^\\star_\\hi(s) & = \\max_a Q^\\star_\\hi(s, a) \\\\\n \\pi^\\star_\\hi(s) & = \\arg\\max_a Q^\\star_\\hi(s, a)\n\\end{aligned}","position":{"start":{"line":295,"column":1},"end":{"line":301,"column":1}},"html":"Qh(s,a)=rh(s,a)+EsPh(s,a)[Vh+1(s)]=ϕ(s,a)θh+(μhϕ(s,a))Vh+1=ϕ(s,a)(θh+(μh)Vh+1)whVh(s)=maxaQh(s,a)πh(s)=argmaxaQh(s,a)\\begin{aligned}\n Q^\\star_\\hi(s, a) & = r_\\hi(s, a) + \\E_{s' \\sim P_\\hi(\\cdot \\mid s, a)} [V^\\star_{h+1}(s')] \\\\\n & = \\phi(s, a)^\\top \\theta_\\hi^\\star + (\\mu_\\hi^\\star \\phi(s, a))^\\top V^\\star_{h+1} \\\\\n & = \\phi(s, a)^\\top \\underbrace{( \\theta_\\hi^\\star + (\\mu_\\hi^\\star)^\\top V^\\star_{h+1})}_{w_\\hi} \\\\\n V^\\star_\\hi(s) & = \\max_a Q^\\star_\\hi(s, a) \\\\\n \\pi^\\star_\\hi(s) & = \\arg\\max_a Q^\\star_\\hi(s, a)\n\\end{aligned}Qh(s,a)Vh(s)πh(s)=rh(s,a)+EsPh(s,a)[Vh+1(s)]=ϕ(s,a)θh+(μhϕ(s,a))Vh+1=ϕ(s,a)wh(θh+(μh)Vh+1)=amaxQh(s,a)=argamaxQh(s,a)","enumerator":"9.22","key":"Mj5obllJIV"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"VtKv2PpseQ"}],"key":"ywDLJss2cW"},{"type":"paragraph","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"children":[{"type":"text","value":"Show that ","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"key":"fcckLmkt9g"},{"type":"inlineMath","value":"Q^\\pi_\\hi","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"html":"QhπQ^\\pi_\\hiQhπ","key":"jn3eNm8OWH"},{"type":"text","value":" is also linear with respect to ","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"key":"OMZIstf3ZW"},{"type":"inlineMath","value":"\\phi(s, a)","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"html":"ϕ(s,a)\\phi(s, a)ϕ(s,a)","key":"WtxMH6oEzx"},{"type":"text","value":" for any policy ","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"key":"F8PZJKZCYE"},{"type":"text","value":"π","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"key":"WxcLRWV7Xi"},{"type":"text","value":".","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"key":"kFldkhAJMk"}],"key":"SzvNhb1M3E"}],"key":"j55UJbTJkN"},{"type":"heading","depth":3,"position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"UCB-VI in a linear MDP","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"eN3VhbwH2H"}],"label":"lin_ucb_vi","identifier":"lin_ucb_vi","html_id":"lin-ucb-vi","enumerator":"9.4.2","key":"dzV6gOadhd"},{"type":"heading","depth":4,"position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"children":[{"type":"text","value":"Modelling the transitions","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"key":"Vws76qh9UZ"}],"identifier":"modelling-the-transitions","label":"Modelling the 
transitions","html_id":"modelling-the-transitions-1","implicit":true,"enumerator":"9.4.2.1","key":"y7LX9RHWA4"},{"type":"paragraph","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"children":[{"type":"text","value":"This linear assumption on the MDP will also allow us to model the unknown dynamics ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"IKoQQTvJ2N"},{"type":"inlineMath","value":"P^?_\\hi(s' \\mid s, a)","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"html":"Ph?(ss,a)P^?_\\hi(s' \\mid s, a)Ph?(ss,a)","key":"zQAv1MrPyv"},{"type":"text","value":" with techniques from ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"KVVaEjHhCL"},{"type":"strong","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"NRd7MrnuSl"}],"key":"RZWhLpgEhQ"},{"type":"text","value":" (SL). Recall that SL is useful for estimating conditional expectations by minimizing mean squared error. We can rephrase the estimation of ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"mho2Q0DsFx"},{"type":"inlineMath","value":"P^?_\\hi(s' \\mid s, a)","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"html":"Ph?(ss,a)P^?_\\hi(s' \\mid s, a)Ph?(ss,a)","key":"H66c49MdqE"},{"type":"text","value":" as a least-squares problem as follows: Write ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"hPAAUEnHD3"},{"type":"inlineMath","value":"\\delta_s","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"html":"δs\\delta_sδs","key":"O23Ixvq8vT"},{"type":"text","value":" to denote a one-hot vector in ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"BD3aCCMppU"},{"type":"inlineMath","value":"\\mathbb{R}^{|\\mathcal{S}|}","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"html":"RS\\mathbb{R}^{|\\mathcal{S}|}RS","key":"fE9kI9E88h"},{"type":"text","value":", with a ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"QAuuUQ0who"},{"type":"text","value":"1","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"QtNcFlEO5F"},{"type":"text","value":" in the ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"uM1ZJ7p4ms"},{"type":"inlineMath","value":"s","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"html":"sss","key":"Y65A5IXRm6"},{"type":"text","value":"-th entry and ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"VgCS9Xr4dh"},{"type":"text","value":"0","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"NvlIKPZJW2"},{"type":"text","value":" everywhere else. 
Note that","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"xciiodHFqe"}],"key":"N2HxBvn0FV"},{"type":"math","value":"\\E_{s' \\sim P_h(\\cdot \\mid s, a)} [\\delta_{s'}] = P_h(\\cdot \\mid s, a) = \\mu_h^\\star \\phi(s, a).","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"html":"EsPh(s,a)[δs]=Ph(s,a)=μhϕ(s,a).\\E_{s' \\sim P_h(\\cdot \\mid s, a)} [\\delta_{s'}] = P_h(\\cdot \\mid s, a) = \\mu_h^\\star \\phi(s, a).EsPh(s,a)[δs]=Ph(s,a)=μhϕ(s,a).","enumerator":"9.23","key":"bf5FEPAk6o"},{"type":"paragraph","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"children":[{"type":"text","value":"Furthermore, since the expectation here is linear with respect to ","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"JdMzdig4w0"},{"type":"inlineMath","value":"\\phi(s, a)","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"html":"ϕ(s,a)\\phi(s, a)ϕ(s,a)","key":"oxXAdOJLjb"},{"type":"text","value":", we can directly apply least-squares multi-target linear regression to construct the estimate","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"NO3XE0v0MG"}],"key":"jttTq8zX8I"},{"type":"math","value":"\\hat \\mu = \\arg\\min_{\\mu \\in \\mathbb{R}^{|\\mathcal{S}| \\times d}} \\sum_{t=0}^{T-1} \\|\\mu \\phi(s_h^i, a_h^i) - \\delta_{s_{h+1}^i} \\|_2^2.","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"html":"μ^=argminμRS×dt=0T1μϕ(shi,ahi)δsh+1i22.\\hat \\mu = \\arg\\min_{\\mu \\in \\mathbb{R}^{|\\mathcal{S}| \\times d}} \\sum_{t=0}^{T-1} \\|\\mu \\phi(s_h^i, a_h^i) - \\delta_{s_{h+1}^i} \\|_2^2.μ^=argμRS×dmint=0T1μϕ(shi,ahi)δsh+1i22.","enumerator":"9.24","key":"hZpve9qDti"},{"type":"paragraph","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"children":[{"type":"text","value":"This has a well-known closed-form solution:","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"mSfpglEi4A"}],"key":"MN5iay7G7e"},{"type":"math","value":"\\begin{aligned}\n \\hat \\mu^\\top & = (A_h^t)^{-1} \\sum_{i=0}^{t-1} \\phi(s_h^i, a_h^i) \\delta_{s_{h+1}^i}^\\top \\\\\n \\text{where} \\quad A_h^t & = \\sum_{i=0}^{t-1} \\phi(s_h^i, a_h^i) \\phi(s_h^i, a_h^i)^\\top + \\lambda I\n\\end{aligned}","position":{"start":{"line":322,"column":1},"end":{"line":325,"column":1}},"html":"μ^=(Aht)1i=0t1ϕ(shi,ahi)δsh+1iwhereAht=i=0t1ϕ(shi,ahi)ϕ(shi,ahi)+λI\\begin{aligned}\n \\hat \\mu^\\top & = (A_h^t)^{-1} \\sum_{i=0}^{t-1} \\phi(s_h^i, a_h^i) \\delta_{s_{h+1}^i}^\\top \\\\\n \\text{where} \\quad A_h^t & = \\sum_{i=0}^{t-1} \\phi(s_h^i, a_h^i) \\phi(s_h^i, a_h^i)^\\top + \\lambda I\n\\end{aligned}μ^whereAht=(Aht)1i=0t1ϕ(shi,ahi)δsh+1i=i=0t1ϕ(shi,ahi)ϕ(shi,ahi)+λI","enumerator":"9.25","key":"MNlyynsQ9Q"},{"type":"paragraph","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"children":[{"type":"text","value":"where we include a ","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"SUDRmcSk4v"},{"type":"inlineMath","value":"\\lambda I","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"html":"λI\\lambda IλI","key":"FelrlIFkDQ"},{"type":"text","value":" term to ensure that the matrix 
","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"HGdVEW8lp8"},{"type":"inlineMath","value":"A^t_h","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"html":"AhtA^t_hAht","key":"CQIlMD3PS6"},{"type":"text","value":" is invertible. (This can also be derived by adding a ","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"CyvZWFiyxL"},{"type":"inlineMath","value":"\\lambda \\|\\mu\\|_{\\text{F}}^2","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"html":"λμF2\\lambda \\|\\mu\\|_{\\text{F}}^2λμF2","key":"prwjUmj8Od"},{"type":"text","value":" regularization term to the objective.) We can directly plug in this estimate into ","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"T4I4aQnvu0"},{"type":"inlineMath","value":"\\hat{P}^t_h(\\cdot \\mid s, a) = \\hat \\mu^t_h \\phi(s, a)","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"html":"P^ht(s,a)=μ^htϕ(s,a)\\hat{P}^t_h(\\cdot \\mid s, a) = \\hat \\mu^t_h \\phi(s, a)P^ht(s,a)=μ^htϕ(s,a)","key":"DQ2rOnRVsD"},{"type":"text","value":".","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"wWL68P3bQZ"}],"key":"Q0q2lMpiC6"},{"type":"heading","depth":4,"position":{"start":{"line":329,"column":1},"end":{"line":329,"column":1}},"children":[{"type":"text","value":"Reward bonus","position":{"start":{"line":329,"column":1},"end":{"line":329,"column":1}},"key":"OSbbJg50F1"}],"identifier":"reward-bonus","label":"Reward bonus","html_id":"reward-bonus-1","implicit":true,"enumerator":"9.4.2.2","key":"tOhzqdw7TI"},{"type":"paragraph","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"Now, to design the reward bonus, we can’t apply Hoeffding anymore, since the terms no longer involve sample means of bounded random variables; Instead, we’re incorporating information across different states and actions. 
Rather, we can construct an upper bound using ","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"meDVDJ51Vv"},{"type":"emphasis","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"Chebyshev’s inequality","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"RN1l2PNk7u"}],"key":"lPhlgfsH8x"},{"type":"text","value":" in the same way we did for the LinUCB algorithm in the MAB setting ","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"kEmlKoyrVB"},{"type":"crossReference","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"Section ","key":"dyKnbqtdbK"},{"type":"text","value":"3.8.1","key":"nsaWOhASsX"}],"identifier":"lin_ucb","label":"lin_ucb","kind":"heading","template":"Section %s","enumerator":"3.8.1","resolved":true,"html_id":"lin-ucb","remote":true,"url":"/bandits","dataUrl":"/bandits.json","key":"g7oOqUbMjJ"},{"type":"text","value":":","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"r7jz4wVoJq"}],"key":"vaT7jHwiXO"},{"type":"math","value":"b^t_\\hi(s, a) = \\beta \\sqrt{\\phi(s, a)^\\top (A^t_h)^{-1} \\phi(s, a)}, \\quad \\beta = \\tilde O(d \\hor).","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"bht(s,a)=βϕ(s,a)(Aht)1ϕ(s,a),β=O~(dH).b^t_\\hi(s, a) = \\beta \\sqrt{\\phi(s, a)^\\top (A^t_h)^{-1} \\phi(s, a)}, \\quad \\beta = \\tilde O(d \\hor).bht(s,a)=βϕ(s,a)(Aht)1ϕ(s,a),β=O~(dH).","enumerator":"9.26","key":"PGg1RzQgNL"},{"type":"paragraph","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"Note that this isn’t explicitly inversely proportional to ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"ihpPHguiXC"},{"type":"inlineMath","value":"N_h^t(s, a)","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"Nht(s,a)N_h^t(s, a)Nht(s,a)","key":"MXlJOlEXoJ"},{"type":"text","value":" as in the original UCB-VI bonus term ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"FeaJnWfZuT"},{"type":"crossReference","kind":"equation","identifier":"eq:ucb_vi_bonus","label":"eq:ucb_vi_bonus","children":[{"type":"text","value":"(","key":"rK2Uqkid5b"},{"type":"text","value":"9.8","key":"BxNd2A4muF"},{"type":"text","value":")","key":"JbJ1NTSrIi"}],"template":"(%s)","enumerator":"9.8","resolved":true,"html_id":"eq-ucb-vi-bonus","key":"nuJgnmZDb2"},{"type":"text","value":". Rather, it is inversely proportional to the amount that the direction ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"H6wUlGq0yV"},{"type":"inlineMath","value":"\\phi(s, a)","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"ϕ(s,a)\\phi(s, a)ϕ(s,a)","key":"YJLAsKjFSE"},{"type":"text","value":" has been explored in the history. 
That is, if ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"uC691L2UUr"},{"type":"inlineMath","value":"A_h^t","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"AhtA_h^tAht","key":"tvzX1EL2X1"},{"type":"text","value":" has a large component in the direction ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"KhsJPLSrJ3"},{"type":"inlineMath","value":"\\phi(s, a)","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"ϕ(s,a)\\phi(s, a)ϕ(s,a)","key":"U561GnTHZq"},{"type":"text","value":", implying that this direction is well explored, then the bonus term will be small, and vice versa.","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"UPd6qehqjm"}],"key":"Qu2c1FkVcH"},{"type":"paragraph","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"We can now plug in these transition estimates and reward bonuses into the UCB-VI algorithm ","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"Jza9pDyCQd"},{"type":"crossReference","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"(","key":"iaTufiNnLu"},{"type":"text","value":"9.16","key":"ttKDIVEmCz"},{"type":"text","value":")","key":"NfakAioNRX"}],"identifier":"ucb-vi-alg","label":"ucb-vi-alg","kind":"equation","template":"(%s)","enumerator":"9.16","resolved":true,"html_id":"ucb-vi-alg","key":"jPd0irBylg"},{"type":"text","value":".","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"VmmHtpcSfR"}],"key":"x1mMBolJ3t"},{"type":"heading","depth":4,"position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"children":[{"type":"text","value":"Performance","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"zTNc6bXxOD"}],"identifier":"performance","label":"Performance","html_id":"performance","implicit":true,"enumerator":"9.4.2.3","key":"Obd9HI9shV"},{"type":"proof","kind":"theorem","label":"lin_ucb_vi_regret","identifier":"lin_ucb_vi_regret","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"LinUCB-VI regret","position":{"start":{"line":341,"column":1},"end":{"line":341,"column":1}},"key":"KUrFXbSCPv"}],"key":"x8iyv7E1hZ"},{"type":"paragraph","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"The LinUCB-VI algorithm achieves expected regret","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"zZXpHTGw0h"}],"key":"ThS73YHtif"},{"type":"math","value":"\\E[\\text{Regret}_T] = \\E\\left[\\sum_{t=0}^{T-1} V^\\star_0(s_0) - V^{\\pi^t}_0(s_0) \\right] \\le \\tilde O(H^2 d^{1.5} \\sqrt{T})","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"html":"E[RegretT]=E[t=0T1V0(s0)V0πt(s0)]O~(H2d1.5T)\\E[\\text{Regret}_T] = \\E\\left[\\sum_{t=0}^{T-1} V^\\star_0(s_0) - V^{\\pi^t}_0(s_0) \\right] \\le \\tilde O(H^2 d^{1.5} \\sqrt{T})E[RegretT]=E[t=0T1V0(s0)V0πt(s0)]O~(H2d1.5T)","enumerator":"9.27","key":"mlolFoNSiB"}],"enumerator":"9.3","html_id":"lin-ucb-vi-regret","key":"jEOb2zuXAH"},{"type":"paragraph","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"children":[{"type":"text","value":"Comparing this to our bound for UCB-VI in an environment without this linear assumption, we see that we go from a sample complexity 
of ","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"pmsW6LsAE8"},{"type":"inlineMath","value":"\\tilde \\Omega(H^4 |\\mathcal{S}||\\mathcal{A}|)","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"html":"Ω~(H4SA)\\tilde \\Omega(H^4 |\\mathcal{S}||\\mathcal{A}|)Ω~(H4S∣∣A)","key":"CzI9JV2N2y"},{"type":"text","value":" to ","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"vDrDbVPa6M"},{"type":"inlineMath","value":"\\tilde \\Omega(H^4 d^{3})","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"html":"Ω~(H4d3)\\tilde \\Omega(H^4 d^{3})Ω~(H4d3)","key":"XRBUSuXcuj"},{"type":"text","value":". This new sample complexity only depends on the feature dimension and not on the state or action space of the MDP!","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"Pwrnr5qK06"}],"key":"N1u5EVfuk6"},{"type":"heading","depth":2,"position":{"start":{"line":351,"column":1},"end":{"line":351,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":351,"column":1},"end":{"line":351,"column":1}},"key":"i8dvX0C90r"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"9.5","key":"SZSjJ1YHUb"},{"type":"paragraph","position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"children":[{"type":"text","value":"In this chapter, we’ve explored how to explore in an unknown MDP.","position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"key":"EIHj8T4ZCv"}],"key":"CvCrICQGa8"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":355,"column":1},"end":{"line":361,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":355,"column":1},"end":{"line":356,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"children":[{"type":"text","value":"We first discussed the explore-then-exploit algorithm ","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"iuGagg4Ypm"},{"type":"crossReference","kind":"proof:definition","identifier":"explore_then_exploit","label":"explore_then_exploit","children":[{"type":"text","value":"Definition ","key":"u0P3zwCvEW"},{"type":"text","value":"9.2","key":"Xkpjk7fPDJ"}],"template":"Definition %s","enumerator":"9.2","resolved":true,"html_id":"explore-then-exploit","key":"TbeGfXQML4"},{"type":"text","value":", a simple way to explore a deterministic MDP by visiting all state-action pairs.","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"ghIjh72WmF"}],"key":"W4jBLfabvV"}],"key":"zIpTWu6Wem"},{"type":"listItem","spread":true,"position":{"start":{"line":357,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"children":[{"type":"text","value":"We then discussed how to treat an unknown MDP as a MAB ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"bb7QkZrnwa"},{"type":"crossReference","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"children":[{"type":"text","value":"Section ","key":"R8wKwSCR9S"},{"type":"text","value":"9.2","key":"AUtTLhEogo"}],"identifier":"mdp_mab","label":"mdp_mab","kind":"heading","template":"Section %s","enumerator":"9.2","resolved":true,"html_id":"mdp-mab","key":"h5nO0EMYTX"},{"type":"text","value":", and how 
this approach is inefficient since it doesn’t make use of relationships between policies.","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"iX8LXPT3eT"}],"key":"OUpDBiF42h"}],"key":"TFooAf8673"},{"type":"listItem","spread":true,"position":{"start":{"line":359,"column":1},"end":{"line":360,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"children":[{"type":"text","value":"We then introduced the UCB-VI algorithm ","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"key":"VGARGCqSRi"},{"type":"crossReference","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"children":[{"type":"text","value":"(","key":"ejbteIBNsO"},{"type":"text","value":"9.16","key":"psGwVovIBw"},{"type":"text","value":")","key":"f4NsIZVqUT"}],"identifier":"ucb-vi-alg","label":"ucb-vi-alg","kind":"equation","template":"(%s)","enumerator":"9.16","resolved":true,"html_id":"ucb-vi-alg","key":"z4P6smXzDm"},{"type":"text","value":", which models the unknown MDP by a proxy MDP with a reward bonus term that encourages exploration.","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"key":"l1FOWQK3eE"}],"key":"zlJ4dbq7Yc"}],"key":"R70Acn9q54"},{"type":"listItem","spread":true,"position":{"start":{"line":361,"column":1},"end":{"line":361,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":361,"column":1},"end":{"line":361,"column":1}},"children":[{"type":"text","value":"Finally, assuming that the transitions and rewards are linear with respect to a feature transformation of the state and action, we introduced the LinUCB-VI algorithm ","position":{"start":{"line":361,"column":1},"end":{"line":361,"column":1}},"key":"zkNcG50VtF"},{"type":"crossReference","position":{"start":{"line":361,"column":1},"end":{"line":361,"column":1}},"children":[{"type":"text","value":"Section ","key":"J89B09ZezP"},{"type":"text","value":"9.4.2","key":"Y5XwAKHgdW"}],"identifier":"lin_ucb_vi","label":"lin_ucb_vi","kind":"heading","template":"Section %s","enumerator":"9.4.2","resolved":true,"html_id":"lin-ucb-vi","key":"nJH0pAUqYC"},{"type":"text","value":", which has a sample complexity independent of the size of the state and action spaces.","position":{"start":{"line":361,"column":1},"end":{"line":361,"column":1}},"key":"FO4m2xT7JX"}],"key":"NF1EcQHC07"}],"key":"Z8odEHgUsC"}],"key":"Kg7Xzhfyvb"}],"key":"aKnB7DD28R"}],"key":"WKeB1nmL7Y"},"references":{"cite":{"order":["agarwal_reinforcement_2022"],"data":{"agarwal_reinforcement_2022":{"label":"agarwal_reinforcement_2022","enumerator":"1","html":"Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). 
Reinforcement Learning: Theory and Algorithms."}}}},"footer":{"navigation":{"prev":{"title":"8 Tree Search Methods","url":"/planning","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"Appendix: Background","url":"/background","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"16601dd72e8b5e5b5a3530b6022d894d109f606501a1e0863d8a727655c4c852","slug":"exploration","location":"/exploration.md","dependencies":[],"frontmatter":{"title":"9 Exploration in MDPs","numbering":{"all":{"enabled":true},"enumerator":{"template":"9.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/sparse_reward_mdp-d4beda7e57ed42a0bbe96cfa6c5ecbbe.png","thumbnailOptimized":"/build/sparse_reward_mdp-d4beda7e57ed42a0bbe96cfa6c5ecbbe.webp","exports":[{"format":"md","filename":"exploration.md","url":"/build/exploration-81ded2f1b068acb6df548cb9ef312d11.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"hA52VhLzQm"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"9.1","key":"xydYg6OlGP"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"One of the key challenges of reinforcement learning is the ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"Yw8NtjieaZ"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"exploration-exploitation tradeoff","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"i3L1iQ5uCY"}],"key":"zesuokLdOv"},{"type":"text","value":". 
Should we *exploit* actions we know will give high reward, or should we *explore* different actions to discover potentially better strategies? An algorithm that doesn't explore effectively might easily *overfit* to certain areas of the state space and fail to generalize once it enters a region it hasn't yet seen. The algorithms we saw in the chapter on fitted DP ([5 Fitted Dynamic Programming Algorithms](./fitted_dp.md)) suffer from this issue.

In [3 Multi-Armed Bandits](./bandits.md), where the state never changes so all we care about are the actions, we saw algorithms like UCB (Section 3.6) and Thompson sampling (Section 3.7) that incentivize the learner to explore arms that it is uncertain about. In this chapter, we will see how to generalize these ideas to the MDP setting.

**Definition 9.1** (Per-episode regret). To quantify the performance of a learning algorithm, we will consider its per-episode regret over $T$ timesteps/episodes:

$$\text{Regret}_T = \mathbb{E}\left[ \sum_{t=0}^{T-1} V^\star_0(s_0) - V^{\pi^t}_0(s_0) \right] \tag{9.1}$$

where $\pi^t$ is the policy generated by the algorithm at the $t$th iteration.

### 9.1.1 Sparse reward

Exploration is especially crucial in **sparse reward** problems, where reward doesn't come until after many steps, and algorithms which do not *systematically* explore new states may fail to learn anything meaningful (within a reasonable amount of time).

For example, policy gradient algorithms require the gradient to be nonzero in order to learn. If we never observe any reward, the gradient will always be zero, and the policy will never change or improve.

**Example 9.1** (Sparse Reward MDP). Here's a simple example of an MDP with sparse reward:

![Sparse reward MDP](shared/sparse_reward_mdp.png)

There are $|\mathcal{S}|$ states. The agent starts in the leftmost state. In every state, there are three possible actions, two of which move the agent left and one which moves the agent right. The reward function assigns $r=1$ to the rightmost cell.

### 9.1.2 Exploration in deterministic MDPs

Let us address the exploration problem in a *deterministic* MDP, where taking action $a$ in state $s$ always leads to the state $P(s, a) \in \mathcal{S}$. In this simple setting, there will be no "automatic" exploration due to randomness, so our strategy must actively explore new states. One simple strategy is to visit every possible state-action pair to learn the entire MDP. Then, once the MDP is known, we can use DP to solve for the optimal policy.
(This should remind you of the explore-then-commit approach from Section 3.4.)

**Definition 9.2** (Explore-then-exploit, for deterministic MDPs). We'll keep a set $K$ of all the $(s, a, r, s')$ tuples we've observed. Each episode, we'll choose an unseen state-action pair for which the reward and the next state are unknown, and take the shortest path there. We assume that every state can be reached from the initial state within a single episode.

1. Initialize $K \gets \emptyset$.
2. For each unseen pair $(s, a)$: using our known transitions $K$, compute the shortest path $\tilde \pi$ to $(s, a)$; execute $\tilde \pi$ to visit $(s, a)$ and observe $r = r(s, a)$ and $s' = P(s, a)$; then update $K \gets K \cup \{ (s, a, r, s') \}$.
3. Once every pair has been visited, compute the optimal policy $\pi^\star$ in the now fully-known MDP $K$ (e.g. using policy iteration) and return $\pi^\star$.

The shortest path computation can be implemented using DP. We leave this as an exercise.
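To make the two phases concrete, here is a minimal sketch in Python. The environment accessors `step` and `reward`, the state and action collections, and the shortcut of querying an unseen pair directly (rather than navigating to it along a shortest path, which the text leaves as an exercise) are all assumptions made purely for illustration.

```python
def explore_then_exploit(states, actions, step, reward, horizon):
    """Sketch of explore-then-exploit (Definition 9.2) for a small deterministic MDP.

    step(s, a) -> s' and reward(s, a) -> r are hypothetical deterministic
    environment calls; they are not part of the text's formal setup.
    """
    # Exploration phase: visit every (s, a) once, recording the model K.
    K = {}  # known transitions: (s, a) -> (r, s')
    for s in states:
        for a in actions:
            # In an episodic environment we would first navigate from the initial
            # state to s along a shortest path through the transitions already in K;
            # with a deterministic simulator we can simply query the unseen pair.
            K[(s, a)] = (reward(s, a), step(s, a))

    # Exploitation phase: the MDP is now fully known, so solve it with
    # finite-horizon dynamic programming (backward induction).
    V = {s: 0.0 for s in states}  # value after the final step
    policy = {}
    for h in reversed(range(horizon)):
        V_next, V = V, {}
        for s in states:
            q = {a: K[(s, a)][0] + V_next[K[(s, a)][1]] for a in actions}
            policy[(h, s)] = max(q, key=q.get)
            V[s] = q[policy[(h, s)]]
    return policy
```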
**Theorem 9.1** (Performance of explore-then-exploit). As long as every state can be reached from $s_0$ within a single episode, i.e. $|\mathcal{S}| \le H$, this will eventually be able to explore all $|\mathcal{S}| |\mathcal{A}|$ state-action pairs, adding one new transition per episode. We know it will take at most $|\mathcal{S}| |\mathcal{A}|$ iterations to explore the entire MDP, after which $\pi^t = \pi^\star$, incurring no additional regret. For each $\pi^t$ up until then, corresponding to the shortest-path policies $\tilde \pi$, the value of policy $\pi^t$ will differ from that of $\pi^\star$ by at most $H$, since the policies will differ by at most 1 reward at each timestep. So,

$$\sum_{t=0}^{T-1} V^\star_0 - V_0^{\pi^t} \le |\mathcal{S}||\mathcal{A}| H. \tag{9.2}$$

(Note that this MDP and algorithm are deterministic, so the regret is not random.)

## 9.2 Treating an unknown MDP as a MAB

We also explored the exploration-exploitation tradeoff in [3 Multi-Armed Bandits](./bandits.md). Recall that in the MAB setting, we have $K$ arms, each of which has an unknown reward distribution, and we want to learn which of the arms is *optimal*, i.e. has the highest mean reward.

One algorithm that struck a good balance between exploration and exploitation was the **upper confidence bound** (UCB) algorithm (Section 3.6): for each arm, we construct a *confidence interval* for its true mean reward, and then choose the arm with the highest upper confidence bound. In summary,

$$k_{t+1} \gets \arg\max_{k \in [K]} \frac{R^{k}_t}{N^{k}_t} + \sqrt{\frac{\ln(2t/\delta)}{2 N^{k}_t}} \tag{9.3}$$

where $N_t^k$ indicates the number of times arm $k$ has been pulled up until time $t$, $R_t^k$ indicates the total reward obtained by pulling arm $k$ up until time $t$, and $\delta > 0$ controls the width of the confidence interval.
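As a reminder of how (9.3) is applied, here is a minimal sketch; the running statistics passed in are hypothetical, and we assume every arm has already been pulled at least once.

```python
import math

def ucb_arm(total_reward, pull_count, t, delta):
    """Select the next arm by the UCB rule (9.3).

    total_reward[k] plays the role of R_t^k and pull_count[k] of N_t^k;
    we assume every arm has been pulled at least once.
    """
    scores = [
        total_reward[k] / pull_count[k]
        + math.sqrt(math.log(2 * t / delta) / (2 * pull_count[k]))
        for k in range(len(pull_count))
    ]
    return max(range(len(scores)), key=scores.__getitem__)
```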
How might we extend UCB to the MDP case? Let us formally describe an unknown MDP as a MAB problem. In an unknown MDP, we want to learn which *policy* is optimal. So if we want to apply MAB techniques to solving an MDP, it makes sense to think of *arms* as *policies*. There are $K = (|\mathcal{A}|^{|\mathcal{S}|})^H$ deterministic policies in a finite MDP. Then, "pulling" arm $\pi$ corresponds to using $\pi$ to act through a trajectory in the MDP, and observing the total reward.

> **Attention.** Which quantity that we have seen so far equals the mean reward from arm $\pi$?

Recall that UCB incurs regret $\tilde{O}(\sqrt{TK})$, where $T$ is the number of pulls and $K$ is the number of arms. So in the MDP-as-MAB problem, using UCB for $T$ episodes would achieve regret

$$\tilde{O}(\sqrt{|\mathcal{A}|^{|\mathcal{S}|H} T}) \tag{9.4}$$

This scales *exponentially* in $|\mathcal{S}|$ and $H$, which quickly becomes intractable. Notably, this method doesn't consider the information that we gain across different policies. We can illustrate this with the following example:

**Example 9.2** (Treating an MDP as a MAB). Consider a "coin MDP" with two states, "heads" and "tails", two actions, "Y" and "N", and a time horizon of $H=2$. The state transition flips the coin and doesn't depend on the action. The reward only depends on the action: taking action Y gives reward $1$, and taking action N gives reward $0$.

Suppose we collect data from the two constant policies $\pi_{\text{Y}}(s) = \text{Y}$ and $\pi_{\text{N}}(s) = \text{N}$. Now we want to learn about the policy $\tilde{\pi}$ that takes action Y and then N. Do we need to collect data from $\tilde{\pi}$ to evaluate it? No: since the reward only depends on the action, we can infer its value from our data on the policies $\pi_{\text{Y}}$ and $\pi_{\text{N}}$. However, if we treat the MDP as a bandit in which $\tilde{\pi}$ is a new, unknown arm, we ignore the known correlation between the action and the reward.
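To spell out the inference in the example: since the reward depends only on the action and $H = 2$, the value of $\tilde{\pi}$ from the start of an episode is

$$V^{\tilde{\pi}} = r(\text{Y}) + r(\text{N}) = 1 + 0 = 1,$$

and both $r(\text{Y})$ and $r(\text{N})$ are already pinned down by the data from $\pi_{\text{Y}}$ (which earns reward 1 at every step) and $\pi_{\text{N}}$ (which earns reward 0 at every step), so no new trajectories from $\tilde{\pi}$ are required.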
## 9.3 UCB-VI

The approach above is inefficient: we shouldn't need to consider all $|\mathcal{A}|^{|\mathcal{S}| H}$ deterministic policies to achieve low regret. Rather, all we need to describe the optimal policy is $Q^\star$, which has $H |\mathcal{S}||\mathcal{A}|$ entries to be learned. Can we borrow ideas from UCB to reduce the regret to this order (i.e. polynomial in $|\mathcal{S}|$, $|\mathcal{A}|$, and $H$)?

One way to frame the UCB algorithm is that, when choosing arms, we optimize over a *proxy reward* that is the sum of the estimated mean reward and an exploration term. In the **UCB-VI** algorithm, we will extend this idea to the case of an unknown MDP $\mathcal{M}^{?}$ by modelling a proxy MDP $\tilde{\mathcal{M}}$ with a reward function that encourages exploration. Then, we will use DP to solve for the optimal policy in $\tilde{\mathcal{M}}$.

**Assumptions:** For simplicity, here we assume the reward function of $\mathcal{M}^{?}$ is known, so we only need to model the state transitions, though the rewards can be modelled similarly. We will also consider the more general case of a **time-varying** MDP, where the transition and reward functions can change over time. We take the convention that $P_h$ is the distribution of $s_{h+1} \mid s_{h}, a_{h}$ and $r_h$ is applied to $s_h, a_h$.

At a high level, the UCB-VI algorithm can be described as follows:

1. **Modelling:** Use previous data to model the transitions $\hat{P}_0, \dots, \hat{P}_{H-1}$.

2. **Reward bonus:** Design a reward bonus $b_h(s, a) \in \mathbb{R}$ to encourage exploration, analogous to the UCB term.

3. **Optimistic planning:** Use DP to compute the optimal policy $\hat \pi_h(s)$ in the modelled MDP

$$\tilde{\mathcal{M}} = (\mathcal{S}, \mathcal{A}, \{ \hat{P}_h \}_{h \in [H]}, \{ r_h + b_h \}_{h \in [H]}, H). \tag{9.5}$$

4. **Execution:** Use $\hat \pi_h(s)$ to collect a new trajectory, and repeat.

We detail each of these steps below. The full definition follows in (9.16).

### 9.3.1 Modelling the transitions

We seek to approximate $P_h(s_{h+1} \mid s_h, a_h) = \frac{\mathbb{P}(s_h, a_h, s_{h+1})}{\mathbb{P}(s_h, a_h)}$. We can estimate these using their sample probabilities from the dataset.
That is, define

$$\begin{aligned}
N_h^t(s, a, s') & := \sum_{i=0}^{t-1} \mathbf{1}\left\{ (s_h^i, a_h^i, s_{h+1}^i) = (s, a, s') \right\} \\
N_h^t(s, a) & := \sum_{i=0}^{t-1} \mathbf{1}\left\{ (s_h^i, a_h^i) = (s, a) \right\}
\end{aligned} \tag{9.6}$$

Then we can model

$$\hat{P}_h^t(s' \mid s, a) = \frac{N_h^t(s, a, s')}{N_h^t(s, a)}. \tag{9.7}$$
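A minimal sketch of the count-based estimates (9.6) and (9.7) in Python; the data layout (each episode stored as its state sequence plus the actions taken) is an assumption made here for illustration.

```python
from collections import defaultdict

def estimate_transitions(trajectories, horizon):
    """Empirical model \\hat{P}_h^t(s' | s, a) from (9.6)-(9.7).

    `trajectories` is assumed to be a list of episodes, each a tuple
    (states, actions) with states s_0, ..., s_H and actions a_0, ..., a_{H-1}.
    """
    counts_sas = defaultdict(int)  # N_h^t(s, a, s')
    counts_sa = defaultdict(int)   # N_h^t(s, a)
    for states, actions in trajectories:
        for h in range(horizon):
            s, a, s_next = states[h], actions[h], states[h + 1]
            counts_sas[(h, s, a, s_next)] += 1
            counts_sa[(h, s, a)] += 1

    def P_hat(h, s, a, s_next):
        n_sa = counts_sa[(h, s, a)]
        if n_sa == 0:
            return 0.0  # (s, a) never visited at step h; the estimate is undefined
        return counts_sas[(h, s, a, s_next)] / n_sa

    return P_hat
```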
### Reward bonus

To motivate the reward bonus term $b_\hi^t(s, a)$, recall how we designed the reward bonus term for UCB:

1. We used Hoeffding's inequality to bound, with high probability, how far the sample mean $\hat \mu_t^k$ deviated from the true mean $\mu^k$.
2. By inverting this inequality, we obtained a $(1-\delta)$-confidence interval for the true mean, centered at our estimate.
3. To make this bound *uniform* across all timesteps $t \in [T]$, we applied the union bound and multiplied $\delta$ by a factor of $T$.

We'd like to do the same for UCB-VI, and construct the bonus term such that $V^\star_\hi(s) \le \hat{V}_\hi^t(s)$ with high probability. However, our construction will be more complex than the MAB case, since $\hat{V}_\hi^t(s)$ depends on the bonus $b_\hi^t(s, a)$ implicitly via DP.
We claim that the bonus term that gives the proper bound is

$$
b_\hi^t(s, a) = 2 H \sqrt{\frac{\log( |\mathcal{S}||\mathcal{A}|H T/\delta )}{N_\hi^t(s, a)}}.
$$ (eq:ucb_vi_bonus)
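A direct transcription of this bonus as a Python helper might look as follows (a sketch; guarding unvisited pairs with `np.maximum(visits, 1)` is an assumption on top of the formula, not part of it):

```python
import numpy as np

def ucb_vi_bonus(visits, horizon, num_states, num_actions, total_episodes, delta):
    """Exploration bonus b_h^t(s, a) from the display above.

    `visits` holds the counts N_h^t(s, a); unvisited pairs get an O(H) bonus.
    """
    log_term = np.log(num_states * num_actions * horizon * total_episodes / delta)
    return 2 * horizon * np.sqrt(log_term / np.maximum(visits, 1))
```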
We will only provide a heuristic sketch of the proof; see {cite}`agarwal_reinforcement_2022` (Section 7.3) for a full proof.

:::{prf:remark} UCB-VI reward bonus construction
:label: ucb_vi_bonus

We aim to show that, with high probability,

$$
V_\hi^\star(s) \le \hat{V}_\hi^t(s) \quad \forall t \in [T], h \in [H], s \in \mathcal{S}.
$$

We'll do this by bounding the error incurred at each step of DP. Recall that DP solves for $\hat{V}_\hi^t(s)$ recursively as follows:

$$
\hat{V}_\hi^t(s) = \max_{a \in \mathcal{A}} \left[ \tilde r^t_\hi(s, a) + \E_{s' \sim \hat{P}_\hi^t(\cdot \mid s, a)} \left[ \hat{V}_{h+1}^t(s') \right] \right]
$$

where $\tilde r^t_\hi(s, a) = r_\hi(s, a) + b_\hi^t(s, a)$ is the reward function of our modelled MDP $\tilde{\mathcal{M}}^t$. On the other hand, since the bonus is nonnegative, we know that $V^\star$ must satisfy

$$
V^\star_\hi(s) \le \max_{a \in \mathcal{A}} \left[ \tilde r^t_\hi(s, a) + \E_{s' \sim P^?_\hi(\cdot \mid s, a)} [V^\star_{\hi+1}(s')] \right],
$$

so it suffices to bound the difference between the two inner expectations.
There are two sources of error:

1. The value functions $\hat{V}^t_{h+1}$ vs. $V^\star_{h+1}$
2. The transition probabilities $\hat{P}_\hi^t$ vs. $P^?_\hi$.

We can bound these individually, and then combine them by the triangle inequality. For the former, we can simply bound the difference by $H$, assuming that the rewards are within $[0, 1]$. Now, all that is left is to bound the error from the transition probabilities:

$$
\text{error} = \left| \E_{s' \sim \hat{P}_\hi^t(\cdot \mid s, a)} \left[ V^\star_{h+1}(s') \right] - \E_{s' \sim P^?_\hi(\cdot \mid s, a)} \left[ V^\star_{h+1}(s') \right] \right|.
$$ (err)

Let us bound this term for a fixed $s, a, h, t$. (Later we can make this uniform across $s, a, h, t$ using the union bound.) Note that expanding out the definition of $\hat{P}_\hi^t$ gives

$$
\begin{aligned}
    \E_{s' \sim \hat{P}_\hi^t(\cdot \mid s, a)} \left[ V^\star_{h+1}(s') \right] & = \sum_{s' \in \mathcal{S}} \frac{N^t_\hi(s, a, s')}{N^t_\hi(s, a)} V^\star_{h+1}(s') \\
    & = \frac{1}{N^t_\hi(s, a)} \sum_{i=0}^{t-1} \sum_{s' \in \mathcal{S}} \ind{ (s_\hi^i, a_\hi^i, s_{h+1}^i) = (s, a, s') } V^\star_{h+1}(s') \\
    & = \frac{1}{N^t_\hi(s, a)} \sum_{i=0}^{t-1} \underbrace{\ind{ (s_\hi^i, a_\hi^i) = (s, a) } V^\star_{h+1}(s_{h+1}^i)}_{X^i}
\end{aligned}
$$
since the terms where $s' \neq s_{h+1}^i$ vanish.

Now, in order to apply Hoeffding's inequality, we would like to express the second term in {eq}`err` as a sum over $t$ random variables as well. We will do this by redundantly averaging over all desired trajectories (i.e. where we visit state $s$ and action $a$ at time $h$):

$$
\begin{aligned}
    \E_{s' \sim P^?_\hi(\cdot \mid s, a)} \left[ V^\star_{h+1}(s') \right]
    & = \sum_{s' \in \mathcal{S}} P^?_\hi(s' \mid s, a) V^\star_{h+1}(s') \\
    & = \sum_{s' \in \mathcal{S}} \frac{1}{N^t_\hi(s, a)} \sum_{i=0}^{t-1} \ind{ (s_\hi^i, a_\hi^i) = (s, a) } P^?_\hi(s' \mid s, a) V^\star_{h+1}(s') \\
    & = \frac{1}{N^t_\hi(s, a)} \sum_{i=0}^{t-1} \E_{s_{h+1}^i \sim P^?_{h}(\cdot \mid s_\hi^i, a_\hi^i)} X^i.
\end{aligned}
$$
Now we can apply Hoeffding's inequality to $X^i - \E_{s_{h+1}^i \sim P^?_{h}(\cdot \mid s_\hi^i, a_\hi^i)} X^i$, which is bounded by $\hor$, to obtain that, with probability at least $1-\delta$,

$$
\text{error} = \left| \frac{1}{N^t_\hi(s, a)} \sum_{i=0}^{t-1} \left(X^i - \E_{s_{h+1}^i \sim P^?_{h}(\cdot \mid s_\hi^i, a_\hi^i)} X^i \right) \right| \le 2 H \sqrt{\frac{\ln(1/\delta)}{N_\hi^t(s, a)}}.
$$

Applying a union bound over all $s \in \mathcal{S}, a \in \mathcal{A}, t \in [T], h \in [H]$ gives the $b_\hi^t(s, a)$ term above.
:::

### Definition

Putting these parts together, we can define the algorithm as follows:

:::{prf:definition} UCB-VI
:label: ucb-vi-alg

For each iteration $t = 0, \dots, T-1$:

1. Compute the counts $N_\hi(s, a, s') \gets \sum_{i=0}^{t-1} \ind{ (s_\hi^i, a_\hi^i, s_{h+1}^i) = (s, a, s') }$ and $N_\hi(s, a) \gets \sum_{i=0}^{t-1} \ind{ (s_\hi^i, a_\hi^i) = (s, a) }$.
2. Estimate the transitions $\hat P_\hi \gets \frac{N_\hi(s, a, s')}{N_\hi(s, a)}$.
3. Compute the reward bonus $b_\hi(s, a) \gets 2 H \sqrt{\frac{\log( |\mathcal{S}||\mathcal{A}|H T/\delta )}{N_\hi(s, a)}}$.
4. Construct the optimistic MDP $\tilde{\mathcal{M}} \gets (\mathcal{S}, \mathcal{A}, \{ \hat{P}_\hi \}_{h \in [H-1]}, \{ r_\hi + b_\hi \}_{h \in [H-1]}, H)$.
5. Plan in it with value iteration: $\hat \pi \gets \text{VI}(\tilde{\mathcal{M}})$.
6. Use $\hat \pi_h(s)$ to collect a new trajectory $(s^t_\hi, a^t_\hi, s^t_{\hi+1})_{\hi \in [\hor]}$.
:::
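Below is a compact numpy sketch of this loop, illustrative rather than a reference implementation: the environment interface (`env.reset()`, `env.step(a)` returning a next state and reward), the treatment of the reward function, and the clipping of values to $[0, H]$ are assumptions made for the sake of a runnable example.

```python
import numpy as np

def ucb_vi(env, num_states, num_actions, horizon, num_episodes, delta=0.01):
    """Sketch of UCB-VI for a finite-horizon tabular MDP."""
    S, A, H, T = num_states, num_actions, horizon, num_episodes
    counts = np.zeros((H, S, A, S))   # N_h(s, a, s')
    rewards = np.zeros((H, S, A))     # observed r_h(s, a); the notes treat r_h as known
    policies = []

    for t in range(T):
        visits = counts.sum(axis=-1)  # N_h(s, a)
        p_hat = counts / np.maximum(visits[..., None], 1)
        bonus = 2 * H * np.sqrt(np.log(S * A * H * T / delta) / np.maximum(visits, 1))

        # Optimistic value iteration (DP) in the modelled MDP \tilde M^t.
        V = np.zeros((H + 1, S))
        pi = np.zeros((H, S), dtype=int)
        for h in reversed(range(H)):
            Q = rewards[h] + bonus[h] + p_hat[h] @ V[h + 1]  # shape (S, A)
            V[h] = np.clip(Q.max(axis=1), 0.0, H)            # clipping is a practical choice
            pi[h] = Q.argmax(axis=1)
        policies.append(pi)

        # Roll out the greedy policy to collect a new trajectory.
        s = env.reset()
        for h in range(H):
            a = pi[h, s]
            s_next, r = env.step(a)
            counts[h, s, a, s_next] += 1
            rewards[h, s, a] = r
            s = s_next

    return policies
```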
### Performance of UCB-VI

How exactly does UCB-VI strike a good balance between exploration and exploitation? In UCB for MABs, the bonus term is simple to interpret: it encourages the learner to take actions with a large bonus, i.e. actions whose reward estimates are still uncertain. Here, the policy depends on the bonus term indirectly: the policy is obtained by planning in an MDP where the bonus term is added to the reward function. Note that the bonuses *propagate backwards* in DP, effectively enabling the learner to *plan to explore* unknown states. This effect merits some further interpretation.

Recall we constructed $b^t_\hi$ so that, with high probability, $V^\star_\hi(s) \le \hat{V}_\hi^t(s)$ and so

$$
V^\star_\hi(s) - V^{\pi^t}_\hi(s) \le \hat{V}_\hi^t(s) - V^{\pi^t}_\hi(s).
$$

That is, the l.h.s. measures how suboptimal policy $\pi^t$ is in the true environment, while the r.h.s. is the difference in the policy's value when acting in the modelled MDP $\tilde{\mathcal{M}}^t$ instead of the true one $\mathcal{M}^{?}$.

If the r.h.s. is *small*, this implies that the l.h.s. difference is also small, i.e. that $\pi^t$ is *exploiting* actions that are giving high reward.

If the r.h.s. is *large*, then we have overestimated the value: $\pi^t$, the optimal policy of $\tilde{\mathcal{M}}^t$, does not perform well in the true environment $\mathcal{M}^{?}$. This means that either one of the $b_\hi^t(s, a)$ terms is large, or some $\hat P^t_\hi(\cdot \mid s, a)$ is inaccurate; either way, there is a state-action pair with a low visit count $N^t_\hi(s, a)$ that the learner is thereby encouraged to explore.

It turns out that UCB-VI achieves a cumulative regret of

:::{prf:theorem} UCB-VI regret
:label: ucb_vi_regret

$$
\E \left[ \sum_{t=0}^{T-1} \left(V^\star_0(s_0) - V^{\pi^t}_0(s_0) \right) \right] = \tilde{O}(H^2 \sqrt{|\mathcal{S}| |\mathcal{A}| T})
$$
:::
\\left(V^\\star_0(s_0) - V^{\\pi^t}_0(s_0) \\right) \\right] = \\tilde{O}(H^2 \\sqrt{|\\mathcal{S}| |\\mathcal{A}| T})E[t=0T1(V0(s0)V0πt(s0))]=O~(H2S∣∣AT)","enumerator":"9.18","key":"cBbD4VW3mR"}],"enumerator":"9.2","html_id":"ucb-vi-regret","key":"mRIQj16Zcy"},{"type":"paragraph","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"children":[{"type":"text","value":"Comparing this to the UCB regret bound ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"f8ZJW9LK1L"},{"type":"inlineMath","value":"\\tilde{O}(\\sqrt{T K})","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"O~(TK)\\tilde{O}(\\sqrt{T K})O~(TK)","key":"xC8S76DYA3"},{"type":"text","value":", where ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"Tp2dcMFp7c"},{"type":"inlineMath","value":"K","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"KKK","key":"QDZaADxrGx"},{"type":"text","value":" is the number of arms of the MAB, we see that we’ve reduced the number of effective arms from ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"zkYZyX8W6C"},{"type":"inlineMath","value":"|\\mathcal{A}|^{|\\mathcal{S}|\\hor}","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"ASH|\\mathcal{A}|^{|\\mathcal{S}|\\hor}ASH","key":"UDnJyoIYDZ"},{"type":"text","value":" (in ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"QevkTISCnL"},{"type":"crossReference","kind":"equation","identifier":"mdp_as_mab","label":"mdp_as_mab","children":[{"type":"text","value":"(","key":"HDNveIzU1r"},{"type":"text","value":"9.4","key":"D0EeW0wOJC"},{"type":"text","value":")","key":"WeKPz4xlmk"}],"template":"(%s)","enumerator":"9.4","resolved":true,"html_id":"mdp-as-mab","key":"LA2NXXQTMn"},{"type":"text","value":") to ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"I9dIILvpqO"},{"type":"inlineMath","value":"H^4 |\\mathcal{S}||\\mathcal{A}|","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"H4SAH^4 |\\mathcal{S}||\\mathcal{A}|H4S∣∣A","key":"AL1dI0Ptz4"},{"type":"text","value":", which is indeed polynomial in ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"JLMnShRu6Y"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"S|\\mathcal{S}|S","key":"cfCbMstCqb"},{"type":"text","value":", ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"L8r52LzrHt"},{"type":"inlineMath","value":"|\\mathcal{A}|","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"A|\\mathcal{A}|A","key":"ZdH4DukLML"},{"type":"text","value":", and ","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"FOaO4rwKnn"},{"type":"inlineMath","value":"H","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"html":"HHH","key":"gHNGwpEnmu"},{"type":"text","value":", as desired. 
This is also roughly the number of episodes it takes to achieve constant-order average regret:","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"P3kvZXZU7A"}],"key":"hnLE2MDhAo"},{"type":"math","value":"\\frac{1}{T} \\E[\\text{Regret}_T] = \\tilde{O}\\left(\\sqrt{\\frac{H^4 |\\mathcal{S}||\\mathcal{A}|}{T}}\\right)","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"html":"1TE[RegretT]=O~(H4SAT)\\frac{1}{T} \\E[\\text{Regret}_T] = \\tilde{O}\\left(\\sqrt{\\frac{H^4 |\\mathcal{S}||\\mathcal{A}|}{T}}\\right)T1E[RegretT]=O~(TH4S∣∣A)","enumerator":"9.19","key":"PDYJx0TzJJ"},{"type":"paragraph","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"children":[{"type":"text","value":"Note that the time-dependent transition matrix has ","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"key":"YNGcOmiw0d"},{"type":"inlineMath","value":"H |\\mathcal{S}|^2 |\\mathcal{A}|","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"html":"HS2AH |\\mathcal{S}|^2 |\\mathcal{A}|HS2A","key":"EbsdI20C7b"},{"type":"text","value":" entries. Assuming ","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"key":"yyPjD1e9S1"},{"type":"inlineMath","value":"H \\ll |\\mathcal{S}|","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"html":"HSH \\ll |\\mathcal{S}|HS","key":"iiczkSflv7"},{"type":"text","value":", this shows that it’s possible to achieve low regret, and achieve a near-optimal policy, while only understanding a ","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"key":"HG9Ak4ucUx"},{"type":"inlineMath","value":"1/|\\mathcal{S}|","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"html":"1/S1/|\\mathcal{S}|1/∣S","key":"z8yaNErT9b"},{"type":"text","value":" fraction of the world’s dynamics.","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"key":"qtgYkJcOFa"}],"key":"ZJr9sq7UAu"},{"type":"heading","depth":2,"position":{"start":{"line":268,"column":1},"end":{"line":268,"column":1}},"children":[{"type":"text","value":"Linear MDPs","position":{"start":{"line":268,"column":1},"end":{"line":268,"column":1}},"key":"QIW2AuGcdh"}],"identifier":"linear-mdps","label":"Linear MDPs","html_id":"linear-mdps","implicit":true,"enumerator":"9.4","key":"K9odJu7Q1V"},{"type":"paragraph","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"children":[{"type":"text","value":"A polynomial dependency on ","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"IDId73gISX"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"html":"S|\\mathcal{S}|S","key":"z8FddIB1HI"},{"type":"text","value":" and ","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"VIszEfAqqe"},{"type":"inlineMath","value":"|\\mathcal{A}|","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"html":"A|\\mathcal{A}|A","key":"rpdHgzpsCG"},{"type":"text","value":" is manageable when the state and action spaces are small. But for large or continuous state and action spaces, even this polynomial factor will become intractable. 
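To see why, set the right-hand side above to a target average regret $\epsilon$ and solve for $T$, ignoring logarithmic factors:

$$
\sqrt{\frac{H^4 |\mathcal{S}||\mathcal{A}|}{T}} \le \epsilon
\quad\Longleftrightarrow\quad
T \ge \frac{H^4 |\mathcal{S}||\mathcal{A}|}{\epsilon^2},
$$

so driving the average regret down to a constant $\epsilon$ requires on the order of $H^4 |\mathcal{S}||\mathcal{A}|$ episodes, matching the claim.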
Note that the time-dependent transition matrix has $H |\mathcal{S}|^2 |\mathcal{A}|$ entries. Assuming $H \ll |\mathcal{S}|$, this shows that it's possible to achieve low regret, and achieve a near-optimal policy, while only understanding a $1/|\mathcal{S}|$ fraction of the world's dynamics.

## Linear MDPs

A polynomial dependency on $|\mathcal{S}|$ and $|\mathcal{A}|$ is manageable when the state and action spaces are small. But for large or continuous state and action spaces, even this polynomial factor will become intractable. Can we find algorithms that don't depend on $|\mathcal{S}|$ or $|\mathcal{A}|$ at all, effectively reducing the dimensionality of the MDP? In this section, we'll explore **linear MDPs**: an example of a *parameterized* MDP where the rewards and state transitions depend only on some parameter space of dimension $d$ that is independent of $|\mathcal{S}|$ or $|\mathcal{A}|$.
vector","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"hNS3fyTyVS"}],"key":"fS462Nzk4K"},{"type":"paragraph","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"children":[{"type":"inlineMath","value":"\\phi(s, a) \\in \\mathbb{R}^d","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"html":"ϕ(s,a)Rd\\phi(s, a) \\in \\mathbb{R}^dϕ(s,a)Rd","key":"Pgu8mYCRzx"},{"type":"text","value":":","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"key":"cCw2LDIRRF"}],"key":"LEWhGYb853"},{"type":"math","value":"\\begin{aligned}\n P_\\hi(s' \\mid s, a) & = \\phi(s, a)^\\top \\mu^\\star_\\hi(s') \\\\\n r_\\hi(s, a) & = \\phi(s, a)^\\top \\theta_\\hi^\\star\n\\end{aligned}","position":{"start":{"line":279,"column":1},"end":{"line":282,"column":1}},"html":"Ph(ss,a)=ϕ(s,a)μh(s)rh(s,a)=ϕ(s,a)θh\\begin{aligned}\n P_\\hi(s' \\mid s, a) & = \\phi(s, a)^\\top \\mu^\\star_\\hi(s') \\\\\n r_\\hi(s, a) & = \\phi(s, a)^\\top \\theta_\\hi^\\star\n\\end{aligned}Ph(ss,a)rh(s,a)=ϕ(s,a)μh(s)=ϕ(s,a)θh","enumerator":"9.20","key":"LQUybLOeQQ"},{"type":"paragraph","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"children":[{"type":"text","value":"Note that we can also think of ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"lxurMd3F43"},{"type":"inlineMath","value":"P_\\hi(\\cdot \\mid s, a) = \\mu_\\hi^\\star","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"Ph(s,a)=μhP_\\hi(\\cdot \\mid s, a) = \\mu_\\hi^\\starPh(s,a)=μh","key":"wsEimG6i9R"},{"type":"text","value":" as an ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"zR8EMmaRPe"},{"type":"inlineMath","value":"|\\mathcal{S}| \\times d","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"S×d|\\mathcal{S}| \\times dS×d","key":"ysT31bjiO3"},{"type":"text","value":" matrix, and think of ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"m9kcXYQKEb"},{"type":"inlineMath","value":"\\mu^\\star_\\hi(s')","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"μh(s)\\mu^\\star_\\hi(s')μh(s)","key":"LWZNBLFPnI"},{"type":"text","value":" as indexing into the ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"DgAsPTT52I"},{"type":"inlineMath","value":"s'","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"ss's","key":"VWfDoApcRa"},{"type":"text","value":"-th row of this matrix (treating it as a column vector). 
Thinking of ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"E3vp0TB745"},{"type":"inlineMath","value":"V^\\star_{\\hi+1}","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"Vh+1V^\\star_{\\hi+1}Vh+1","key":"HtAO9f1OXA"},{"type":"text","value":" as an ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"nxCZXLNOlo"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"S|\\mathcal{S}|S","key":"vXoan0aeVf"},{"type":"text","value":"-dimensional vector, this allows us to write","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"iJ4Ha0WUTV"}],"key":"kwmhDFhHEA"},{"type":"math","value":"\\E_{s' \\sim P_\\hi(\\cdot \\mid s, a)}[V^\\star_{\\hi+1}(s)] = (\\mu^\\star_\\hi \\phi(s, a))^\\top V^\\star_{\\hi+1}.","position":{"start":{"line":286,"column":1},"end":{"line":286,"column":1}},"html":"EsPh(s,a)[Vh+1(s)]=(μhϕ(s,a))Vh+1.\\E_{s' \\sim P_\\hi(\\cdot \\mid s, a)}[V^\\star_{\\hi+1}(s)] = (\\mu^\\star_\\hi \\phi(s, a))^\\top V^\\star_{\\hi+1}.EsPh(s,a)[Vh+1(s)]=(μhϕ(s,a))Vh+1.","enumerator":"9.21","key":"J63f7nmuFt"},{"type":"paragraph","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"jS6dTtLchn"},{"type":"text","value":"ϕ","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"XcGJxyFf6t"},{"type":"text","value":" feature mapping can be designed to capture interactions between the state ","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"rU4cG4QNkv"},{"type":"inlineMath","value":"s","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"html":"sss","key":"jysGdwAoFg"},{"type":"text","value":" and action ","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"WFWBVKxlqM"},{"type":"inlineMath","value":"a","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"html":"aaa","key":"oE78OhKFkB"},{"type":"text","value":". 
In this book, we’ll assume that the feature map ","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"GZHfM0ueco"},{"type":"inlineMath","value":"\\phi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}^d","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"html":"ϕ:S×ARd\\phi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}^dϕ:S×ARd","key":"U218Bro2jc"},{"type":"text","value":" and the reward function (described by ","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"I1ljpxXq5O"},{"type":"inlineMath","value":"\\theta_\\hi^\\star","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"html":"θh\\theta_\\hi^\\starθh","key":"hYMbbdeLDm"},{"type":"text","value":") are known to the learner.","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"key":"aZJSr033uS"}],"key":"upbV6prwKs"}],"enumerator":"9.3","html_id":"linear-mdp","key":"eB8sjycLnS"},{"type":"heading","depth":3,"position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"children":[{"type":"text","value":"Planning in a linear MDP","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"eFEPwf7zEn"}],"identifier":"planning-in-a-linear-mdp","label":"Planning in a linear MDP","html_id":"planning-in-a-linear-mdp","implicit":true,"enumerator":"9.4.1","key":"ZI4ZN2Sy74"},{"type":"paragraph","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"children":[{"type":"text","value":"It turns out that ","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"dgofLSt9uP"},{"type":"inlineMath","value":"Q^\\star_\\hi","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"html":"QhQ^\\star_\\hiQh","key":"j0PazBYcoI"},{"type":"text","value":" is also linear with respect to this feature mapping. We can prove this by simply computing it using DP. We initialize ","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"ejZtbNCXz4"},{"type":"inlineMath","value":"V_{H}^\\star(s) = 0 \\forall s","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"html":"VH(s)=0sV_{H}^\\star(s) = 0 \\forall sVH(s)=0∀s","key":"Ypjsj1HZap"},{"type":"text","value":". 
### Planning in a linear MDP

It turns out that $Q^\star_\hi$ is also linear with respect to this feature mapping. We can prove this by simply computing it using DP. We initialize $V_{H}^\star(s) = 0$ for all $s$. Then we iterate:

$$
\begin{aligned}
    Q^\star_\hi(s, a) & = r_\hi(s, a) + \E_{s' \sim P_\hi(\cdot \mid s, a)} [V^\star_{h+1}(s')] \\
    & = \phi(s, a)^\top \theta_\hi^\star + (\mu_\hi^\star \phi(s, a))^\top V^\star_{h+1} \\
    & = \phi(s, a)^\top \underbrace{( \theta_\hi^\star + (\mu_\hi^\star)^\top V^\star_{h+1})}_{w_\hi} \\
    V^\star_\hi(s) & = \max_a Q^\star_\hi(s, a) \\
    \pi^\star_\hi(s) & = \arg\max_a Q^\star_\hi(s, a)
\end{aligned}
$$

:::{attention}
Show that $Q^\pi_\hi$ is also linear with respect to $\phi(s, a)$ for any policy $\pi$.
:::
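As a sanity check on this recursion, here is a short numpy sketch that runs the backward pass on a randomly generated linear MDP; the particular construction of `phi`, `mu`, and `theta` below is made up purely for illustration.

```python
import numpy as np

rng = np.random.default_rng(0)
S, A, d, H = 6, 3, 4, 5

# A made-up linear MDP: phi(s, a) lies in the probability simplex and each
# column of mu[h] is a distribution over S, so P_h(. | s, a) = mu[h] @ phi[s, a]
# is a valid next-state distribution.
phi = rng.dirichlet(np.ones(d), size=(S, A))                     # (S, A, d)
mu = rng.dirichlet(np.ones(S), size=(H, d)).transpose(0, 2, 1)   # (H, S, d)
theta = rng.uniform(0, 1 / H, size=(H, d))                       # keeps rewards in [0, 1/H]

# Backward DP: Q_h(s, a) = phi(s, a)^T w_h with w_h = theta_h + mu_h^T V_{h+1}.
V = np.zeros(S)
greedy = []
for h in reversed(range(H)):
    w = theta[h] + mu[h].T @ V      # (d,)
    Q = phi @ w                     # (S, A)
    V = Q.max(axis=1)
    greedy.append(Q.argmax(axis=1)) # greedy action per state at step h
```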
transitions","html_id":"modelling-the-transitions-1","implicit":true,"enumerator":"9.4.2.1","key":"Ii3cxBgbJo"},{"type":"paragraph","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"children":[{"type":"text","value":"This linear assumption on the MDP will also allow us to model the unknown dynamics ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"rBt14vDqPz"},{"type":"inlineMath","value":"P^?_\\hi(s' \\mid s, a)","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"html":"Ph?(ss,a)P^?_\\hi(s' \\mid s, a)Ph?(ss,a)","key":"G3XOOmsCoA"},{"type":"text","value":" with techniques from ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"uyyZ2YPjpo"},{"type":"strong","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"HFAaM6Rsy8"}],"key":"lg8aUNdwRU"},{"type":"text","value":" (SL). Recall that SL is useful for estimating conditional expectations by minimizing mean squared error. We can rephrase the estimation of ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"bWMPzRypf5"},{"type":"inlineMath","value":"P^?_\\hi(s' \\mid s, a)","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"html":"Ph?(ss,a)P^?_\\hi(s' \\mid s, a)Ph?(ss,a)","key":"RU7S6gGMtu"},{"type":"text","value":" as a least-squares problem as follows: Write ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"A8f7NjnAPo"},{"type":"inlineMath","value":"\\delta_s","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"html":"δs\\delta_sδs","key":"HSsgSgdpk6"},{"type":"text","value":" to denote a one-hot vector in ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"ezHkvmMg9H"},{"type":"inlineMath","value":"\\mathbb{R}^{|\\mathcal{S}|}","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"html":"RS\\mathbb{R}^{|\\mathcal{S}|}RS","key":"dmFK0WKkSK"},{"type":"text","value":", with a ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"KbGOwtMdYM"},{"type":"text","value":"1","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"bu3aeslMPa"},{"type":"text","value":" in the ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"ct8jwnCnn6"},{"type":"inlineMath","value":"s","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"html":"sss","key":"u8QhzlZcn8"},{"type":"text","value":"-th entry and ","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"SPji5f6h5E"},{"type":"text","value":"0","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"hriU0vRHUD"},{"type":"text","value":" everywhere else. 
Note that","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"key":"Px6AwqCSCk"}],"key":"tBN4qFCnHF"},{"type":"math","value":"\\E_{s' \\sim P_h(\\cdot \\mid s, a)} [\\delta_{s'}] = P_h(\\cdot \\mid s, a) = \\mu_h^\\star \\phi(s, a).","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"html":"EsPh(s,a)[δs]=Ph(s,a)=μhϕ(s,a).\\E_{s' \\sim P_h(\\cdot \\mid s, a)} [\\delta_{s'}] = P_h(\\cdot \\mid s, a) = \\mu_h^\\star \\phi(s, a).EsPh(s,a)[δs]=Ph(s,a)=μhϕ(s,a).","enumerator":"9.23","key":"gwNNZl2St6"},{"type":"paragraph","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"children":[{"type":"text","value":"Furthermore, since the expectation here is linear with respect to ","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"e5kLuE7sGG"},{"type":"inlineMath","value":"\\phi(s, a)","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"html":"ϕ(s,a)\\phi(s, a)ϕ(s,a)","key":"NF39L4J2wJ"},{"type":"text","value":", we can directly apply least-squares multi-target linear regression to construct the estimate","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"XYoe3hGLpm"}],"key":"mhQlQzKCcI"},{"type":"math","value":"\\hat \\mu = \\arg\\min_{\\mu \\in \\mathbb{R}^{|\\mathcal{S}| \\times d}} \\sum_{t=0}^{T-1} \\|\\mu \\phi(s_h^i, a_h^i) - \\delta_{s_{h+1}^i} \\|_2^2.","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"html":"μ^=argminμRS×dt=0T1μϕ(shi,ahi)δsh+1i22.\\hat \\mu = \\arg\\min_{\\mu \\in \\mathbb{R}^{|\\mathcal{S}| \\times d}} \\sum_{t=0}^{T-1} \\|\\mu \\phi(s_h^i, a_h^i) - \\delta_{s_{h+1}^i} \\|_2^2.μ^=argμRS×dmint=0T1μϕ(shi,ahi)δsh+1i22.","enumerator":"9.24","key":"yerGDp8fDp"},{"type":"paragraph","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"children":[{"type":"text","value":"This has a well-known closed-form solution:","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"tj7cJ880py"}],"key":"K93pLcLFuh"},{"type":"math","value":"\\begin{aligned}\n \\hat \\mu^\\top & = (A_h^t)^{-1} \\sum_{i=0}^{t-1} \\phi(s_h^i, a_h^i) \\delta_{s_{h+1}^i}^\\top \\\\\n \\text{where} \\quad A_h^t & = \\sum_{i=0}^{t-1} \\phi(s_h^i, a_h^i) \\phi(s_h^i, a_h^i)^\\top + \\lambda I\n\\end{aligned}","position":{"start":{"line":322,"column":1},"end":{"line":325,"column":1}},"html":"μ^=(Aht)1i=0t1ϕ(shi,ahi)δsh+1iwhereAht=i=0t1ϕ(shi,ahi)ϕ(shi,ahi)+λI\\begin{aligned}\n \\hat \\mu^\\top & = (A_h^t)^{-1} \\sum_{i=0}^{t-1} \\phi(s_h^i, a_h^i) \\delta_{s_{h+1}^i}^\\top \\\\\n \\text{where} \\quad A_h^t & = \\sum_{i=0}^{t-1} \\phi(s_h^i, a_h^i) \\phi(s_h^i, a_h^i)^\\top + \\lambda I\n\\end{aligned}μ^whereAht=(Aht)1i=0t1ϕ(shi,ahi)δsh+1i=i=0t1ϕ(shi,ahi)ϕ(shi,ahi)+λI","enumerator":"9.25","key":"eV5jzhOtpr"},{"type":"paragraph","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"children":[{"type":"text","value":"where we include a ","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"gmbHKjtprT"},{"type":"inlineMath","value":"\\lambda I","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"html":"λI\\lambda IλI","key":"ZPxz4AdHAU"},{"type":"text","value":" term to ensure that the matrix 
","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"wX9aC4uZbE"},{"type":"inlineMath","value":"A^t_h","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"html":"AhtA^t_hAht","key":"w4S4QsQyHW"},{"type":"text","value":" is invertible. (This can also be derived by adding a ","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"G4eA1zJ17c"},{"type":"inlineMath","value":"\\lambda \\|\\mu\\|_{\\text{F}}^2","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"html":"λμF2\\lambda \\|\\mu\\|_{\\text{F}}^2λμF2","key":"W1lknxwZPi"},{"type":"text","value":" regularization term to the objective.) We can directly plug in this estimate into ","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"kBIW1eUcn5"},{"type":"inlineMath","value":"\\hat{P}^t_h(\\cdot \\mid s, a) = \\hat \\mu^t_h \\phi(s, a)","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"html":"P^ht(s,a)=μ^htϕ(s,a)\\hat{P}^t_h(\\cdot \\mid s, a) = \\hat \\mu^t_h \\phi(s, a)P^ht(s,a)=μ^htϕ(s,a)","key":"NkSOcnbULC"},{"type":"text","value":".","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"gkQrWNL3hS"}],"key":"YGQWfoBnjI"},{"type":"heading","depth":4,"position":{"start":{"line":329,"column":1},"end":{"line":329,"column":1}},"children":[{"type":"text","value":"Reward bonus","position":{"start":{"line":329,"column":1},"end":{"line":329,"column":1}},"key":"rqfHj1ax4L"}],"identifier":"reward-bonus","label":"Reward bonus","html_id":"reward-bonus-1","implicit":true,"enumerator":"9.4.2.2","key":"jfnsPvUUEv"},{"type":"paragraph","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"Now, to design the reward bonus, we can’t apply Hoeffding anymore, since the terms no longer involve sample means of bounded random variables; Instead, we’re incorporating information across different states and actions. 
Rather, we can construct an upper bound using ","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"FILfAtEUpm"},{"type":"emphasis","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"Chebyshev’s inequality","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"ha7JwJHy3x"}],"key":"ihBab59wzY"},{"type":"text","value":" in the same way we did for the LinUCB algorithm in the MAB setting ","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"Sg5b7hqnVl"},{"type":"crossReference","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"children":[{"type":"text","value":"Section ","key":"dYyZUwP1vY"},{"type":"text","value":"3.8.1","key":"CwshBojFeh"}],"identifier":"lin_ucb","label":"lin_ucb","kind":"heading","template":"Section %s","enumerator":"3.8.1","resolved":true,"html_id":"lin-ucb","remote":true,"url":"/bandits","dataUrl":"/bandits.json","key":"NZvxw4ZMwN"},{"type":"text","value":":","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"gJOwMVdjjz"}],"key":"Ty5SV561tt"},{"type":"math","value":"b^t_\\hi(s, a) = \\beta \\sqrt{\\phi(s, a)^\\top (A^t_h)^{-1} \\phi(s, a)}, \\quad \\beta = \\tilde O(d \\hor).","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"bht(s,a)=βϕ(s,a)(Aht)1ϕ(s,a),β=O~(dH).b^t_\\hi(s, a) = \\beta \\sqrt{\\phi(s, a)^\\top (A^t_h)^{-1} \\phi(s, a)}, \\quad \\beta = \\tilde O(d \\hor).bht(s,a)=βϕ(s,a)(Aht)1ϕ(s,a),β=O~(dH).","enumerator":"9.26","key":"MGiu7AmvhL"},{"type":"paragraph","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"Note that this isn’t explicitly inversely proportional to ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"WdckdQ7PME"},{"type":"inlineMath","value":"N_h^t(s, a)","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"Nht(s,a)N_h^t(s, a)Nht(s,a)","key":"K92Ss7Jgya"},{"type":"text","value":" as in the original UCB-VI bonus term ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"bjFnoYPdWS"},{"type":"crossReference","kind":"equation","identifier":"eq:ucb_vi_bonus","label":"eq:ucb_vi_bonus","children":[{"type":"text","value":"(","key":"YQiMWcPqZd"},{"type":"text","value":"9.8","key":"eIOXM0ZPMx"},{"type":"text","value":")","key":"Xc0wtScT6A"}],"template":"(%s)","enumerator":"9.8","resolved":true,"html_id":"eq-ucb-vi-bonus","key":"AC6OQC1Tz7"},{"type":"text","value":". Rather, it is inversely proportional to the amount that the direction ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"txb5VVUT9D"},{"type":"inlineMath","value":"\\phi(s, a)","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"ϕ(s,a)\\phi(s, a)ϕ(s,a)","key":"EfzBFOwxWr"},{"type":"text","value":" has been explored in the history. 
That is, if ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"S69EhWAiaJ"},{"type":"inlineMath","value":"A_h^t","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"AhtA_h^tAht","key":"aegre9dwCc"},{"type":"text","value":" has a large component in the direction ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"qy925GIcIz"},{"type":"inlineMath","value":"\\phi(s, a)","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"ϕ(s,a)\\phi(s, a)ϕ(s,a)","key":"lKezPaAzd1"},{"type":"text","value":", implying that this direction is well explored, then the bonus term will be small, and vice versa.","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"v1SHu23r1O"}],"key":"fDwvLXDNtc"},{"type":"paragraph","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"We can now plug in these transition estimates and reward bonuses into the UCB-VI algorithm ","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"UTsQGBahM8"},{"type":"crossReference","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"(","key":"N0hoWFjPBE"},{"type":"text","value":"9.16","key":"ZI3JgA0sZk"},{"type":"text","value":")","key":"JokZIdkJOT"}],"identifier":"ucb-vi-alg","label":"ucb-vi-alg","kind":"equation","template":"(%s)","enumerator":"9.16","resolved":true,"html_id":"ucb-vi-alg","key":"CzGZLevTNL"},{"type":"text","value":".","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"tiZcBTrjej"}],"key":"B1N26ike79"},{"type":"heading","depth":4,"position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"children":[{"type":"text","value":"Performance","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"key":"VXQK4CiWAP"}],"identifier":"performance","label":"Performance","html_id":"performance","implicit":true,"enumerator":"9.4.2.3","key":"nzaP0pcofd"},{"type":"proof","kind":"theorem","label":"lin_ucb_vi_regret","identifier":"lin_ucb_vi_regret","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"LinUCB-VI regret","position":{"start":{"line":341,"column":1},"end":{"line":341,"column":1}},"key":"lGgUwQoFis"}],"key":"vJSDdFEQ52"},{"type":"paragraph","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"The LinUCB-VI algorithm achieves expected regret","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"kUt3Nbjxza"}],"key":"RSjPkxvvC2"},{"type":"math","value":"\\E[\\text{Regret}_T] = \\E\\left[\\sum_{t=0}^{T-1} V^\\star_0(s_0) - V^{\\pi^t}_0(s_0) \\right] \\le \\tilde O(H^2 d^{1.5} \\sqrt{T})","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"html":"E[RegretT]=E[t=0T1V0(s0)V0πt(s0)]O~(H2d1.5T)\\E[\\text{Regret}_T] = \\E\\left[\\sum_{t=0}^{T-1} V^\\star_0(s_0) - V^{\\pi^t}_0(s_0) \\right] \\le \\tilde O(H^2 d^{1.5} \\sqrt{T})E[RegretT]=E[t=0T1V0(s0)V0πt(s0)]O~(H2d1.5T)","enumerator":"9.27","key":"HZ1olYPJDf"}],"enumerator":"9.3","html_id":"lin-ucb-vi-regret","key":"KlsgyJZxiu"},{"type":"paragraph","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"children":[{"type":"text","value":"Comparing this to our bound for UCB-VI in an environment without this linear assumption, we see that we go from a sample complexity 
of ","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"Pgosd1NbsU"},{"type":"inlineMath","value":"\\tilde \\Omega(H^4 |\\mathcal{S}||\\mathcal{A}|)","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"html":"Ω~(H4SA)\\tilde \\Omega(H^4 |\\mathcal{S}||\\mathcal{A}|)Ω~(H4S∣∣A)","key":"sg1TaDJ8kW"},{"type":"text","value":" to ","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"Fr1SRei4ao"},{"type":"inlineMath","value":"\\tilde \\Omega(H^4 d^{3})","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"html":"Ω~(H4d3)\\tilde \\Omega(H^4 d^{3})Ω~(H4d3)","key":"KDihTH2gVe"},{"type":"text","value":". This new sample complexity only depends on the feature dimension and not on the state or action space of the MDP!","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"OYxfuIYuZU"}],"key":"Bf9rBxCYlG"},{"type":"heading","depth":2,"position":{"start":{"line":351,"column":1},"end":{"line":351,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":351,"column":1},"end":{"line":351,"column":1}},"key":"nEWPVXSMOQ"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"9.5","key":"iMugjqFcrt"},{"type":"paragraph","position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"children":[{"type":"text","value":"In this chapter, we’ve explored how to explore in an unknown MDP.","position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"key":"i0vstyPttT"}],"key":"aA9JqijqyW"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":355,"column":1},"end":{"line":361,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":355,"column":1},"end":{"line":356,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"children":[{"type":"text","value":"We first discussed the explore-then-exploit algorithm ","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"IpRBEEHXAw"},{"type":"crossReference","kind":"proof:definition","identifier":"explore_then_exploit","label":"explore_then_exploit","children":[{"type":"text","value":"Definition ","key":"JVOUJYqmB9"},{"type":"text","value":"9.2","key":"ttwzdq4c7P"}],"template":"Definition %s","enumerator":"9.2","resolved":true,"html_id":"explore-then-exploit","key":"NbgO6vzA2E"},{"type":"text","value":", a simple way to explore a deterministic MDP by visiting all state-action pairs.","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"pQ6YYNC9hQ"}],"key":"jZyGS35id7"}],"key":"Ap0haIiENK"},{"type":"listItem","spread":true,"position":{"start":{"line":357,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"children":[{"type":"text","value":"We then discussed how to treat an unknown MDP as a MAB ","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"NxJE6sRw5K"},{"type":"crossReference","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"children":[{"type":"text","value":"Section ","key":"XGlJQDfZMP"},{"type":"text","value":"9.2","key":"QRJ1VaHxrt"}],"identifier":"mdp_mab","label":"mdp_mab","kind":"heading","template":"Section %s","enumerator":"9.2","resolved":true,"html_id":"mdp-mab","key":"Bx1weIedC4"},{"type":"text","value":", and how 
this approach is inefficient since it doesn’t make use of relationships between policies.","position":{"start":{"line":357,"column":1},"end":{"line":357,"column":1}},"key":"IXPaxA9h0O"}],"key":"J3q3igk60w"}],"key":"kn271XAzIo"},{"type":"listItem","spread":true,"position":{"start":{"line":359,"column":1},"end":{"line":360,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"children":[{"type":"text","value":"We then introduced the UCB-VI algorithm ","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"key":"ESbpwJp4Qu"},{"type":"crossReference","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"children":[{"type":"text","value":"(","key":"plbZRHs5WF"},{"type":"text","value":"9.16","key":"UIMwRoc0qy"},{"type":"text","value":")","key":"MfD9c1Yt3f"}],"identifier":"ucb-vi-alg","label":"ucb-vi-alg","kind":"equation","template":"(%s)","enumerator":"9.16","resolved":true,"html_id":"ucb-vi-alg","key":"PBm3Uxf00Y"},{"type":"text","value":", which models the unknown MDP by a proxy MDP with a reward bonus term that encourages exploration.","position":{"start":{"line":359,"column":1},"end":{"line":359,"column":1}},"key":"nrx8LnWxo1"}],"key":"BDpTwHSfqL"}],"key":"UgnHjmDd5P"},{"type":"listItem","spread":true,"position":{"start":{"line":361,"column":1},"end":{"line":361,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":361,"column":1},"end":{"line":361,"column":1}},"children":[{"type":"text","value":"Finally, assuming that the transitions and rewards are linear with respect to a feature transformation of the state and action, we introduced the LinUCB-VI algorithm ","position":{"start":{"line":361,"column":1},"end":{"line":361,"column":1}},"key":"goUX3wKbRR"},{"type":"crossReference","position":{"start":{"line":361,"column":1},"end":{"line":361,"column":1}},"children":[{"type":"text","value":"Section ","key":"FUuHebGzYy"},{"type":"text","value":"9.4.2","key":"zrlTtnM5WZ"}],"identifier":"lin_ucb_vi","label":"lin_ucb_vi","kind":"heading","template":"Section %s","enumerator":"9.4.2","resolved":true,"html_id":"lin-ucb-vi","key":"qfUU3cU1Yx"},{"type":"text","value":", which has a sample complexity independent of the size of the state and action spaces.","position":{"start":{"line":361,"column":1},"end":{"line":361,"column":1}},"key":"cnsBLFVk5X"}],"key":"SuzyZo0It8"}],"key":"pDwN6wYGXF"}],"key":"R1rpa8re69"}],"key":"TNoj6QqrEm"}],"key":"uVcHlfUKEy"},"references":{"cite":{"order":["agarwal_reinforcement_2022"],"data":{"agarwal_reinforcement_2022":{"label":"agarwal_reinforcement_2022","enumerator":"1","html":"Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). Reinforcement Learning: Theory and Algorithms."}}}},"footer":{"navigation":{"prev":{"title":"8 Tree Search Methods","url":"/planning","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"Appendix: Background","url":"/background","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/fitted-dp.html b/fitted-dp.html index d899145..365b997 100644 --- a/fitted-dp.html +++ b/fitted-dp.html @@ -1,4 +1,4 @@ -5 Fitted Dynamic Programming Algorithms - CS/STAT 184: Introduction to Reinforcement Learning


5 Fitted Dynamic Programming Algorithms

5.1 Introduction

We borrow these definitions from the 1 Markov Decision Processes chapter:

from typing import NamedTuple, Callable, Optional
 from jaxtyping import Float, Array
 import jax.numpy as np
 from jax import grad, vmap
@@ -60,7 +60,7 @@
 
 def q_to_greedy(Q: QFunction) -> Policy:
     """Get the greedy policy for the given state-action value function."""
-    return lambda s, h: np.argmax(Q(s, h))

+    return lambda s, h: np.argmax(Q(s, h))

The 1 Markov Decision Processes chapter discussed the case of finite MDPs, where the state and action spaces $\mathcal{S}$ and $\mathcal{A}$ were finite. This gave us a closed-form expression for computing the r.h.s. of the Bellman one-step consistency equation. In this chapter, we consider the case of large or continuous state spaces, where the state space is too large to be enumerated. In this case, we need to approximate the value function and Q-function using methods from supervised learning.

We will first take a quick detour to introduce the empirical risk minimization framework for function approximation. @@ -69,22 +69,22 @@ We seek to learn the relationship between some input variables $x$ and some output variable $y$ (drawn from their joint distribution). Precisely, we want to find a function $\hat f : x \mapsto y$ that minimizes the squared error of the prediction:

$$\hat f = \arg\min_{f} \E[(y - f(x))^2]$$

An equivalent framing is that we seek to approximate the conditional expectation of $y$ given $x$:

In most applications, the joint distribution of $x, y$ is unknown or extremely complex, and so we can’t analytically evaluate $\E [y \mid x]$. Instead, our strategy is to draw $N$ samples $(x_i, y_i)$ from the joint distribution of $x$ and $y$, and then use the sample average $\sum_{i=1}^N (y_i - f(x_i))^2 / N$ to approximate the mean squared error. Then we use a fitting method to find a function $\hat f$ that minimizes this objective and thus approximates the conditional expectation. This approach is called empirical risk minimization.
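
As a concrete sketch (purely illustrative: the function and variable names below are made up for this example, and we restrict the search to linear functions for simplicity), empirical risk minimization with the squared error becomes an ordinary least-squares problem:

import jax.numpy as np
import jax.random as rand

def erm_linear(X, y):
    """Empirical risk minimization over the class of linear functions:
    choose w to minimize (1/N) * sum_i (y_i - X[i] @ w)^2 (least squares)."""
    w, *_ = np.linalg.lstsq(X, y, rcond=None)
    return lambda x: x @ w

# Illustrative usage on synthetic data where the true relationship is y = 2x + noise,
# so the fitted function should approximate E[y | x] = 2x.
key_x, key_noise = rand.split(rand.PRNGKey(0))
X = rand.normal(key_x, (100, 1))
y = 2.0 * X[:, 0] + 0.1 * rand.normal(key_noise, (100,))
f_hat = erm_linear(X, y)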


5.3 Fitted value iteration

Let us apply ERM to the RL problem of computing the optimal policy / value function.

How did we compute the optimal value function in MDPs with finite state and action spaces?

  • In a finite-horizon MDP, we can use dynamic programming, working backwards from the end of the time horizon, to compute the optimal value function exactly.

  • In an infinite-horizon MDP, we can use value iteration, which iterates the Bellman optimality operator (1.54) to approximately compute the optimal value function.

Our existing approaches represent the value function, and the MDP itself, in matrix notation. But what happens if the state space is extremely large, or even infinite (e.g. real-valued)? Then computing a weighted sum over all possible next states, which is required to compute the Bellman operator, becomes intractable.
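
To make the difficulty concrete, here is the backup that must be evaluated at every state and timestep (written out in the finite-horizon notation used in this chapter; the operator symbol and exact form in the referenced equation may differ slightly):

$$V_\hi^\star(s) = \max_{a \in \mathcal{A}} \left[ r(s, a) + \sum_{s' \in \mathcal{S}} P(s' \mid s, a) \, V_{\hi+1}^\star(s') \right]$$

When $\mathcal{S}$ is extremely large or continuous, the sum over $s'$ (an expectation over the next state) cannot be computed exactly, and $V^\star$ can no longer be stored as a table with one entry per state.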

Instead, we will need to use function approximation methods from supervised learning to solve for the value function in an alternative way.

In particular, suppose we have a dataset of $N$ trajectories $\tau_1, \dots, \tau_N \sim \rho_{\pi}$ from some policy $\pi$ (called the data collection policy) acting in the MDP of interest.
Let us indicate the trajectory index in the superscript, so that

$$\tau_i = \{ s_0^i, a_0^i, r_0^i, s_1^i, a_1^i, r_1^i, \dots, s_{\hor-1}^i, a_{\hor-1}^i, r_{\hor-1}^i \}.$$
def collect_data(
     env: gym.Env, N: int, H: int, key: rand.PRNGKey, π: Optional[Policy] = None
 ) -> list[Trajectory]:
     """Collect a dataset of trajectories from the given policy (or a random one)."""
@@ -102,22 +102,48 @@
                 break
             s = s_next
         trajectories.append(τ)
-    return trajectories
env = gym.make("LunarLander-v2")
+    return trajectories
env = gym.make("LunarLander-v2")
 trajectories = collect_data(env, 100, 300, key)
-trajectories[0][:5]  # show first five transitions from first trajectory
100%|██████████| 100/100 [00:01<00:00, 88.19it/s]

-
[Transition(s=array([-0.00767412, 1.4020356 , -0.77731264, -0.39489663, 0.00889908, - 0.17607279, 0. , 0. ], dtype=float32), a=np.int64(3), r=np.float64(0.01510799459859527)), - Transition(s=array([-0.01526899, 1.392572 , -0.766254 , -0.42065707, 0.01559265, - 0.13388489, 0. , 0. ], dtype=float32), a=np.int64(0), r=np.float64(-0.9906126974697145)), - Transition(s=array([-0.02286405, 1.3825084 , -0.7662748 , -0.44735536, 0.02228237, - 0.13380653, 0. , 0. ], dtype=float32), a=np.int64(0), r=np.float64(-0.9934895324159925)), - Transition(s=array([-0.0304594 , 1.3718452 , -0.7662946 , -0.4740309 , 0.02897082, - 0.13378178, 0. , 0. ], dtype=float32), a=np.int64(2), r=np.float64(1.4450091994476508)), - Transition(s=array([-0.03802614, 1.361714 , -0.7636849 , -0.45042533, 0.03589968, - 0.1385901 , 0. , 0. ], dtype=float32), a=np.int64(2), r=np.float64(0.43907361933223116))]

+trajectories[0][:5]  # show first five transitions from first trajectory

/Users/adzcai/micromamba/envs/rlbook/lib/python3.11/site-packages/gymnasium/envs/registration.py:517: DeprecationWarning: WARN: The environment LunarLander-v2 is out of date. You should consider upgrading to version `v3`.
+  logger.deprecation(
+
---------------------------------------------------------------------------
+DeprecatedEnv                             Traceback (most recent call last)
+Cell In[3], line 1
+----> 1 env = gym.make("LunarLander-v2")
+      2 trajectories = collect_data(env, 100, 300, key)
+      3 trajectories[0][:5]  # show first five transitions from first trajectory
+
+File ~/micromamba/envs/rlbook/lib/python3.11/site-packages/gymnasium/envs/registration.py:687, in make(id, max_episode_steps, disable_env_checker, **kwargs)
+    684     assert isinstance(id, str)
+    686     # The environment name can include an unloaded module in "module:env_name" style
+--> 687     env_spec = _find_spec(id)
+    689 assert isinstance(env_spec, EnvSpec)
+    691 # Update the env spec kwargs with the `make` kwargs
+
+File ~/micromamba/envs/rlbook/lib/python3.11/site-packages/gymnasium/envs/registration.py:531, in _find_spec(env_id)
+    525     logger.warn(
+    526         f"Using the latest versioned environment `{new_env_id}` "
+    527         f"instead of the unversioned environment `{env_name}`."
+    528     )
+    530 if env_spec is None:
+--> 531     _check_version_exists(ns, name, version)
+    532     raise error.Error(
+    533         f"No registered env with id: {env_name}. Did you register it, or import the package that registers it? Use `gymnasium.pprint_registry()` to see all of the registered environments."
+    534     )
+    536 return env_spec
+
+File ~/micromamba/envs/rlbook/lib/python3.11/site-packages/gymnasium/envs/registration.py:431, in _check_version_exists(ns, name, version)
+    428     raise error.VersionNotFound(message)
+    430 if latest_spec is not None and version < latest_spec.version:
+--> 431     raise error.DeprecatedEnv(
+    432         f"Environment version v{version} for `{get_env_id(ns, name, None)}` is deprecated. "
+    433         f"Please use `{latest_spec.id}` instead."
+    434     )
+
+DeprecatedEnv: Environment version v2 for `LunarLander` is deprecated. Please use `LunarLander-v3` instead.
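
A minimal fix, following the error message above (assuming the installed Gymnasium version registers LunarLander-v3), is to request the updated environment id and re-run the data collection:

env = gym.make("LunarLander-v3")
trajectories = collect_data(env, 100, 300, key)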

Can we view the dataset of trajectories as a “labelled dataset” in order to apply supervised learning to approximate the optimal Q-function? Yes! Recall that we can characterize the optimal Q-function using the Bellman optimality equations, which don’t depend on an actual policy:


$$Q_\hi^\star(s, a) = r(s, a) + \E_{s' \sim P(s, a)} [\max_{a'} Q_{\hi+1}^\star(s', a')]$$

We can think of the arguments to the Q-function -- i.e. the current state, action, and timestep $\hi$ -- as the inputs $x$, and the r.h.s. of the above equation as the label $f(x)$. Note that the r.h.s. can also be expressed as a conditional expectation:

$$f(x) = \E [y \mid x] \quad \text{where} \quad y = r(s_\hi, a_\hi) + \max_{a'} Q^\star_{\hi + 1}(s', a').$$

Approximating the conditional expectation is precisely the task that Section 5.2 is suited for!

Our dataset above would give us $N \cdot \hor$ samples:

$$x_{i \hi} = (s_\hi^i, a_\hi^i, \hi) \qquad y_{i \hi} = r(s_\hi^i, a_\hi^i) + \max_{a'} Q^\star_{\hi + 1}(s_{\hi + 1}^i, a')$$
def get_X(trajectories: list[Trajectory]):
     """
     We pass the state and timestep as input to the Q-function
     and return an array of Q-values.
@@ -144,27 +170,15 @@
             Q_values = f(s, h + 1)
             y.append(r + (Q_values[π(s, h + 1)] if π else Q_values.max()))
         y.append(τ[-1].r)
-    return np.array(y)
s, a, h = get_X(trajectories[:1])
+    return np.array(y)
s, a, h = get_X(trajectories[:1])
 print("states:", s[:5])
 print("actions:", a[:5])
-print("timesteps:", h[:5])
states: [[-0.00767412  1.4020356  -0.77731264 -0.39489663  0.00889908  0.17607279
-   0.          0.        ]
- [-0.01526899  1.392572   -0.766254   -0.42065707  0.01559265  0.13388489
-   0.          0.        ]
- [-0.02286405  1.3825084  -0.7662748  -0.44735536  0.02228237  0.13380653
-   0.          0.        ]
- [-0.0304594   1.3718452  -0.7662946  -0.4740309   0.02897082  0.13378178
-   0.          0.        ]
- [-0.03802614  1.361714   -0.7636849  -0.45042533  0.03589968  0.1385901
-   0.          0.        ]]
-actions: [3 0 0 2 2]
-timesteps: [0 1 2 3 4]
-
get_y(trajectories[:1])[:5]
Array([ 0.01510799, -0.9906127 , -0.9934895 , 1.4450092 , 0.43907362], dtype=float32)

+print("timesteps:", h[:5])

get_y(trajectories[:1])[:5]

Then we can use empirical risk minimization to find a function $\hat f$ that approximates the optimal Q-function.

# We will see some examples of fitting methods in the next section
+FittingMethod = Callable[[Float[Array, "N D"], Float[Array, " N"]], QFunction]
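
As a purely illustrative example of such a fitting method (this is not the chapter's implementation: the featurize helper, and the assumption that each row of X was produced by that same featurization, are inventions of this sketch), ordinary least squares can be wrapped to match the FittingMethod signature and return a QFunction:

def make_linear_fit(featurize, A: int):
    """Illustrative FittingMethod factory (reuses the chapter's `np` = jax.numpy).
    `featurize(s, a, h)` is a hypothetical helper mapping a state, action, and
    timestep to a feature vector; each row of X is assumed to be featurize(s, a, h)."""
    def fit(X, y):
        # least-squares weights for the linear model y ≈ X @ w
        w, *_ = np.linalg.lstsq(X, y, rcond=None)
        def Q(s, h):
            # Q-values for all A actions at state s and timestep h
            return np.array([featurize(s, a, h) @ w for a in range(A)])
        return Q
    return fit

Any regressor with the same (X, y) -> QFunction interface, such as a small neural network, could be swapped in the same way.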

But notice that the definition of $y_{i \hi}$ depends on the Q-function itself! How can we resolve this circular dependency? Recall that we faced the same issue when evaluating a policy in an infinite-horizon MDP. There, we iterated the policy’s Bellman operator (Definition 1.8), since we knew that the policy’s value function was a fixed point of it. We can apply the same strategy here, using the $\hat f$ from the previous iteration to compute the labels $y_{i \hi}$, and then using this new dataset to fit the next iterate.

def fitted_q_iteration(
     trajectories: list[Trajectory],
     fit: FittingMethod,
     epochs: int,
@@ -179,7 +193,7 @@
     for _ in range(epochs):
         y = get_y(trajectories, Q_hat)
         Q_hat = fit(X, y)
-    return Q_hat
+    return Q_hat

5.4 Fitted policy evaluation

We can also use this fixed-point iteration to evaluate a policy using the dataset (not necessarily the one used to generate the trajectories):

def fitted_evaluation(
     trajectories: list[Trajectory],
     fit: FittingMethod,
     π: Policy,
@@ -195,8 +209,8 @@
     for _ in tqdm(range(epochs)):
         y = get_y(trajectories, Q_hat, π)
         Q_hat = fit(X, y)
-    return Q_hat
+    return Q_hat

5.5 Fitted policy iteration

We can use this policy evaluation algorithm to adapt the policy iteration algorithm to this new setting. The algorithm remains exactly the same -- repeatedly make the policy greedy w.r.t. its own value function -- except now we must evaluate the policy (i.e. compute its value function) using the iterative fitted_evaluation algorithm.

def fitted_policy_iteration(
     trajectories: list[Trajectory],
     fit: FittingMethod,
     epochs: int,
@@ -208,9 +222,9 @@
     for _ in range(epochs):
         Q_hat = fitted_evaluation(trajectories, fit, π, evaluation_epochs)
         π = q_to_greedy(Q_hat)
-    return π

5.4 Summary

\ No newline at end of file diff --git a/fitted-dp.json b/fitted-dp.json index 5d1ec89..c549c3b 100644 --- a/fitted-dp.json +++ b/fitted-dp.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"383dbef4a54c4fa6d21d8262b47a43806b7de9e8cf0aded0d6e80d9e6efb981f","slug":"fitted-dp","location":"/fitted_dp.md","dependencies":[],"frontmatter":{"title":"5 Fitted Dynamic Programming Algorithms","numbering":{"all":{"enabled":true},"enumerator":{"template":"5.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"fitted_dp.md","url":"/build/fitted_dp-bbfcf7e66c9311fe5ec9f9beb0cc0cbc.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"zS6OQ5PWTo"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"5.1","key":"yxmUeqWUjf"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"We borrow these definitions from the ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"lhdaTGdEH8"},{"type":"link","url":"/mdps","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"1 Markov Decision Processes","key":"tQXIWj8p8Z"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"vSjvrOEwSP"},{"type":"text","value":" chapter:","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"Y48vE6AHAe"}],"key":"SshPXwPWv7"}],"key":"g3CNyQJdcn"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from typing import NamedTuple, Callable, Optional\nfrom jaxtyping import Float, Array\nimport jax.numpy as np\nfrom jax import grad, vmap\nimport jax.random as rand\nfrom tqdm import tqdm\nimport gymnasium as gym\n\nkey = rand.PRNGKey(184)\n\n\nclass Transition(NamedTuple):\n s: int\n a: int\n r: float\n\n\nTrajectory = list[Transition]\n\n\ndef get_num_actions(trajectories: list[Trajectory]) -> int:\n \"\"\"Get the number of actions in the dataset. Assumes actions range from 0 to A-1.\"\"\"\n return max(max(t.a for t in τ) for τ in trajectories) + 1\n\n\nState = Float[Array, \"...\"] # arbitrary shape\n\n# assume finite `A` actions and f outputs an array of Q-values\n# i.e. 
Q(s, a, h) is implemented as f(s, h)[a]\nQFunction = Callable[[State, int], Float[Array, \" A\"]]\n\n\ndef Q_zero(A: int) -> QFunction:\n \"\"\"A Q-function that always returns zero.\"\"\"\n return lambda s, a: np.zeros(A)\n\n\n# a deterministic time-dependent policy\nPolicy = Callable[[State, int], int]\n\n\ndef q_to_greedy(Q: QFunction) -> Policy:\n \"\"\"Get the greedy policy for the given state-action value function.\"\"\"\n return lambda s, h: np.argmax(Q(s, h))","visibility":"hide","key":"GjvOVpl8dg"},{"type":"output","id":"8pUJXzCUF9ZcKRj1XtFv2","data":[],"visibility":"show","key":"DjH4Uqm2bQ"}],"data":{"tags":[]},"visibility":"show","key":"pbbVyVj8xd"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":71,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"YNGL1fjB8t"},{"type":"link","url":"/mdps","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"1 Markov Decision Processes","key":"jQ1TSWaLci"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"uY3GKmznqW"},{"type":"text","value":" chapter discussed the case of ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"qY41gWrYyX"},{"type":"strong","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"finite","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"mwtCHZ3379"}],"key":"Rx8lbuzkP4"},{"type":"text","value":" MDPs, where the state and action spaces ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"SHh6RFQSYy"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"html":"S\\mathcal{S}S","key":"ixuDvTIN3d"},{"type":"text","value":" and ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"MmkBcQLnGP"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"html":"A\\mathcal{A}A","key":"GYfwjCsaml"},{"type":"text","value":" were finite.\nThis gave us a closed-form expression for computing the r.h.s. 
of ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"fYMTUwQd5z"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"the Bellman one-step consistency equation","key":"FUHolABSeT"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"P5965vlkx4"},{"type":"text","value":".\nIn this chapter, we consider the case of ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"KKe4ZguNcZ"},{"type":"strong","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"large","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"yYTUoesYKO"}],"key":"yvHDr9bH1w"},{"type":"text","value":" or ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"e0Tx1Dg3Me"},{"type":"strong","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"continuous","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"jzSQI2w0ww"}],"key":"T5ftabGRfr"},{"type":"text","value":" state spaces, where the state space is too large to be enumerated.\nIn this case, we need to ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"FMZOPeLOt1"},{"type":"emphasis","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"SuB4qUe653"}],"key":"yJqBJjfgOI"},{"type":"text","value":" the value function and Q-function using methods from ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"aHlp3XnJx8"},{"type":"strong","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"LdMIykysfy"}],"key":"ocTykntGMu"},{"type":"text","value":".","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"IHpEouyhrY"}],"key":"kNDF5gwkmH"},{"type":"paragraph","position":{"start":{"line":76,"column":1},"end":{"line":78,"column":1}},"children":[{"type":"text","value":"We will first take a quick detour to introduce the ","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"lGtc7jEGrI"},{"type":"emphasis","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"empirical risk minimization","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"FgQk8hbmm3"}],"key":"dO0BLT0HY0"},{"type":"text","value":" framework for function approximation.\nWe will then see its application to ","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"L9n4AUEZZn"},{"type":"emphasis","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"fitted","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"tru3eTC2cK"}],"key":"lPySSChZ9i"},{"type":"text","value":" RL algorithms,\nwhich attempt to learn the optimal value function (and the optimal policy) from a dataset of 
trajectories.","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"irE08Ho0DB"}],"key":"KtLGc3YPXt"},{"type":"heading","depth":2,"position":{"start":{"line":81,"column":1},"end":{"line":81,"column":1}},"children":[{"type":"text","value":"Empirical risk minimization","position":{"start":{"line":81,"column":1},"end":{"line":81,"column":1}},"key":"J4pJL4LPPG"}],"label":"erm","identifier":"erm","html_id":"erm","enumerator":"5.2","key":"daYxPZ4nzE"},{"type":"paragraph","position":{"start":{"line":83,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"UePExnuC8i"},{"type":"strong","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"dic9phjyjf"}],"key":"jlCCH3iI25"},{"type":"text","value":" task is as follows:\nWe seek to learn the relationship between some input variables ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"UBLvSwRqoP"},{"type":"inlineMath","value":"x","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"xxx","key":"tNNbXkZxbq"},{"type":"text","value":" and some output variable ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"qgzYJVlMqW"},{"type":"inlineMath","value":"y","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"yyy","key":"GWKAaLfRkP"},{"type":"text","value":"\n(drawn from their joint distribution).\nPrecisely, we want to find a function ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"Gs3JjTaX0U"},{"type":"inlineMath","value":"\\hat f : x \\mapsto y","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"f^:xy\\hat f : x \\mapsto yf^:xy","key":"m1PXOXHfMq"},{"type":"text","value":" that minimizes the\n","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"lTVfBsTOe8"},{"type":"emphasis","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"squared error","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"HohtYpgnLd"}],"key":"zNKkVww1Ui"},{"type":"text","value":" of the prediction:","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"j1CmH3DjMo"}],"key":"RuCHW3YOts"},{"type":"math","value":"\\hat f = \\arg\\min_{f} \\E[(y - f(x))^2]","position":{"start":{"line":89,"column":1},"end":{"line":91,"column":1}},"html":"f^=argminfE[(yf(x))2]\\hat f = \\arg\\min_{f} \\E[(y - f(x))^2]f^=argfminE[(yf(x))2]","enumerator":"5.1","key":"DCrVEZxLbx"},{"type":"paragraph","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"text","value":"An equivalent framing is that we seek to approximate the ","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"YikV8mwDGP"},{"type":"emphasis","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"text","value":"conditional expectation","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"ttz09LMm6V"}],"key":"TvmRpv4EEw"},{"type":"text","value":" of 
","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"upvwC67cul"},{"type":"inlineMath","value":"y","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"html":"yyy","key":"UEVjAdftck"},{"type":"text","value":" given ","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"GoqXQju7Wd"},{"type":"inlineMath","value":"x","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"html":"xxx","key":"v8VyCzK6Y0"},{"type":"text","value":":","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"gT8zoDLiWo"}],"key":"veRPme754J"},{"type":"proof","kind":"theorem","label":"conditional_expectation_minimizes_mse","identifier":"conditional_expectation_minimizes_mse","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Conditional expectation minimizes mean squared error","position":{"start":{"line":95,"column":1},"end":{"line":95,"column":1}},"key":"VoFIvseYSS"}],"key":"l88teB114J"},{"type":"math","value":"\\arg\\min_{f} \\E[(y - f(x))^2] = (x \\mapsto \\E[y \\mid x])","position":{"start":{"line":98,"column":1},"end":{"line":100,"column":1}},"html":"argminfE[(yf(x))2]=(xE[yx])\\arg\\min_{f} \\E[(y - f(x))^2] = (x \\mapsto \\E[y \\mid x])argfminE[(yf(x))2]=(xE[yx])","enumerator":"5.2","key":"ngPzQJpHW4"}],"enumerator":"5.1","html_id":"conditional-expectation-minimizes-mse","key":"XmtAV9JywA"},{"type":"proof","kind":"proof","enumerated":true,"children":[{"type":"paragraph","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"children":[{"type":"text","value":"We can decompose the mean squared error as","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"PiSvNtEXPi"}],"key":"aJtBS42fE7"},{"type":"math","value":"\\begin{aligned}\n\\E[(y - f(x))^2] &= \\E[ (y - \\E[y \\mid x] + \\E[y \\mid x] - f(x))^2 ] \\\\\n&= \\E[ (y - \\E[y \\mid x])^2 ] + \\E[ (\\E[y \\mid x] - f(x))^2 ] + 2 \\E[ (y - \\E[y \\mid x])(\\E[y \\mid x] - f(x)) ] \\\\\n\\end{aligned}","position":{"start":{"line":106,"column":1},"end":{"line":111,"column":1}},"html":"E[(yf(x))2]=E[(yE[yx]+E[yx]f(x))2]=E[(yE[yx])2]+E[(E[yx]f(x))2]+2E[(yE[yx])(E[yx]f(x))]\\begin{aligned}\n\\E[(y - f(x))^2] &= \\E[ (y - \\E[y \\mid x] + \\E[y \\mid x] - f(x))^2 ] \\\\\n&= \\E[ (y - \\E[y \\mid x])^2 ] + \\E[ (\\E[y \\mid x] - f(x))^2 ] + 2 \\E[ (y - \\E[y \\mid x])(\\E[y \\mid x] - f(x)) ] \\\\\n\\end{aligned}E[(yf(x))2]=E[(yE[yx]+E[yx]f(x))2]=E[(yE[yx])2]+E[(E[yx]f(x))2]+2E[(yE[yx])(E[yx]f(x))]","enumerator":"5.3","key":"ZYeOCugr19"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"ISp3ohiugU"}],"key":"AObrviuE38"},{"type":"paragraph","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"children":[{"type":"text","value":"Use the law of iterated expectations to show that the last term is zero.","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"nHVVB9OA2S"}],"key":"ro9LiKi9Tc"}],"key":"P4k2N5m1Sy"},{"type":"paragraph","position":{"start":{"line":117,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"The first term is the irreducible error, and the second term is the error due to the approximation,\nwhich is minimized at 
","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"LYMfXMFPuV"},{"type":"text","value":"0","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"o0pvx5vkiV"},{"type":"text","value":" when ","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"GbedGHYrtj"},{"type":"inlineMath","value":"f(x) = \\E[y \\mid x]","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"html":"f(x)=E[yx]f(x) = \\E[y \\mid x]f(x)=E[yx]","key":"tjspicOxKK"},{"type":"text","value":".","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"cthIAVYcoA"}],"key":"PLPsqAeRp4"}],"enumerator":"5.1","key":"LzmFsAokXx"},{"type":"paragraph","position":{"start":{"line":121,"column":1},"end":{"line":127,"column":1}},"children":[{"type":"text","value":"In most applications, the joint distribution of ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"lTs5AOPX7s"},{"type":"inlineMath","value":"x, y","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"x,yx, yx,y","key":"bKNVqtWmvy"},{"type":"text","value":" is unknown or extremely complex, and so we can’t\nanalytically evaluate ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"Tx7LqLomNj"},{"type":"inlineMath","value":"\\E [y \\mid x]","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"E[yx]\\E [y \\mid x]E[yx]","key":"QzHY1F73SB"},{"type":"text","value":".\nInstead, our strategy is to draw ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"lswburjOdQ"},{"type":"inlineMath","value":"N","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"NNN","key":"BNps5ZlAYi"},{"type":"text","value":" samples ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"qnNl21LFEC"},{"type":"inlineMath","value":"(x_i, y_i)","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"(xi,yi)(x_i, y_i)(xi,yi)","key":"IY7QgQIjer"},{"type":"text","value":" from the joint distribution of ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"Vk6LEMHKGA"},{"type":"inlineMath","value":"x","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"xxx","key":"EsklIgjZNO"},{"type":"text","value":" and ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"iNYXmMvyeq"},{"type":"inlineMath","value":"y","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"yyy","key":"HjuMxHmNWw"},{"type":"text","value":",\nand then use the ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"GHvkpJBK8V"},{"type":"emphasis","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"children":[{"type":"text","value":"sample average","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"XRhdHlNm4U"}],"key":"ZOI4XrExD5"},{"type":"text","value":" ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"eSxzjVVPw8"},{"type":"inlineMath","value":"\\sum_{i=1}^N (y_i - f(x_i))^2 / N","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"i=1N(yif(xi))2/N\\sum_{i=1}^N (y_i - f(x_i))^2 / Ni=1N(yif(xi))2/N","key":"jWY4Madoh7"},{"type":"text","value":" to approximate the mean squared error.\nThen we use a 
","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"CFF11HEAUP"},{"type":"emphasis","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"children":[{"type":"text","value":"fitting method","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"D7dwod2q9o"}],"key":"MsG1HFbXw6"},{"type":"text","value":" to find a function ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"L3Y2WhXtie"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"f^\\hat ff^","key":"OT9gUl5Hu6"},{"type":"text","value":" that minimizes this objective\nand thus approximates the conditional expectation.\nThis approach is called ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"KktbbE9cNC"},{"type":"strong","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"children":[{"type":"text","value":"empirical risk minimization","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"zNCAOb71gu"}],"key":"D6f771VvIN"},{"type":"text","value":".","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"Dwg2XEKbRs"}],"key":"sCfCY0PSqX"},{"type":"proof","kind":"definition","label":"empirical_risk_minimization","identifier":"empirical_risk_minimization","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Empirical risk minimization","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"QP7QLe41CY"}],"key":"QqWXWyEX6v"},{"type":"paragraph","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"Given a dataset of samples ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"Ris3K8BoCm"},{"type":"inlineMath","value":"(x_1, y_1), \\dots, (x_N, y_N)","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"(x1,y1),,(xN,yN)(x_1, y_1), \\dots, (x_N, y_N)(x1,y1),,(xN,yN)","key":"Myax5yozfy"},{"type":"text","value":", empirical risk minimization seeks to find a function ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"udPhZa4FHh"},{"type":"inlineMath","value":"f","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"fff","key":"DNRVidgO35"},{"type":"text","value":" (from some class of functions ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"yxegKu6D0p"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"F\\mathcal{F}F","key":"sAlZdjkQqr"},{"type":"text","value":") that minimizes the empirical risk:","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"FoBafnPZTM"}],"key":"WAnQxDkwD6"},{"type":"math","value":"\\hat f = \\arg\\min_{f \\in \\mathcal{F}} \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2","position":{"start":{"line":134,"column":1},"end":{"line":136,"column":1}},"html":"f^=argminfF1Ni=1N(yif(xi))2\\hat f = \\arg\\min_{f \\in \\mathcal{F}} \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2f^=argfFminN1i=1N(yif(xi))2","enumerator":"5.4","key":"G2tWVQVy7a"},{"type":"paragraph","position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"children":[{"type":"text","value":"We will cover the details of the minimization process in [](#the next section 
).","position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"key":"RkNFKFzt7J"}],"key":"N5g2QskjOX"}],"enumerator":"5.1","html_id":"empirical-risk-minimization","key":"IEw4BO9NRG"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"c9JHUdi4LO"}],"key":"XrzNtL0tIo"},{"type":"paragraph","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"children":[{"type":"text","value":"Why is it important that we constrain our search to a class of functions ","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"key":"uOxcoy4cdG"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"html":"F\\mathcal{F}F","key":"oTESj6tfWk"},{"type":"text","value":"?","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"key":"JqFT61tEMo"}],"key":"qSn9VEv1y5"},{"type":"paragraph","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Hint: Consider the function ","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"wg3ovtdw1N"},{"type":"inlineMath","value":"f(x) = \\sum_{i=1}^N y_i \\mathbb{1}_{\\{ x = x_i \\}}","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"html":"f(x)=i=1Nyi1{x=xi}f(x) = \\sum_{i=1}^N y_i \\mathbb{1}_{\\{ x = x_i \\}}f(x)=i=1Nyi1{x=xi}","key":"QxDNMqvFb1"},{"type":"text","value":". What is the empirical risk of this function? Would you consider it a good approximation of the conditional expectation?","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"BxfHHGxdum"}],"key":"svxsepspFT"}],"key":"Ol6ogFaF7m"},{"type":"heading","depth":2,"position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"Fitted value iteration","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"OMiDYJgcHg"}],"identifier":"fitted-value-iteration","label":"Fitted value iteration","html_id":"fitted-value-iteration","implicit":true,"enumerator":"5.3","key":"DSdq4wTYlY"},{"type":"paragraph","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"Let us apply ERM to the RL problem of computing the optimal policy / value function.","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"OMSc6EC4F7"}],"key":"U7J2i7qSbO"},{"type":"paragraph","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"How did we compute the optimal value function in MDPs with ","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"hLN1EgA6Sf"},{"type":"emphasis","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"finite","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"PoAx3gauBO"}],"key":"qYfUgvSmg9"},{"type":"text","value":" state and action 
spaces?","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"UqFxJznhjb"}],"key":"qljQk3gJem"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":153,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":153,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"text","value":"In a [](#finite-horizon MDP ), we can use ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"ezwGmJEAQ9"},{"type":"crossReference","kind":"proof:definition","identifier":"pi_star_dp","label":"pi_star_dp","children":[{"type":"text","value":"dynamic programming","key":"yWZluXBV7B"}],"template":"Definition %s","enumerator":"1.11","resolved":true,"html_id":"pi-star-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"hS4ZwIEeQb"},{"type":"text","value":", working backwards from the end of the time horizon, to compute the optimal value function exactly.","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"w9Na2NUC6q"}],"key":"NEpyma5nUr"}],"key":"YWSVRL12xg"},{"type":"listItem","spread":true,"position":{"start":{"line":155,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"text","value":"In an [](#infinite-horizon MDP ), we can use [](#value iteration ), which iterates the Bellman optimality operator ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"fuAly8PGfR"},{"type":"crossReference","kind":"equation","identifier":"bellman_optimality_operator","label":"bellman_optimality_operator","children":[{"type":"text","value":"(","key":"D97cXjgzvO"},{"type":"text","value":"1.54","key":"x9gRRZa2Vv"},{"type":"text","value":")","key":"WN7YliAxm0"}],"template":"(%s)","enumerator":"1.54","resolved":true,"html_id":"bellman-optimality-operator","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"wqYMDx00P0"},{"type":"text","value":" to approximately compute the optimal value function.","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"J9fVgvq8uO"}],"key":"cmXzXcuLkx"}],"key":"vsdhevnuDL"}],"key":"T5Vb2ozkbr"},{"type":"paragraph","position":{"start":{"line":157,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"Our existing approaches represent the value function, and the MDP itself,\nin matrix notation.\nBut what happens if the state space is extremely large, or even infinite (e.g. 
real-valued)?\nThen computing a weighted sum over all possible next states, which is required to compute the Bellman operator,\nbecomes intractable.","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"key":"LaeZGNJPcB"}],"key":"YWyCV5Tpfx"},{"type":"paragraph","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"Instead, we will need to use ","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"Ff72srpb81"},{"type":"emphasis","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"function approximation","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"syoIJTVkMC"}],"key":"GUbIT7gnTN"},{"type":"text","value":" methods from supervised learning to solve for the value function in an alternative way.","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"qNryfGV8I5"}],"key":"YZVH09GXVW"},{"type":"paragraph","position":{"start":{"line":165,"column":1},"end":{"line":166,"column":1}},"children":[{"type":"text","value":"In particular, suppose we have a dataset of ","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"XSXwgnweRu"},{"type":"inlineMath","value":"N","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"html":"NNN","key":"jXGiqlSR6b"},{"type":"text","value":" trajectories ","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"kSnysfw6Sm"},{"type":"inlineMath","value":"\\tau_1, \\dots, \\tau_N \\sim \\rho_{\\pi}","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"html":"τ1,,τNρπ\\tau_1, \\dots, \\tau_N \\sim \\rho_{\\pi}τ1,,τNρπ","key":"rsUUJrMq0y"},{"type":"text","value":" from some policy ","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"Tv2YctNzXB"},{"type":"text","value":"π","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"HMwg8yWmym"},{"type":"text","value":" (called the ","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"C4fbr6vukh"},{"type":"strong","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"children":[{"type":"text","value":"data collection policy","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"Wx2Tx9DhJN"}],"key":"BMsPWda8FK"},{"type":"text","value":") acting in the MDP of interest.\nLet us indicate the trajectory index in the superscript, so that","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"gdQPClsW6v"}],"key":"zTDLPiFyX5"},{"type":"math","value":"\\tau_i = \\{ s_0^i, a_0^i, r_0^i, s_1^i, a_1^i, r_1^i, \\dots, s_{\\hor-1}^i, a_{\\hor-1}^i, r_{\\hor-1}^i \\}.","position":{"start":{"line":168,"column":1},"end":{"line":170,"column":1}},"html":"τi={s0i,a0i,r0i,s1i,a1i,r1i,,sH1i,aH1i,rH1i}.\\tau_i = \\{ s_0^i, a_0^i, r_0^i, s_1^i, a_1^i, r_1^i, \\dots, s_{\\hor-1}^i, a_{\\hor-1}^i, r_{\\hor-1}^i \\}.τi={s0i,a0i,r0i,s1i,a1i,r1i,,sH1i,aH1i,rH1i}.","enumerator":"5.5","key":"moJnQn8Heo"}],"key":"hAo1JVLTvh"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def collect_data(\n env: gym.Env, N: int, H: int, key: rand.PRNGKey, π: Optional[Policy] = None\n) -> list[Trajectory]:\n \"\"\"Collect a dataset of trajectories from the given policy (or a random one).\"\"\"\n trajectories = []\n seeds = 
[rand.bits(k).item() for k in rand.split(key, N)]\n for i in tqdm(range(N)):\n τ = []\n s, _ = env.reset(seed=seeds[i])\n for h in range(H):\n # sample from a random policy\n a = π(s, h) if π else env.action_space.sample()\n s_next, r, terminated, truncated, _ = env.step(a)\n τ.append(Transition(s, a, r))\n if terminated or truncated:\n break\n s = s_next\n trajectories.append(τ)\n return trajectories","key":"VnXdO7psKF"},{"type":"output","id":"UisSZXHmY_Iaacpwxvgoh","data":[],"key":"B9KgvEuUB2"}],"data":{},"key":"XckbXksuig"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"env = gym.make(\"LunarLander-v2\")\ntrajectories = collect_data(env, 100, 300, key)\ntrajectories[0][:5] # show first five transitions from first trajectory","key":"Fo9Ca27WfF"},{"type":"output","id":"k8YtlaYDO0W5vIl6NaGub","data":[{"output_type":"stream","name":"stderr","text":"\r 0%| | 0/100 [00:00Qh(s,a)=r(s,a)+EsP(s,a)[maxaQh+1(s,a)]Q_\\hi^\\star(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [\\max_{a'} Q_{\\hi+1}^\\star(s', a')]Qh(s,a)=r(s,a)+EsP(s,a)[amaxQh+1(s,a)]","enumerator":"5.6","key":"Q48u4rORy3"},{"type":"paragraph","position":{"start":{"line":208,"column":1},"end":{"line":209,"column":1}},"children":[{"type":"text","value":"We can think of the arguments to the Q-function -- i.e. the current state, action, and timestep ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"nUAiRpNLoy"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"html":"h\\hih","key":"T8gaSaBZDf"},{"type":"text","value":" --\nas the inputs ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"qSrrqTCcqU"},{"type":"inlineMath","value":"x","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"html":"xxx","key":"xTFoIb80Ot"},{"type":"text","value":", and the r.h.s. of the above equation as the label ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"NlY6UR25T1"},{"type":"inlineMath","value":"f(x)","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"html":"f(x)f(x)f(x)","key":"nh9KDOzArj"},{"type":"text","value":". Note that the r.h.s. 
can also be expressed as a ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"vjJwb7AM9Y"},{"type":"strong","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"children":[{"type":"text","value":"conditional expectation","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"khQYtvKMJ3"}],"key":"IA3Voo1jw9"},{"type":"text","value":":","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"EIzZGJ6pJd"}],"key":"rfdK17jep1"},{"type":"math","value":"f(x) = \\E [y \\mid x] \\quad \\text{where} \\quad y = r(s_\\hi, a_\\hi) + \\max_{a'} Q^\\star_{\\hi + 1}(s', a').","position":{"start":{"line":211,"column":1},"end":{"line":213,"column":1}},"html":"f(x)=E[yx]wherey=r(sh,ah)+maxaQh+1(s,a).f(x) = \\E [y \\mid x] \\quad \\text{where} \\quad y = r(s_\\hi, a_\\hi) + \\max_{a'} Q^\\star_{\\hi + 1}(s', a').f(x)=E[yx]wherey=r(sh,ah)+amaxQh+1(s,a).","enumerator":"5.7","key":"MxjtTzmIff"},{"type":"paragraph","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"children":[{"type":"text","value":"Approximating the conditional expectation is precisely the task that ","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"key":"VHSEGAQ2nv"},{"type":"crossReference","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"children":[{"type":"text","value":"Section ","key":"LKqjUWS9dt"},{"type":"text","value":"5.2","key":"PuzsTuM9Sv"}],"identifier":"erm","label":"erm","kind":"heading","template":"Section %s","enumerator":"5.2","resolved":true,"html_id":"erm","key":"JYefFqnsn0"},{"type":"text","value":" is suited for!","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"key":"l3V7Ovd7Du"}],"key":"k6Fn3sMn59"},{"type":"paragraph","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"children":[{"type":"text","value":"Our above dataset would give us ","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"kQW2eQCEQh"},{"type":"inlineMath","value":"N \\cdot \\hor","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"html":"NHN \\cdot \\horNH","key":"wGq7SNMnSN"},{"type":"text","value":" samples in the dataset:","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"nGzbKsWAdM"}],"key":"hDUo10d5V0"},{"type":"math","value":"x_{i \\hi} = (s_\\hi^i, a_\\hi^i, \\hi) \\qquad y_{i \\hi} = r(s_\\hi^i, a_\\hi^i) + \\max_{a'} Q^\\star_{\\hi + 1}(s_{\\hi + 1}^i, a')","position":{"start":{"line":219,"column":1},"end":{"line":221,"column":1}},"html":"xih=(shi,ahi,h)yih=r(shi,ahi)+maxaQh+1(sh+1i,a)x_{i \\hi} = (s_\\hi^i, a_\\hi^i, \\hi) \\qquad y_{i \\hi} = r(s_\\hi^i, a_\\hi^i) + \\max_{a'} Q^\\star_{\\hi + 1}(s_{\\hi + 1}^i, a')xih=(shi,ahi,h)yih=r(shi,ahi)+amaxQh+1(sh+1i,a)","enumerator":"5.8","key":"Fs1AcvHuj4"}],"key":"bQM3ydcxQ5"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def get_X(trajectories: list[Trajectory]):\n \"\"\"\n We pass the state and timestep as input to the Q-function\n and return an array of Q-values.\n \"\"\"\n rows = [(τ[h].s, τ[h].a, h) for τ in trajectories for h in range(len(τ))]\n return [np.stack(ary) for ary in zip(*rows)]\n\n\ndef get_y(\n trajectories: list[Trajectory],\n f: Optional[QFunction] = None,\n π: Optional[Policy] = None,\n):\n \"\"\"\n Transform the dataset of trajectories into a dataset for supervised learning.\n If `π` is None, 
instead estimates the optimal Q function.\n Otherwise, estimates the Q function of π.\n \"\"\"\n f = f or Q_zero(get_num_actions(trajectories))\n y = []\n for τ in trajectories:\n for h in range(len(τ) - 1):\n s, a, r = τ[h]\n Q_values = f(s, h + 1)\n y.append(r + (Q_values[π(s, h + 1)] if π else Q_values.max()))\n y.append(τ[-1].r)\n return np.array(y)","key":"pBjV6iPEV5"},{"type":"output","id":"lYkVtBQEcerGUWBq-34fL","data":[],"key":"cJGe6pp4lK"}],"data":{},"key":"EPgVeJRIt6"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"s, a, h = get_X(trajectories[:1])\nprint(\"states:\", s[:5])\nprint(\"actions:\", a[:5])\nprint(\"timesteps:\", h[:5])","key":"QBQ2QxPmL7"},{"type":"output","id":"dDa0W4zHLWpUlEjucWk1A","data":[{"output_type":"stream","name":"stdout","text":"states: [[-0.00767412 1.4020356 -0.77731264 -0.39489663 0.00889908 0.17607279\n 0. 0. ]\n [-0.01526899 1.392572 -0.766254 -0.42065707 0.01559265 0.13388489\n 0. 0. ]\n [-0.02286405 1.3825084 -0.7662748 -0.44735536 0.02228237 0.13380653\n 0. 0. ]\n [-0.0304594 1.3718452 -0.7662946 -0.4740309 0.02897082 0.13378178\n 0. 0. ]\n [-0.03802614 1.361714 -0.7636849 -0.45042533 0.03589968 0.1385901\n 0. 0. ]]\nactions: [3 0 0 2 2]\ntimesteps: [0 1 2 3 4]\n"}],"key":"qsMc4JRcHj"}],"data":{},"key":"iLY7TMZfpj"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"get_y(trajectories[:1])[:5]","key":"XOFyf8OdvZ"},{"type":"output","id":"01EPeySRKZVc8Ir-eoDKQ","data":[{"output_type":"execute_result","execution_count":6,"metadata":{},"data":{"text/plain":{"content":"Array([ 0.01510799, -0.9906127 , -0.9934895 , 1.4450092 , 0.43907362], dtype=float32)","content_type":"text/plain"}}}],"key":"gg0ibtQ6Vo"}],"data":{},"key":"BF90lrezTS"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"Then we can use empirical risk minimization to find a function ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"ACAokekxbX"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"html":"f^\\hat ff^","key":"oxYwdzqw1X"},{"type":"text","value":" that approximates the optimal Q-function.","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"lvhZju6wEK"}],"key":"s7fuV96oVd"}],"key":"blJFEsSX1c"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"# We will see some examples of fitting methods in the next section\nFittingMethod = Callable[[Float[Array, \"N D\"], Float[Array, \" N\"]], QFunction]","key":"h1LQR20tfD"},{"type":"output","id":"Ljiokj6taEDHmUOKW7Whn","data":[],"key":"yNmUvQLZIP"}],"data":{},"key":"jqE2WsZNFG"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":272,"column":1},"end":{"line":276,"column":1}},"children":[{"type":"text","value":"But notice that the definition of ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"nUaSfFkc4z"},{"type":"inlineMath","value":"y_{i \\hi}","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"html":"yihy_{i \\hi}yih","key":"e6V3DTfR7E"},{"type":"text","value":" depends on the Q-function itself!\nHow can we resolve this circular dependency?\nRecall that we faced the same issue 
","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"HKaBB9p8pr"},{"type":"crossReference","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"children":[{"type":"text","value":"when evaluating a policy in an infinite-horizon MDP","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"pfyituWKlt"}],"identifier":"iterative_pe","label":"iterative_pe","kind":"heading","template":"Section %s","enumerator":"1.5.2.2","resolved":true,"html_id":"iterative-pe","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"gHQbSGaZUj"},{"type":"text","value":". There, we iterated the ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"iVTcLNmKBF"},{"type":"crossReference","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"children":[{"type":"text","value":"Definition ","key":"nUSexCMHB9"},{"type":"text","value":"1.8","key":"AAFXdefdTQ"}],"identifier":"bellman_operator","label":"bellman_operator","kind":"proof:definition","template":"Definition %s","enumerator":"1.8","resolved":true,"html_id":"bellman-operator","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"RNy4GG8lxh"},{"type":"text","value":" since we knew that the policy’s value function was a fixed point of the policy’s Bellman operator.\nWe can apply the same strategy here, using the ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"g4sKt18UiW"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"html":"f^\\hat ff^","key":"BXWXvYmw1q"},{"type":"text","value":" from the previous iteration to compute the labels ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"NIhElsxbe6"},{"type":"inlineMath","value":"y_{i \\hi}","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"html":"yihy_{i \\hi}yih","key":"XwuiiQjnlY"},{"type":"text","value":",\nand then using this new dataset to fit the next iterate.","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"IcWX8N7cHk"}],"key":"nJXLw9mDJV"},{"type":"proof","kind":"definition","label":"fitted_q_iteration","identifier":"fitted_q_iteration","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Fitted Q-function iteration","position":{"start":{"line":278,"column":1},"end":{"line":278,"column":1}},"key":"QkiIGQX1ly"}],"key":"n8PCqkLzN9"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":281,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"children":[{"type":"text","value":"Initialize some function ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"qju2OYTC9i"},{"type":"inlineMath","value":"\\hat f(s, a, h) \\in \\mathbb{R}","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"f^(s,a,h)R\\hat f(s, a, h) \\in \\mathbb{R}f^(s,a,h)R","key":"n864YOwkEl"},{"type":"text","value":".","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"GbvBdQQ3F5"}],"key":"xwEZoDB4Re"},{"type":"listItem","spread":true,"position":{"start":{"line":282,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"text","value":"Iterate the 
following:","position":{"start":{"line":282,"column":1},"end":{"line":282,"column":1}},"key":"SlsFYKMavk"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":283,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"text","value":"Generate a supervised learning dataset ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"pPFR6AqBJe"},{"type":"inlineMath","value":"X, y","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"html":"X,yX, yX,y","key":"y157PLAwR6"},{"type":"text","value":" from the trajectories and the current estimate ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"UDloivl6vv"},{"type":"inlineMath","value":"f","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"html":"fff","key":"k4XDxFG0gB"},{"type":"text","value":", where the labels come from the r.h.s. of the Bellman optimality operator ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"i76doX051C"},{"type":"crossReference","kind":"equation","identifier":"bellman_optimality_operator","label":"bellman_optimality_operator","children":[{"type":"text","value":"(","key":"cPXMEvzw9u"},{"type":"text","value":"1.54","key":"Mxvys2sD88"},{"type":"text","value":")","key":"m1dN1fdx4P"}],"template":"(%s)","enumerator":"1.54","resolved":true,"html_id":"bellman-optimality-operator","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"x9RFFkUBzc"}],"key":"b2mGk3Q1xN"}],"key":"pf98YPhgKk"},{"type":"listItem","spread":true,"position":{"start":{"line":284,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"HI4Oqn8UAe"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"f^\\hat ff^","key":"xwmUo4f9OB"},{"type":"text","value":" to the function that minimizes the empirical risk:","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"cXlNw33bQX"}],"key":"AOeveyjDz0"},{"type":"math","value":"\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2.","position":{"start":{"line":286,"column":1},"end":{"line":286,"column":1}},"html":"f^argminf1Ni=1N(yif(xi))2.\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2.f^argfminN1i=1N(yif(xi))2.","enumerator":"5.9","key":"yjCnmYxqlY"}],"key":"ICUqqe9PZ8"}],"key":"p9LFLk2wMG"}],"key":"WEKim84DvH"}],"key":"ZRt77LG3Gl"}],"enumerator":"5.2","html_id":"fitted-q-iteration","key":"fI5QPUOycH"}],"key":"rSSDvwKMhy"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fitted_q_iteration(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n epochs: int,\n Q_init: Optional[QFunction] = None,\n) -> QFunction:\n \"\"\"\n Run fitted Q-function iteration using the given dataset.\n Returns an estimate of the optimal Q-function.\n \"\"\"\n Q_hat = Q_init or Q_zero(get_num_actions(trajectories))\n X = get_X(trajectories)\n for _ in range(epochs):\n y = get_y(trajectories, Q_hat)\n Q_hat = fit(X, y)\n 
return Q_hat","key":"o1zc5qC3ZN"},{"type":"output","id":"rngIsQEeWQnWrE8fSIwLD","data":[],"key":"TFMp2AwzdB"}],"data":{},"key":"YHrz0n2YDh"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"We can also use this fixed-point interation to ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"ktT9KYvGuM"},{"type":"emphasis","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"evaluate","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"sHQaOQ9TAN"}],"key":"hMbWTku6AE"},{"type":"text","value":" a policy using the dataset (not necessarily the one used to generate the trajectories):","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"uwgQSxVB6t"}],"key":"vdGRkbSaT3"},{"type":"proof","kind":"definition","label":"fitted_evaluation","identifier":"fitted_evaluation","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Fitted policy evaluation","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"key":"hyHN7K7ne1"}],"key":"GFj2sEY2rF"},{"type":"paragraph","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"children":[{"type":"strong","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"children":[{"type":"text","value":"Input:","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"key":"sUguAbYoxS"}],"key":"m1Q9RN31WU"},{"type":"text","value":" Policy ","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"key":"jj777EKfGD"},{"type":"inlineMath","value":"\\pi : \\mathcal{S} \\times [H] \\to \\Delta(\\mathcal{A})","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"html":"π:S×[H]Δ(A)\\pi : \\mathcal{S} \\times [H] \\to \\Delta(\\mathcal{A})π:S×[H]Δ(A)","key":"kJROcJ8012"},{"type":"text","value":" to be evaluated.","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"key":"ceQnDUqnJB"}],"key":"XVMKATYUPj"},{"type":"paragraph","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"children":[{"type":"strong","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"children":[{"type":"text","value":"Output:","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"RedKPUMZOF"}],"key":"ZrGO4xZPlq"},{"type":"text","value":" An approximation of the value function ","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"ttDuWm3y93"},{"type":"inlineMath","value":"Q^\\pi","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"html":"QπQ^\\piQπ","key":"sGhnB6b72I"},{"type":"text","value":" of the policy.","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"NrkuuNvxe1"}],"key":"L13r0KAKpC"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":317,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":317,"column":1},"end":{"line":317,"column":1}},"children":[{"type":"text","value":"Initialize some function ","position":{"start":{"line":317,"column":1},"end":{"line":317,"column":1}},"key":"xmk16C4cyH"},{"type":"inlineMath","value":"\\hat f(s, a, h) \\in 
\\mathbb{R}","position":{"start":{"line":317,"column":1},"end":{"line":317,"column":1}},"html":"f^(s,a,h)R\\hat f(s, a, h) \\in \\mathbb{R}f^(s,a,h)R","key":"lsNwBMLXMX"},{"type":"text","value":".","position":{"start":{"line":317,"column":1},"end":{"line":317,"column":1}},"key":"VawkbTb3Ga"}],"key":"SgxAgW7zn7"},{"type":"listItem","spread":true,"position":{"start":{"line":318,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"text","value":"Iterate the following:","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"NvYkS2ytBl"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":319,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"children":[{"type":"text","value":"Generate a supervised learning dataset ","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"jgIlEXYXo1"},{"type":"inlineMath","value":"X, y","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"html":"X,yX, yX,y","key":"lgGDZnqtEf"},{"type":"text","value":" from the trajectories and the current estimate ","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"tHVWm4nXM0"},{"type":"inlineMath","value":"f","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"html":"fff","key":"jOXWavhNJP"},{"type":"text","value":", where the labels come from the r.h.s. of the ","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"DrMyGGrx5C"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Bellman consistency equation","key":"Cp9RJPvLpk"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"xUpEAteCfT"},{"type":"text","value":" for the given policy.","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"LlM8vegrBk"}],"key":"vRnNwki4in"}],"key":"fU3fKyFxaG"},{"type":"listItem","spread":true,"position":{"start":{"line":320,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"T8vXVZSW0A"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"html":"f^\\hat ff^","key":"Lqbt1RPscm"},{"type":"text","value":" to the function that minimizes the empirical risk:","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"P6IuC26wbE"}],"key":"ehoSJXl8gL"},{"type":"math","value":"\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2.","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"html":"f^argminf1Ni=1N(yif(xi))2.\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - 
f(x_i))^2.f^argfminN1i=1N(yif(xi))2.","enumerator":"5.10","key":"PSak6mBlEs"}],"key":"SIY568aH3c"}],"key":"zZJsyYzSAM"}],"key":"y6jZwToQ5m"}],"key":"CcYmYc9Wkv"}],"enumerator":"5.3","html_id":"fitted-evaluation","key":"hPBntLgH7L"}],"key":"XAbpFbgMHT"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fitted_evaluation(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n π: Policy,\n epochs: int,\n Q_init: Optional[QFunction] = None,\n) -> QFunction:\n \"\"\"\n Run fitted policy evaluation using the given dataset.\n Returns an estimate of the Q-function of the given policy.\n \"\"\"\n Q_hat = Q_init or Q_zero(get_num_actions(trajectories))\n X = get_X(trajectories)\n for _ in tqdm(range(epochs)):\n y = get_y(trajectories, Q_hat, π)\n Q_hat = fit(X, y)\n return Q_hat","key":"AQzD5Z9HTe"},{"type":"output","id":"EInXwQepFk5Y8NUP9xEMj","data":[],"key":"BTIx5W4TMO"}],"data":{},"key":"ao2yNlgjFY"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"FnAN6cE9DJ"}],"key":"Nn9wbVowVF"},{"type":"paragraph","position":{"start":{"line":346,"column":1},"end":{"line":347,"column":1}},"children":[{"type":"text","value":"Spot the difference between ","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"A36fnja6db"},{"type":"inlineCode","value":"fitted_evaluation","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"N37XUepxiq"},{"type":"text","value":" and ","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"UVAAn11otM"},{"type":"inlineCode","value":"fitted_q_iteration","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"YQWKapkleE"},{"type":"text","value":". (See the definition of ","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"CW0quZNwAs"},{"type":"inlineCode","value":"get_y","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"uqyKMIBVP5"},{"type":"text","value":".)\nHow would you modify this algorithm to evaluate the data collection policy?","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"xAC0VX7K1w"}],"key":"rIdgYw7CvI"}],"key":"ZNOumUs6yX"},{"type":"paragraph","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"children":[{"type":"text","value":"We can use this policy evaluation algorithm to adapt the [](#policy iteration algorithm ) to this new setting. The algorithm remains exactly the same -- repeatedly make the policy greedy w.r.t. its own value function -- except now we must evaluate the policy (i.e. 
compute its value function) using the iterative ","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"LriuiQbLT7"},{"type":"inlineCode","value":"fitted_evaluation","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"w0HMYs6ICS"},{"type":"text","value":" algorithm.","position":{"start":{"line":350,"column":1},"end":{"line":350,"column":1}},"key":"IttOH4YFZ0"}],"key":"VU6kujl1Jo"}],"key":"qvnNIC0zHU"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fitted_policy_iteration(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n epochs: int,\n evaluation_epochs: int,\n π_init: Optional[Policy] = lambda s, h: 0, # constant zero policy\n):\n \"\"\"Run fitted policy iteration using the given dataset.\"\"\"\n π = π_init\n for _ in range(epochs):\n Q_hat = fitted_evaluation(trajectories, fit, π, evaluation_epochs)\n π = q_to_greedy(Q_hat)\n return π","key":"IquIIeYl9F"},{"type":"output","id":"mk7k8PhH1ign1fEqp3iON","data":[],"key":"QERQlrvDer"}],"data":{},"key":"j0retBrZDf"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":368,"column":1},"end":{"line":368,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":368,"column":1},"end":{"line":368,"column":1}},"key":"KaHnU7Hxxx"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"5.4","key":"tOyuOJwbtV"}],"key":"AAlfb1DQvU"}],"key":"wwNI7tyPYs"},"references":{"cite":{"order":[],"data":{}}},"footer":{"navigation":{"prev":{"title":"4 Supervised learning","url":"/supervised-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"6 Policy Gradient Methods","url":"/pg","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"cc9729b7aa8aecc6488688bc0e326dc948ac0671629364a9c0b2425054c6e6c6","slug":"fitted-dp","location":"/fitted_dp.md","dependencies":[],"frontmatter":{"title":"5 Fitted Dynamic Programming Algorithms","numbering":{"all":{"enabled":true},"enumerator":{"template":"5.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 
2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"fitted_dp.md","url":"/build/fitted_dp-4d73bec315097a872828e6be1c141ef6.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"rtuREipVrU"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"5.1","key":"PS4MhOGIQx"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"We borrow these definitions from the ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"K14kmzeFK0"},{"type":"link","url":"/mdps","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"1 Markov Decision Processes","key":"kFHGE82kkc"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"HmTfdiovEo"},{"type":"text","value":" chapter:","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"mlP4bF8PsW"}],"key":"N50J3ILsIw"}],"key":"PN08ZSFx9R"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from typing import NamedTuple, Callable, Optional\nfrom jaxtyping import Float, Array\nimport jax.numpy as np\nfrom jax import grad, vmap\nimport jax.random as rand\nfrom tqdm import tqdm\nimport gymnasium as gym\n\nkey = rand.PRNGKey(184)\n\n\nclass Transition(NamedTuple):\n s: int\n a: int\n r: float\n\n\nTrajectory = list[Transition]\n\n\ndef get_num_actions(trajectories: list[Trajectory]) -> int:\n \"\"\"Get the number of actions in the dataset. Assumes actions range from 0 to A-1.\"\"\"\n return max(max(t.a for t in τ) for τ in trajectories) + 1\n\n\nState = Float[Array, \"...\"] # arbitrary shape\n\n# assume finite `A` actions and f outputs an array of Q-values\n# i.e. 
Q(s, a, h) is implemented as f(s, h)[a]\nQFunction = Callable[[State, int], Float[Array, \" A\"]]\n\n\ndef Q_zero(A: int) -> QFunction:\n \"\"\"A Q-function that always returns zero.\"\"\"\n return lambda s, a: np.zeros(A)\n\n\n# a deterministic time-dependent policy\nPolicy = Callable[[State, int], int]\n\n\ndef q_to_greedy(Q: QFunction) -> Policy:\n \"\"\"Get the greedy policy for the given state-action value function.\"\"\"\n return lambda s, h: np.argmax(Q(s, h))","visibility":"hide","key":"wClwhTVvBy"},{"type":"output","id":"0TOiDi-xbqLkwRKU3l2SG","data":[],"visibility":"show","key":"hy6DPL6lpx"}],"data":{"tags":[]},"visibility":"show","key":"UddfViUEHh"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":71,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"xp8AwGgf9L"},{"type":"link","url":"/mdps","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"1 Markov Decision Processes","key":"XT4Bwl8wff"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"HC9N1AzAyL"},{"type":"text","value":" chapter discussed the case of ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"vciRFUF32l"},{"type":"strong","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"finite","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"EpPN86ZDrq"}],"key":"TqMTUXdYQd"},{"type":"text","value":" MDPs, where the state and action spaces ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"jniZDmkx4y"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"html":"S\\mathcal{S}S","key":"OWTbKTmqgS"},{"type":"text","value":" and ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"YvvYbVYkfp"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"html":"A\\mathcal{A}A","key":"AyOTaarkIM"},{"type":"text","value":" were finite.\nThis gave us a closed-form expression for computing the r.h.s. 
of ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"wIreZiVevS"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"the Bellman one-step consistency equation","key":"qBqtipmU2a"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"q1qjtWn9Q8"},{"type":"text","value":".\nIn this chapter, we consider the case of ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"mBrLKLOXVp"},{"type":"strong","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"large","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"UXYfqihPXf"}],"key":"UhH575RlxM"},{"type":"text","value":" or ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"rRg4CyrNM2"},{"type":"strong","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"continuous","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"GXN5iTLwjs"}],"key":"wsBrAPBMLh"},{"type":"text","value":" state spaces, where the state space is too large to be enumerated.\nIn this case, we need to ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"t1hGXJTBoz"},{"type":"emphasis","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"ueTvzRHnZ4"}],"key":"j9OCvwsH6u"},{"type":"text","value":" the value function and Q-function using methods from ","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"e6pkwfrCXM"},{"type":"strong","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"leliNHPjR9"}],"key":"eEdRGJpAOW"},{"type":"text","value":".","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"LebRnEn9Mo"}],"key":"nacIwlvAsb"},{"type":"paragraph","position":{"start":{"line":76,"column":1},"end":{"line":78,"column":1}},"children":[{"type":"text","value":"We will first take a quick detour to introduce the ","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"rBzlliJxqL"},{"type":"emphasis","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"empirical risk minimization","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"t8JA7PtDSw"}],"key":"c06nrx6FdF"},{"type":"text","value":" framework for function approximation.\nWe will then see its application to ","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"DrEYFXc1dF"},{"type":"emphasis","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"fitted","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"m88rAN40md"}],"key":"gPOUUU0dAw"},{"type":"text","value":" RL algorithms,\nwhich attempt to learn the optimal value function (and the optimal policy) from a dataset of 
trajectories.","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"L3sygHCwlF"}],"key":"VsmRpkFawn"},{"type":"heading","depth":2,"position":{"start":{"line":81,"column":1},"end":{"line":81,"column":1}},"children":[{"type":"text","value":"Empirical risk minimization","position":{"start":{"line":81,"column":1},"end":{"line":81,"column":1}},"key":"jRDs9BHny0"}],"label":"erm","identifier":"erm","html_id":"erm","enumerator":"5.2","key":"Z1I5o7mpU1"},{"type":"paragraph","position":{"start":{"line":83,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"ioPFy0sBqp"},{"type":"strong","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"Nw6of66nrO"}],"key":"W3MTFcG1of"},{"type":"text","value":" task is as follows:\nWe seek to learn the relationship between some input variables ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"gv0G5aSlOk"},{"type":"inlineMath","value":"x","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"xxx","key":"o7p2o2D9IZ"},{"type":"text","value":" and some output variable ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"ZMXqskgjFx"},{"type":"inlineMath","value":"y","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"yyy","key":"eZJf9xwXeJ"},{"type":"text","value":"\n(drawn from their joint distribution).\nPrecisely, we want to find a function ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"MOSHyMRLIe"},{"type":"inlineMath","value":"\\hat f : x \\mapsto y","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"f^:xy\\hat f : x \\mapsto yf^:xy","key":"dbwzL1MoVl"},{"type":"text","value":" that minimizes the\n","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"evfVJsWVoA"},{"type":"emphasis","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"squared error","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"wzs3ybskO7"}],"key":"cbDzGoUFYN"},{"type":"text","value":" of the prediction:","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"Z9w44dXd2h"}],"key":"hBth4xWfZ4"},{"type":"math","value":"\\hat f = \\arg\\min_{f} \\E[(y - f(x))^2]","position":{"start":{"line":89,"column":1},"end":{"line":91,"column":1}},"html":"f^=argminfE[(yf(x))2]\\hat f = \\arg\\min_{f} \\E[(y - f(x))^2]f^=argfminE[(yf(x))2]","enumerator":"5.1","key":"RiOlebi0G4"},{"type":"paragraph","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"text","value":"An equivalent framing is that we seek to approximate the ","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"Eru7c6fHyS"},{"type":"emphasis","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"text","value":"conditional expectation","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"DUqCWCYzUq"}],"key":"MsNmg2PH1v"},{"type":"text","value":" of 
","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"YzGFOLI88J"},{"type":"inlineMath","value":"y","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"html":"yyy","key":"k7Ov0l9RO5"},{"type":"text","value":" given ","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"bZrMDloTWJ"},{"type":"inlineMath","value":"x","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"html":"xxx","key":"jlSKeqiwLq"},{"type":"text","value":":","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"BvowNk6E5q"}],"key":"tzM7m4NznU"},{"type":"proof","kind":"theorem","label":"conditional_expectation_minimizes_mse","identifier":"conditional_expectation_minimizes_mse","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Conditional expectation minimizes mean squared error","position":{"start":{"line":95,"column":1},"end":{"line":95,"column":1}},"key":"eWSPG0jLYr"}],"key":"viYbz5lHHm"},{"type":"math","value":"\\arg\\min_{f} \\E[(y - f(x))^2] = (x \\mapsto \\E[y \\mid x])","position":{"start":{"line":98,"column":1},"end":{"line":100,"column":1}},"html":"argminfE[(yf(x))2]=(xE[yx])\\arg\\min_{f} \\E[(y - f(x))^2] = (x \\mapsto \\E[y \\mid x])argfminE[(yf(x))2]=(xE[yx])","enumerator":"5.2","key":"GlVJ0CPPkh"}],"enumerator":"5.1","html_id":"conditional-expectation-minimizes-mse","key":"W0WojClXGn"},{"type":"proof","kind":"proof","enumerated":true,"children":[{"type":"paragraph","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"children":[{"type":"text","value":"We can decompose the mean squared error as","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"oftpx93yeh"}],"key":"snZoLWGzs1"},{"type":"math","value":"\\begin{aligned}\n\\E[(y - f(x))^2] &= \\E[ (y - \\E[y \\mid x] + \\E[y \\mid x] - f(x))^2 ] \\\\\n&= \\E[ (y - \\E[y \\mid x])^2 ] + \\E[ (\\E[y \\mid x] - f(x))^2 ] + 2 \\E[ (y - \\E[y \\mid x])(\\E[y \\mid x] - f(x)) ] \\\\\n\\end{aligned}","position":{"start":{"line":106,"column":1},"end":{"line":111,"column":1}},"html":"E[(yf(x))2]=E[(yE[yx]+E[yx]f(x))2]=E[(yE[yx])2]+E[(E[yx]f(x))2]+2E[(yE[yx])(E[yx]f(x))]\\begin{aligned}\n\\E[(y - f(x))^2] &= \\E[ (y - \\E[y \\mid x] + \\E[y \\mid x] - f(x))^2 ] \\\\\n&= \\E[ (y - \\E[y \\mid x])^2 ] + \\E[ (\\E[y \\mid x] - f(x))^2 ] + 2 \\E[ (y - \\E[y \\mid x])(\\E[y \\mid x] - f(x)) ] \\\\\n\\end{aligned}E[(yf(x))2]=E[(yE[yx]+E[yx]f(x))2]=E[(yE[yx])2]+E[(E[yx]f(x))2]+2E[(yE[yx])(E[yx]f(x))]","enumerator":"5.3","key":"LNHVi3ajPS"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"d3lb2wd00G"}],"key":"dWJvcx4JiE"},{"type":"paragraph","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"children":[{"type":"text","value":"Use the law of iterated expectations to show that the last term is zero.","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"zQ4XN0U9Wa"}],"key":"ApDYIjQ2Ek"}],"key":"Pul9zlUTqm"},{"type":"paragraph","position":{"start":{"line":117,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"The first term is the irreducible error, and the second term is the error due to the approximation,\nwhich is minimized at 
","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"pEeZtwZ9HG"},{"type":"text","value":"0","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"QITtarc4v8"},{"type":"text","value":" when ","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"FRh6pcQf23"},{"type":"inlineMath","value":"f(x) = \\E[y \\mid x]","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"html":"f(x)=E[yx]f(x) = \\E[y \\mid x]f(x)=E[yx]","key":"bgpYeoz6cn"},{"type":"text","value":".","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"IaQaor3AMn"}],"key":"Dek7Jz1qOa"}],"enumerator":"5.1","key":"t5BewXuqsA"},{"type":"paragraph","position":{"start":{"line":121,"column":1},"end":{"line":127,"column":1}},"children":[{"type":"text","value":"In most applications, the joint distribution of ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"G1EhaNtZPw"},{"type":"inlineMath","value":"x, y","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"x,yx, yx,y","key":"BTHXBLNOib"},{"type":"text","value":" is unknown or extremely complex, and so we can’t\nanalytically evaluate ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"Q1jRdH7Ah9"},{"type":"inlineMath","value":"\\E [y \\mid x]","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"E[yx]\\E [y \\mid x]E[yx]","key":"ReXtQXPhQD"},{"type":"text","value":".\nInstead, our strategy is to draw ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"tAIEEMCY7V"},{"type":"inlineMath","value":"N","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"NNN","key":"RP9eVUtEMw"},{"type":"text","value":" samples ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"Oxv9jzSEWl"},{"type":"inlineMath","value":"(x_i, y_i)","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"(xi,yi)(x_i, y_i)(xi,yi)","key":"CC9UNJe1uk"},{"type":"text","value":" from the joint distribution of ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"MTig4CAD0X"},{"type":"inlineMath","value":"x","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"xxx","key":"TiV5cX3FdF"},{"type":"text","value":" and ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"pEPW2Y5JKo"},{"type":"inlineMath","value":"y","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"yyy","key":"hHBTgQ1nJW"},{"type":"text","value":",\nand then use the ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"kTx3G72MSI"},{"type":"emphasis","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"children":[{"type":"text","value":"sample average","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"jd6RrpTts0"}],"key":"PE6tcvhhQl"},{"type":"text","value":" ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"f13UhJlgW2"},{"type":"inlineMath","value":"\\sum_{i=1}^N (y_i - f(x_i))^2 / N","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"i=1N(yif(xi))2/N\\sum_{i=1}^N (y_i - f(x_i))^2 / Ni=1N(yif(xi))2/N","key":"m1GDt5UxgI"},{"type":"text","value":" to approximate the mean squared error.\nThen we use a 
","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"sWMuN1hOnE"},{"type":"emphasis","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"children":[{"type":"text","value":"fitting method","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"lHpD2goIi5"}],"key":"sVKFmMAs86"},{"type":"text","value":" to find a function ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"HAoKFeypKN"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"html":"f^\\hat ff^","key":"mKNRmS2JPF"},{"type":"text","value":" that minimizes this objective\nand thus approximates the conditional expectation.\nThis approach is called ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"U1DeJzTOdq"},{"type":"strong","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"children":[{"type":"text","value":"empirical risk minimization","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"vtf5yIg7Bf"}],"key":"taJT4R6tBB"},{"type":"text","value":".","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"aOf9EZ8yqO"}],"key":"fVW44j6nwX"},{"type":"proof","kind":"definition","label":"empirical_risk_minimization","identifier":"empirical_risk_minimization","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Empirical risk minimization","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"oRUnwiVHjX"}],"key":"Zxn1RDqRc5"},{"type":"paragraph","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"Given a dataset of samples ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"CpxB1XY7L5"},{"type":"inlineMath","value":"(x_1, y_1), \\dots, (x_N, y_N)","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"(x1,y1),,(xN,yN)(x_1, y_1), \\dots, (x_N, y_N)(x1,y1),,(xN,yN)","key":"jej1d2EYO3"},{"type":"text","value":", empirical risk minimization seeks to find a function ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"n5iqeISfe9"},{"type":"inlineMath","value":"f","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"fff","key":"GHJtauUtBL"},{"type":"text","value":" (from some class of functions ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"rNMc7cYcPP"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"F\\mathcal{F}F","key":"X3QZ3B7b4i"},{"type":"text","value":") that minimizes the empirical risk:","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"liuJ05ojqi"}],"key":"iVEm6hRN1o"},{"type":"math","value":"\\hat f = \\arg\\min_{f \\in \\mathcal{F}} \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2","position":{"start":{"line":134,"column":1},"end":{"line":136,"column":1}},"html":"f^=argminfF1Ni=1N(yif(xi))2\\hat f = \\arg\\min_{f \\in \\mathcal{F}} \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2f^=argfFminN1i=1N(yif(xi))2","enumerator":"5.4","key":"z8K9SCKj7o"},{"type":"paragraph","position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"children":[{"type":"text","value":"We will cover the details of the minimization process in [](#the next section 
).","position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"key":"qb24ccaPyZ"}],"key":"vZfWwfoHf1"}],"enumerator":"5.1","html_id":"empirical-risk-minimization","key":"xdQvZR4il7"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"iZwJ7rwCVJ"}],"key":"MstNyhfdD6"},{"type":"paragraph","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"children":[{"type":"text","value":"Why is it important that we constrain our search to a class of functions ","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"key":"te3DYqQgoM"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"html":"F\\mathcal{F}F","key":"zq6OJm1ZNX"},{"type":"text","value":"?","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"key":"DTjFNUxKiH"}],"key":"DHCGIGmx4C"},{"type":"paragraph","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Hint: Consider the function ","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"YEnjyMYgBg"},{"type":"inlineMath","value":"f(x) = \\sum_{i=1}^N y_i \\mathbb{1}_{\\{ x = x_i \\}}","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"html":"f(x)=i=1Nyi1{x=xi}f(x) = \\sum_{i=1}^N y_i \\mathbb{1}_{\\{ x = x_i \\}}f(x)=i=1Nyi1{x=xi}","key":"NEsgDxnX2C"},{"type":"text","value":". What is the empirical risk of this function? Would you consider it a good approximation of the conditional expectation?","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"JPvTALFzvF"}],"key":"txGlqawVFu"}],"key":"OLCmJSQ0g7"},{"type":"heading","depth":2,"position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"Fitted value iteration","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"vgBhOqnT4z"}],"identifier":"fitted-value-iteration","label":"Fitted value iteration","html_id":"fitted-value-iteration","implicit":true,"enumerator":"5.3","key":"iRsHZTTzDA"},{"type":"paragraph","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"Let us apply ERM to the RL problem of computing the optimal policy / value function.","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"yOShVlMWIC"}],"key":"dV3PmUcs2H"},{"type":"paragraph","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"How did we compute the optimal value function in MDPs with ","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"zhoysFYHvA"},{"type":"emphasis","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"finite","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"YJumWNrxkm"}],"key":"nKMgJsCXM4"},{"type":"text","value":" state and action 
spaces?","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"NvtpKjyDQU"}],"key":"Qx19JdsBbn"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":153,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":153,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"text","value":"In a [](#finite-horizon MDP ), we can use ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"yjeiu2tnCD"},{"type":"crossReference","kind":"proof:definition","identifier":"pi_star_dp","label":"pi_star_dp","children":[{"type":"text","value":"dynamic programming","key":"F31Rj4zide"}],"template":"Definition %s","enumerator":"1.11","resolved":true,"html_id":"pi-star-dp","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"HmbeJPfhIN"},{"type":"text","value":", working backwards from the end of the time horizon, to compute the optimal value function exactly.","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"RUvtUvLp65"}],"key":"XO5K4DT4Cz"}],"key":"xsWMp2pRI7"},{"type":"listItem","spread":true,"position":{"start":{"line":155,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"text","value":"In an [](#infinite-horizon MDP ), we can use [](#value iteration ), which iterates the Bellman optimality operator ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"TFvsuIispZ"},{"type":"crossReference","kind":"equation","identifier":"bellman_optimality_operator","label":"bellman_optimality_operator","children":[{"type":"text","value":"(","key":"F4Mp5XiNBO"},{"type":"text","value":"1.54","key":"uk3xJvSbMo"},{"type":"text","value":")","key":"sixZOjQ7eC"}],"template":"(%s)","enumerator":"1.54","resolved":true,"html_id":"bellman-optimality-operator","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"da93Zlhqoz"},{"type":"text","value":" to approximately compute the optimal value function.","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"zynN4GFxIo"}],"key":"SVRD0UzICE"}],"key":"VnQJeahQBJ"}],"key":"cRYxEIJYfK"},{"type":"paragraph","position":{"start":{"line":157,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"Our existing approaches represent the value function, and the MDP itself,\nin matrix notation.\nBut what happens if the state space is extremely large, or even infinite (e.g. 
real-valued)?\nThen computing a weighted sum over all possible next states, which is required to compute the Bellman operator,\nbecomes intractable.","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"key":"u1UVssqQZk"}],"key":"vnlUIhHYHV"},{"type":"paragraph","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"Instead, we will need to use ","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"sQ5QTF65Gp"},{"type":"emphasis","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"function approximation","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"TkISksaZBB"}],"key":"KH1kp8emJU"},{"type":"text","value":" methods from supervised learning to solve for the value function in an alternative way.","position":{"start":{"line":163,"column":1},"end":{"line":163,"column":1}},"key":"Swu1v7DbeI"}],"key":"O2BsuRYUJC"},{"type":"paragraph","position":{"start":{"line":165,"column":1},"end":{"line":166,"column":1}},"children":[{"type":"text","value":"In particular, suppose we have a dataset of ","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"djS94nGSTI"},{"type":"inlineMath","value":"N","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"html":"NNN","key":"b4qH8lJjnk"},{"type":"text","value":" trajectories ","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"vmRUxMSJFN"},{"type":"inlineMath","value":"\\tau_1, \\dots, \\tau_N \\sim \\rho_{\\pi}","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"html":"τ1,,τNρπ\\tau_1, \\dots, \\tau_N \\sim \\rho_{\\pi}τ1,,τNρπ","key":"w1vfK5mGvx"},{"type":"text","value":" from some policy ","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"nmjc518Ggn"},{"type":"text","value":"π","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"lPZHpqBLwJ"},{"type":"text","value":" (called the ","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"ldr48RtacQ"},{"type":"strong","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"children":[{"type":"text","value":"data collection policy","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"StnpnM212x"}],"key":"yj0v4PBYHC"},{"type":"text","value":") acting in the MDP of interest.\nLet us indicate the trajectory index in the superscript, so that","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"pZzHXdAsem"}],"key":"EiN1pMzccg"},{"type":"math","value":"\\tau_i = \\{ s_0^i, a_0^i, r_0^i, s_1^i, a_1^i, r_1^i, \\dots, s_{\\hor-1}^i, a_{\\hor-1}^i, r_{\\hor-1}^i \\}.","position":{"start":{"line":168,"column":1},"end":{"line":170,"column":1}},"html":"τi={s0i,a0i,r0i,s1i,a1i,r1i,,sH1i,aH1i,rH1i}.\\tau_i = \\{ s_0^i, a_0^i, r_0^i, s_1^i, a_1^i, r_1^i, \\dots, s_{\\hor-1}^i, a_{\\hor-1}^i, r_{\\hor-1}^i \\}.τi={s0i,a0i,r0i,s1i,a1i,r1i,,sH1i,aH1i,rH1i}.","enumerator":"5.5","key":"ANGsh918vk"}],"key":"eYtjzebttH"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def collect_data(\n env: gym.Env, N: int, H: int, key: rand.PRNGKey, π: Optional[Policy] = None\n) -> list[Trajectory]:\n \"\"\"Collect a dataset of trajectories from the given policy (or a random one).\"\"\"\n trajectories = []\n seeds = 
[rand.bits(k).item() for k in rand.split(key, N)]\n for i in tqdm(range(N)):\n τ = []\n s, _ = env.reset(seed=seeds[i])\n for h in range(H):\n # sample from a random policy\n a = π(s, h) if π else env.action_space.sample()\n s_next, r, terminated, truncated, _ = env.step(a)\n τ.append(Transition(s, a, r))\n if terminated or truncated:\n break\n s = s_next\n trajectories.append(τ)\n return trajectories","key":"Fj4LuBMoae"},{"type":"output","id":"Itu7Mt4a4D4tNob3fsD_m","data":[],"key":"zbfJZgXfoq"}],"data":{},"key":"sX3kdrLDEi"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"env = gym.make(\"LunarLander-v2\")\ntrajectories = collect_data(env, 100, 300, key)\ntrajectories[0][:5] # show first five transitions from first trajectory","key":"YvdcJbmrJK"},{"type":"output","id":"L_gfSWmGuZkTAhNhnPxnB","data":[{"output_type":"stream","name":"stderr","text":"/Users/adzcai/micromamba/envs/rlbook/lib/python3.11/site-packages/gymnasium/envs/registration.py:517: DeprecationWarning: \u001b[33mWARN: The environment LunarLander-v2 is out of date. You should consider upgrading to version `v3`.\u001b[0m\n logger.deprecation(\n"},{"output_type":"error","traceback":"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[0;31mDeprecatedEnv\u001b[0m Traceback (most recent call last)\nCell \u001b[0;32mIn[3], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m env \u001b[38;5;241m=\u001b[39m \u001b[43mgym\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmake\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mLunarLander-v2\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m trajectories \u001b[38;5;241m=\u001b[39m collect_data(env, \u001b[38;5;241m100\u001b[39m, \u001b[38;5;241m300\u001b[39m, key)\n\u001b[1;32m 3\u001b[0m trajectories[\u001b[38;5;241m0\u001b[39m][:\u001b[38;5;241m5\u001b[39m] \u001b[38;5;66;03m# show first five transitions from first trajectory\u001b[39;00m\n\nFile \u001b[0;32m~/micromamba/envs/rlbook/lib/python3.11/site-packages/gymnasium/envs/registration.py:687\u001b[0m, in \u001b[0;36mmake\u001b[0;34m(id, max_episode_steps, disable_env_checker, **kwargs)\u001b[0m\n\u001b[1;32m 684\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(\u001b[38;5;28mid\u001b[39m, \u001b[38;5;28mstr\u001b[39m)\n\u001b[1;32m 686\u001b[0m \u001b[38;5;66;03m# The environment name can include an unloaded module in \"module:env_name\" style\u001b[39;00m\n\u001b[0;32m--> 687\u001b[0m env_spec \u001b[38;5;241m=\u001b[39m \u001b[43m_find_spec\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mid\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 689\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(env_spec, EnvSpec)\n\u001b[1;32m 691\u001b[0m \u001b[38;5;66;03m# Update the env spec kwargs with the `make` kwargs\u001b[39;00m\n\nFile \u001b[0;32m~/micromamba/envs/rlbook/lib/python3.11/site-packages/gymnasium/envs/registration.py:531\u001b[0m, in \u001b[0;36m_find_spec\u001b[0;34m(env_id)\u001b[0m\n\u001b[1;32m 525\u001b[0m logger\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[1;32m 526\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUsing the latest versioned environment `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnew_env_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m` \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 527\u001b[0m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minstead of the unversioned environment `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00menv_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m`.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 528\u001b[0m )\n\u001b[1;32m 530\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m env_spec \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 531\u001b[0m \u001b[43m_check_version_exists\u001b[49m\u001b[43m(\u001b[49m\u001b[43mns\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mname\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mversion\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 532\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error\u001b[38;5;241m.\u001b[39mError(\n\u001b[1;32m 533\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo registered env with id: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00menv_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m. Did you register it, or import the package that registers it? Use `gymnasium.pprint_registry()` to see all of the registered environments.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 534\u001b[0m )\n\u001b[1;32m 536\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m env_spec\n\nFile \u001b[0;32m~/micromamba/envs/rlbook/lib/python3.11/site-packages/gymnasium/envs/registration.py:431\u001b[0m, in \u001b[0;36m_check_version_exists\u001b[0;34m(ns, name, version)\u001b[0m\n\u001b[1;32m 428\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error\u001b[38;5;241m.\u001b[39mVersionNotFound(message)\n\u001b[1;32m 430\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m latest_spec \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m version \u001b[38;5;241m<\u001b[39m latest_spec\u001b[38;5;241m.\u001b[39mversion:\n\u001b[0;32m--> 431\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error\u001b[38;5;241m.\u001b[39mDeprecatedEnv(\n\u001b[1;32m 432\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mEnvironment version v\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mversion\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m for `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mget_env_id(ns,\u001b[38;5;250m \u001b[39mname,\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m` is deprecated. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 433\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPlease use `\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mlatest_spec\u001b[38;5;241m.\u001b[39mid\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m` instead.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 434\u001b[0m )\n\n\u001b[0;31mDeprecatedEnv\u001b[0m: Environment version v2 for `LunarLander` is deprecated. Please use `LunarLander-v3` instead.","ename":"DeprecatedEnv","evalue":"Environment version v2 for `LunarLander` is deprecated. Please use `LunarLander-v3` instead."}],"key":"PwRfLWynjD"}],"data":{},"key":"Br8pVKUx9T"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":200,"column":1},"end":{"line":202,"column":1}},"children":[{"type":"text","value":"Can we view the dataset of trajectories as a “labelled dataset” in order to apply supervised learning to approximate the optimal Q-function? 
Yes!\nRecall that we can characterize the optimal Q-function using the ","position":{"start":{"line":200,"column":1},"end":{"line":200,"column":1}},"key":"d3VYkKHKm8"},{"type":"crossReference","kind":"proof:corollary","identifier":"bellman_consistency_optimal","label":"bellman_consistency_optimal","children":[{"type":"text","value":"Bellman optimality equations","key":"PiHn9DOpis"}],"template":"Corollary %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency-optimal","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"Rq8aRj4sdt"},{"type":"text","value":",\nwhich don’t depend on an actual policy:","position":{"start":{"line":200,"column":1},"end":{"line":200,"column":1}},"key":"FjjaAgCG88"}],"key":"uClM0nn6vY"},{"type":"math","value":"Q_\\hi^\\star(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [\\max_{a'} Q_{\\hi+1}^\\star(s', a')]","position":{"start":{"line":204,"column":1},"end":{"line":206,"column":1}},"html":"Qh(s,a)=r(s,a)+EsP(s,a)[maxaQh+1(s,a)]Q_\\hi^\\star(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [\\max_{a'} Q_{\\hi+1}^\\star(s', a')]Qh(s,a)=r(s,a)+EsP(s,a)[amaxQh+1(s,a)]","enumerator":"5.6","key":"HDvKJ1bkMZ"},{"type":"paragraph","position":{"start":{"line":208,"column":1},"end":{"line":209,"column":1}},"children":[{"type":"text","value":"We can think of the arguments to the Q-function -- i.e. the current state, action, and timestep ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"ujWxU0xyq0"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"html":"h\\hih","key":"w9vdb5cRbm"},{"type":"text","value":" --\nas the inputs ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"SGHU7di8qs"},{"type":"inlineMath","value":"x","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"html":"xxx","key":"rMuOCdq1Af"},{"type":"text","value":", and the r.h.s. of the above equation as the label ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"ibtLX8Idhn"},{"type":"inlineMath","value":"f(x)","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"html":"f(x)f(x)f(x)","key":"k6KMwuQgto"},{"type":"text","value":". Note that the r.h.s. 
can also be expressed as a ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"e4uC5fM4KK"},{"type":"strong","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"children":[{"type":"text","value":"conditional expectation","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"ytt1xlqYo3"}],"key":"HbbCXMp6sD"},{"type":"text","value":":","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"PRpnIlJwVO"}],"key":"uNmgivPf8k"},{"type":"math","value":"f(x) = \\E [y \\mid x] \\quad \\text{where} \\quad y = r(s_\\hi, a_\\hi) + \\max_{a'} Q^\\star_{\\hi + 1}(s', a').","position":{"start":{"line":211,"column":1},"end":{"line":213,"column":1}},"html":"f(x)=E[yx]wherey=r(sh,ah)+maxaQh+1(s,a).f(x) = \\E [y \\mid x] \\quad \\text{where} \\quad y = r(s_\\hi, a_\\hi) + \\max_{a'} Q^\\star_{\\hi + 1}(s', a').f(x)=E[yx]wherey=r(sh,ah)+amaxQh+1(s,a).","enumerator":"5.7","key":"TfySuQXeHK"},{"type":"paragraph","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"children":[{"type":"text","value":"Approximating the conditional expectation is precisely the task that ","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"key":"VclAw3WXEE"},{"type":"crossReference","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"children":[{"type":"text","value":"Section ","key":"CSWTuwEJbg"},{"type":"text","value":"5.2","key":"KF4Sqd9Msi"}],"identifier":"erm","label":"erm","kind":"heading","template":"Section %s","enumerator":"5.2","resolved":true,"html_id":"erm","key":"FV19eswW4f"},{"type":"text","value":" is suited for!","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"key":"fmLNCFwkC2"}],"key":"xKyipWsJJa"},{"type":"paragraph","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"children":[{"type":"text","value":"Our above dataset would give us ","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"qrTfT9IDWS"},{"type":"inlineMath","value":"N \\cdot \\hor","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"html":"NHN \\cdot \\horNH","key":"mnnWi1eTuA"},{"type":"text","value":" samples in the dataset:","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"v7O7XT6DzG"}],"key":"Rk7u6opos1"},{"type":"math","value":"x_{i \\hi} = (s_\\hi^i, a_\\hi^i, \\hi) \\qquad y_{i \\hi} = r(s_\\hi^i, a_\\hi^i) + \\max_{a'} Q^\\star_{\\hi + 1}(s_{\\hi + 1}^i, a')","position":{"start":{"line":219,"column":1},"end":{"line":221,"column":1}},"html":"xih=(shi,ahi,h)yih=r(shi,ahi)+maxaQh+1(sh+1i,a)x_{i \\hi} = (s_\\hi^i, a_\\hi^i, \\hi) \\qquad y_{i \\hi} = r(s_\\hi^i, a_\\hi^i) + \\max_{a'} Q^\\star_{\\hi + 1}(s_{\\hi + 1}^i, a')xih=(shi,ahi,h)yih=r(shi,ahi)+amaxQh+1(sh+1i,a)","enumerator":"5.8","key":"Yy63cdOxIj"}],"key":"WkXlsa3M7z"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def get_X(trajectories: list[Trajectory]):\n \"\"\"\n We pass the state and timestep as input to the Q-function\n and return an array of Q-values.\n \"\"\"\n rows = [(τ[h].s, τ[h].a, h) for τ in trajectories for h in range(len(τ))]\n return [np.stack(ary) for ary in zip(*rows)]\n\n\ndef get_y(\n trajectories: list[Trajectory],\n f: Optional[QFunction] = None,\n π: Optional[Policy] = None,\n):\n \"\"\"\n Transform the dataset of trajectories into a dataset for supervised learning.\n If `π` is None, 
instead estimates the optimal Q function.\n Otherwise, estimates the Q function of π.\n \"\"\"\n f = f or Q_zero(get_num_actions(trajectories))\n y = []\n for τ in trajectories:\n for h in range(len(τ) - 1):\n s, a, r = τ[h]\n Q_values = f(s, h + 1)\n y.append(r + (Q_values[π(s, h + 1)] if π else Q_values.max()))\n y.append(τ[-1].r)\n return np.array(y)","key":"rji1QmZj9s"},{"type":"output","id":"LXqdHl0hnWAqD93eRLQkh","data":[],"key":"gHZW5YxWCE"}],"data":{},"key":"CldSEBYluI"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"s, a, h = get_X(trajectories[:1])\nprint(\"states:\", s[:5])\nprint(\"actions:\", a[:5])\nprint(\"timesteps:\", h[:5])","key":"sK8wQRhjHG"},{"type":"output","id":"w2zjDU64npag1vPf6KQnT","data":[],"key":"XuUzsmTCej"}],"data":{},"key":"eGEXBj2us4"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"get_y(trajectories[:1])[:5]","key":"Yi1g6vUixV"},{"type":"output","id":"EY_v1CINaxl9oq6_ha699","data":[],"key":"ZJAYPLBnsV"}],"data":{},"key":"YK6c11LyPV"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"Then we can use empirical risk minimization to find a function ","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"DaRez4DrUK"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"html":"f^\\hat ff^","key":"RG9pCfum3E"},{"type":"text","value":" that approximates the optimal Q-function.","position":{"start":{"line":265,"column":1},"end":{"line":265,"column":1}},"key":"zyQxprtVjK"}],"key":"aAE7VQPePz"}],"key":"hoJ1IEdEkf"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"# We will see some examples of fitting methods in the next section\nFittingMethod = Callable[[Float[Array, \"N D\"], Float[Array, \" N\"]], QFunction]","key":"YM919I0yRW"},{"type":"output","id":"ytFrsisvJ8L533C8LOuPO","data":[],"key":"WisoMWN7WT"}],"data":{},"key":"o4MQB7sTTd"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":272,"column":1},"end":{"line":276,"column":1}},"children":[{"type":"text","value":"But notice that the definition of ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"pU20LEBKeR"},{"type":"inlineMath","value":"y_{i \\hi}","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"html":"yihy_{i \\hi}yih","key":"JKAmMVrlC6"},{"type":"text","value":" depends on the Q-function itself!\nHow can we resolve this circular dependency?\nRecall that we faced the same issue ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"rbcpAj4Nto"},{"type":"crossReference","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"children":[{"type":"text","value":"when evaluating a policy in an infinite-horizon MDP","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"c1Z9WRi7Fa"}],"identifier":"iterative_pe","label":"iterative_pe","kind":"heading","template":"Section %s","enumerator":"1.5.2.2","resolved":true,"html_id":"iterative-pe","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"cr5OspHqyq"},{"type":"text","value":". 
There, we iterated the ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"JvwM0lEU4Y"},{"type":"crossReference","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"children":[{"type":"text","value":"Definition ","key":"ElDPq5fbIB"},{"type":"text","value":"1.8","key":"ny3qJhZUoe"}],"identifier":"bellman_operator","label":"bellman_operator","kind":"proof:definition","template":"Definition %s","enumerator":"1.8","resolved":true,"html_id":"bellman-operator","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"vSh4fcQ0Id"},{"type":"text","value":" since we knew that the policy’s value function was a fixed point of the policy’s Bellman operator.\nWe can apply the same strategy here, using the ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"hH71He3SLY"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"html":"f^\\hat ff^","key":"opjTh8F7zW"},{"type":"text","value":" from the previous iteration to compute the labels ","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"jq8jeWxHu0"},{"type":"inlineMath","value":"y_{i \\hi}","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"html":"yihy_{i \\hi}yih","key":"lV5GFniZRT"},{"type":"text","value":",\nand then using this new dataset to fit the next iterate.","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"n0VO74x807"}],"key":"dwjaRb3oBs"},{"type":"proof","kind":"definition","label":"fitted_q_iteration","identifier":"fitted_q_iteration","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Fitted Q-function iteration","position":{"start":{"line":278,"column":1},"end":{"line":278,"column":1}},"key":"B3z4fvr0NO"}],"key":"EoyqhtQR4a"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":281,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"children":[{"type":"text","value":"Initialize some function ","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"su4aKMfLtc"},{"type":"inlineMath","value":"\\hat f(s, a, h) \\in \\mathbb{R}","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"html":"f^(s,a,h)R\\hat f(s, a, h) \\in \\mathbb{R}f^(s,a,h)R","key":"ewd0xb3SfZ"},{"type":"text","value":".","position":{"start":{"line":281,"column":1},"end":{"line":281,"column":1}},"key":"H7gKx4MN4U"}],"key":"jwVgAwF1xF"},{"type":"listItem","spread":true,"position":{"start":{"line":282,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"text","value":"Iterate the following:","position":{"start":{"line":282,"column":1},"end":{"line":282,"column":1}},"key":"kwTPbMXD1Q"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":283,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"text","value":"Generate a supervised learning dataset ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"ynEwjl29BD"},{"type":"inlineMath","value":"X, 
y","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"html":"X,yX, yX,y","key":"bTWn57lBFq"},{"type":"text","value":" from the trajectories and the current estimate ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"aDwxkfk8fP"},{"type":"inlineMath","value":"f","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"html":"fff","key":"h3H5GyzgIS"},{"type":"text","value":", where the labels come from the r.h.s. of the Bellman optimality operator ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"yJyeqHhnbp"},{"type":"crossReference","kind":"equation","identifier":"bellman_optimality_operator","label":"bellman_optimality_operator","children":[{"type":"text","value":"(","key":"Nr1V65PzHb"},{"type":"text","value":"1.54","key":"YV9f7EJTNX"},{"type":"text","value":")","key":"SQTwvb6V1s"}],"template":"(%s)","enumerator":"1.54","resolved":true,"html_id":"bellman-optimality-operator","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"fvWVPFotgv"}],"key":"IMxYTg8DnC"}],"key":"eEcJkIWlrd"},{"type":"listItem","spread":true,"position":{"start":{"line":284,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"MmDe7bZarY"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"f^\\hat ff^","key":"GKsydjN7mQ"},{"type":"text","value":" to the function that minimizes the empirical risk:","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"ybjyjzAms7"}],"key":"L5QcoaznG6"},{"type":"math","value":"\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2.","position":{"start":{"line":286,"column":1},"end":{"line":286,"column":1}},"html":"f^argminf1Ni=1N(yif(xi))2.\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2.f^argfminN1i=1N(yif(xi))2.","enumerator":"5.9","key":"XRk8kp7jj4"}],"key":"t5BlaGtL1x"}],"key":"eAPT8bpiPT"}],"key":"UV2LaKac4B"}],"key":"mKJtIrtRx5"}],"enumerator":"5.2","html_id":"fitted-q-iteration","key":"JaxKITpFVX"}],"key":"ru1dmowhzO"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fitted_q_iteration(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n epochs: int,\n Q_init: Optional[QFunction] = None,\n) -> QFunction:\n \"\"\"\n Run fitted Q-function iteration using the given dataset.\n Returns an estimate of the optimal Q-function.\n \"\"\"\n Q_hat = Q_init or Q_zero(get_num_actions(trajectories))\n X = get_X(trajectories)\n for _ in range(epochs):\n y = get_y(trajectories, Q_hat)\n Q_hat = fit(X, y)\n return Q_hat","key":"B1e2LFpj8K"},{"type":"output","id":"eSrp0VB6Gt7ZxWo7S7Zte","data":[],"key":"Ji496FSaYB"}],"data":{},"key":"xGz78AQWOu"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":309,"column":1},"end":{"line":309,"column":1}},"children":[{"type":"text","value":"Fitted policy 
evaluation","position":{"start":{"line":309,"column":1},"end":{"line":309,"column":1}},"key":"aT4tfXrKgj"}],"label":"fitted-pi-eval","identifier":"fitted-pi-eval","html_id":"fitted-pi-eval","enumerator":"5.4","key":"Q9BMaawOvu"},{"type":"paragraph","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"children":[{"type":"text","value":"We can also use this fixed-point interation to ","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"KUtOiSYbwa"},{"type":"emphasis","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"children":[{"type":"text","value":"evaluate","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"fAkHcz3oOV"}],"key":"XAhZdXzCVf"},{"type":"text","value":" a policy using the dataset (not necessarily the one used to generate the trajectories):","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"ZDiUY2TuRl"}],"key":"JWLQSxfZjB"},{"type":"proof","kind":"definition","label":"fitted_evaluation","identifier":"fitted_evaluation","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Fitted policy evaluation","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"key":"baOIIGRQXp"}],"key":"MgdoqaQbtM"},{"type":"paragraph","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"children":[{"type":"strong","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"children":[{"type":"text","value":"Input:","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"PwQgvvdsnQ"}],"key":"K2AmDgaEQs"},{"type":"text","value":" Policy ","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"FcD2LBAJbk"},{"type":"inlineMath","value":"\\pi : \\mathcal{S} \\times [H] \\to \\Delta(\\mathcal{A})","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"html":"π:S×[H]Δ(A)\\pi : \\mathcal{S} \\times [H] \\to \\Delta(\\mathcal{A})π:S×[H]Δ(A)","key":"V8qpNMj8jJ"},{"type":"text","value":" to be evaluated.","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"e98rfhbObq"}],"key":"TCZKp2xqJy"},{"type":"paragraph","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"children":[{"type":"strong","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"children":[{"type":"text","value":"Output:","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"BpphVvmxQo"}],"key":"NjMVAdXEgb"},{"type":"text","value":" An approximation of the value function ","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"dls9U5G9N9"},{"type":"inlineMath","value":"Q^\\pi","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"html":"QπQ^\\piQπ","key":"EEkVF4iCl7"},{"type":"text","value":" of the policy.","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"EOzdH1jIba"}],"key":"JccFGJJMAv"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":320,"column":1},"end":{"line":325,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"children":[{"type":"text","value":"Initialize some function ","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"QRXNH1jRno"},{"type":"inlineMath","value":"\\hat f(s, a, h) \\in 
\\mathbb{R}","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"html":"f^(s,a,h)R\\hat f(s, a, h) \\in \\mathbb{R}f^(s,a,h)R","key":"eAXNbWts35"},{"type":"text","value":".","position":{"start":{"line":320,"column":1},"end":{"line":320,"column":1}},"key":"J7xAfZAHIt"}],"key":"ep1QCcmnT2"},{"type":"listItem","spread":true,"position":{"start":{"line":321,"column":1},"end":{"line":325,"column":1}},"children":[{"type":"text","value":"Iterate the following:","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"key":"bq1YJduG78"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":322,"column":1},"end":{"line":325,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"text","value":"Generate a supervised learning dataset ","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"key":"lo1EHYulR0"},{"type":"inlineMath","value":"X, y","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"html":"X,yX, yX,y","key":"FO9Y1nwR3c"},{"type":"text","value":" from the trajectories and the current estimate ","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"key":"jkfV2EtSns"},{"type":"inlineMath","value":"f","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"html":"fff","key":"cuarrbDd0a"},{"type":"text","value":", where the labels come from the r.h.s. of the ","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"key":"KkP2rFrQ8G"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Bellman consistency equation","key":"X6M2qbBsHg"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"GBr2DmK0V6"},{"type":"text","value":" for the given policy.","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"key":"q3UiUScLLN"}],"key":"D2CINoCw17"}],"key":"mAarJw8sWs"},{"type":"listItem","spread":true,"position":{"start":{"line":323,"column":1},"end":{"line":325,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":323,"column":1},"end":{"line":323,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":323,"column":1},"end":{"line":323,"column":1}},"key":"OAf8CQo9Y1"},{"type":"inlineMath","value":"\\hat f","position":{"start":{"line":323,"column":1},"end":{"line":323,"column":1}},"html":"f^\\hat ff^","key":"pEEj3uSwFj"},{"type":"text","value":" to the function that minimizes the empirical risk:","position":{"start":{"line":323,"column":1},"end":{"line":323,"column":1}},"key":"yOFK0BFN9P"}],"key":"zaVV5dnTGB"},{"type":"math","value":"\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2.","position":{"start":{"line":325,"column":1},"end":{"line":325,"column":1}},"html":"f^argminf1Ni=1N(yif(xi))2.\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - 
f(x_i))^2.f^argfminN1i=1N(yif(xi))2.","enumerator":"5.10","key":"nFXWSRp6t5"}],"key":"KAoHw8GaSP"}],"key":"lqZir8ACfe"}],"key":"wjmn8m9SDH"}],"key":"HzvSRJU7vl"}],"enumerator":"5.3","html_id":"fitted-evaluation","key":"VM80BbpQsK"}],"key":"osJUmJ9ar1"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fitted_evaluation(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n π: Policy,\n epochs: int,\n Q_init: Optional[QFunction] = None,\n) -> QFunction:\n \"\"\"\n Run fitted policy evaluation using the given dataset.\n Returns an estimate of the Q-function of the given policy.\n \"\"\"\n Q_hat = Q_init or Q_zero(get_num_actions(trajectories))\n X = get_X(trajectories)\n for _ in tqdm(range(epochs)):\n y = get_y(trajectories, Q_hat, π)\n Q_hat = fit(X, y)\n return Q_hat","key":"BInrSpasq8"},{"type":"output","id":"bVo-iBA6l8jiwFuKzs8qm","data":[],"key":"muTGv7ZTgL"}],"data":{},"key":"fJKbpC8LMk"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"tdaVI6P5sH"}],"key":"wyz42TffZu"},{"type":"paragraph","position":{"start":{"line":349,"column":1},"end":{"line":350,"column":1}},"children":[{"type":"text","value":"Spot the difference between ","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"H4Oxn52FRC"},{"type":"inlineCode","value":"fitted_evaluation","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"xqZuMGODbb"},{"type":"text","value":" and ","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"kGtgZM9O3z"},{"type":"inlineCode","value":"fitted_q_iteration","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"dt4PQy0WbL"},{"type":"text","value":". (See the definition of ","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"Ls5pUNK380"},{"type":"inlineCode","value":"get_y","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"coNQ0TQ0KG"},{"type":"text","value":".)\nHow would you modify this algorithm to evaluate the data collection policy?","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"key":"SgKBqqqO3Z"}],"key":"hPNUOSmD6S"}],"key":"Jovm050gND"},{"type":"heading","depth":2,"position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"children":[{"type":"text","value":"Fitted policy iteration","position":{"start":{"line":353,"column":1},"end":{"line":353,"column":1}},"key":"Ko1YkazA8Q"}],"identifier":"fitted-policy-iteration","label":"Fitted policy iteration","html_id":"fitted-policy-iteration","implicit":true,"enumerator":"5.5","key":"YhJeRZNEVh"},{"type":"paragraph","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"children":[{"type":"text","value":"We can use this policy evaluation algorithm to adapt the [](#policy iteration algorithm ) to this new setting. The algorithm remains exactly the same -- repeatedly make the policy greedy w.r.t. its own value function -- except now we must evaluate the policy (i.e. 
compute its value function) using the iterative ","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"ecSZyhnpG2"},{"type":"inlineCode","value":"fitted_evaluation","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"aJQLCwBPUm"},{"type":"text","value":" algorithm.","position":{"start":{"line":355,"column":1},"end":{"line":355,"column":1}},"key":"UQFG17QzES"}],"key":"KCamzHYg3j"}],"key":"SdkdKUGPxR"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fitted_policy_iteration(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n epochs: int,\n evaluation_epochs: int,\n π_init: Optional[Policy] = lambda s, h: 0, # constant zero policy\n):\n \"\"\"Run fitted policy iteration using the given dataset.\"\"\"\n π = π_init\n for _ in range(epochs):\n Q_hat = fitted_evaluation(trajectories, fit, π, evaluation_epochs)\n π = q_to_greedy(Q_hat)\n return π","key":"nkb4US5nAC"},{"type":"output","id":"44bzsNRFqw8htir0O16oQ","data":[],"key":"lOxfyHFUIG"}],"data":{},"key":"Uh498TpgCS"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":373,"column":1},"end":{"line":373,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":373,"column":1},"end":{"line":373,"column":1}},"key":"YDPXOj71CO"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"5.6","key":"Aq9kGkO0vT"}],"key":"Tmga8TycUd"}],"key":"L7gpemphXW"},"references":{"cite":{"order":[],"data":{}}},"footer":{"navigation":{"prev":{"title":"4 Supervised learning","url":"/supervised-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"6 Policy Gradient Methods","url":"/pg","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/imitation-learning.html b/imitation-learning.html index 721b45d..45a8003 100644 --- a/imitation-learning.html +++ b/imitation-learning.html @@ -1,4 +1,4 @@ -7 Imitation Learning - CS/STAT 184: Introduction to Reinforcement Learning

7 Imitation Learning

7.1 Introduction

Imagine you are tasked with learning how to drive. How do, or did, you go about it?

7 Imitation Learning

7.1 Introduction

Imagine you are tasked with learning how to drive. How do, or did, you go about it? At first, this task might seem insurmountable: there are a vast array of controls, and the cost of making a single mistake could be extremely high, making it hard to explore by trial and error. Luckily, there are already people in the world who know how to drive who can get you started. -In this and many other examples, we all “stand on the shoulders of giants” and learn skills from experts who have already mastered them.

Now in machine learning, much of the time, we are trying to teach machines to accomplish tasks that us humans are already proficient at. +In almost every challenge we face, +we “stand on the shoulders of giants” and learn skills from experts who have already mastered them.

a robot imitating the pose of a young child (Photo by Pavel Danilyuk: https://www.pexels.com/photo/a-robot-imitating-a-girl-s-movement-8294811/)

Now in machine learning, +we are often trying to teach machines to accomplish tasks that humans are already proficient at. In such cases, the machine learning algorithm is the one learning the new skill, and humans are the “experts” that can demonstrate how to perform the task. -Imitation learning is a direct application of this idea to machine learning for interactive tasks. +Imitation learning is a strategy for getting the learner to perform at least as well as the expert. We’ll see that the most naive form of imitation learning, called behavioral cloning, is really an application of supervised learning to interactive tasks. -We’ll then explore dataset aggregation (DAgger) as a way to query an expert and learn even more effectively.

7.2 Behavioral cloning

This notion of “learning from human-provided data” may remind you of the basic premise of 4 Supervised learning, -in which there is some mapping from inputs to outputs that us humans can implicitly compute, such as seeing a photo and being able to recognize its constituents. -To teach a machine to calculate this mapping, we first collect a large training dataset by getting people to label a lot of inputs, -and then use some optimization algorithm to produce a predictor that maps from the inputs to the outputs as closely as possible. -How does this relate to interactive tasks? -Here, the input is the observation seen by the agent and the output is the action it selects, so the mapping is the agent’s policy. -What’s stopping us from applying supervised learning techniques? -In practice, nothing! This is called behavioral cloning.

Typically, this second task can be framed as empirical loss minimization:

\tilde \pi = \arg\min_{\pi \in \Pi} \sum_{n=0}^{N-1} \text{loss}(\pi(s^n), a^n)

where Π is some class of possible policies, loss\text{loss} is the loss function to measure how far off the policy’s prediction is, and the SL algorithm tells us how to compute this argmin\arg\min. -If training a deterministic policy that is just a function from inputs to outputs with no randomness, we might try to minimize the mean squared error. -More generally, though, we often choose the negative log likelihood as our loss function, so that the optimization is equivalent to maximum likelihood estimation: -out of the space of all possible mappings, we search for the one according to which the training dataset is the most likely.

\tilde \pi = \arg\max_{\pi \in \Pi} \pr_{a^n \sim \pi(s^n)}(a^{0:N} \mid s^{0:N})

Can we quantify how well this algorithm works? -For simplicity, let’s consider the case where the action space is discrete and both the data and trained policy are deterministic. -(This corresponds to a classification task in SL.) -Suppose the SL algorithm obtains ε\varepsilon classification error. +We’ll then explore dataset aggregation (DAgger) as a way to query an expert and learn even more effectively.

7.2 Behavioral cloning

This notion of “learning from human-provided data” may remind you of the basic premise of 4 Supervised learning. +In supervised learning, +there is some mapping from inputs to outputs, +such as the task of assigning the correct label to an image, +that humans can implicitly compute. +To teach a machine to calculate this mapping, +we first collect a large training dataset by getting people to label a lot of inputs, +and then use some optimization algorithm to produce a predictor that maps from the inputs to the outputs as closely as possible.

How does this relate to interactive tasks? +Here, the input is the observation seen by the agent and the output is the action it selects, +so the mapping is the agent’s policy. +What’s stopping us from applying supervised learning techniques to mimic the expert’s policy? +In principle, nothing! +This is called behavioral cloning.

Typically, this second task can be framed as empirical loss minimization:

\widetilde{\pi} = \arg\min_{\pi \in \Pi} \sum_{n=0}^{N-1} \text{loss}(\pi(s^n), a^n)

where Π is some class of possible policies, loss\text{loss} is the loss function to measure how different the policy’s prediction is from the true observed action, +and the SL algorithm itself, also known as the fitting method, tells us how to compute this argmin\arg\min.

How should we choose the loss function? +In supervised learning, we saw that the mean squared error is a good choice for continuous outputs. +However, how should we measure the difference between two actions in a discrete action space? +In this setting, the policy acts more like a classifier that picks the best action in a given state. +Rather than considering a deterministic policy that just outputs a single action, +we’ll consider a stochastic policy π that outputs a distribution over actions. +This allows us to assign a likelihood to observing the entire dataset D\mathcal{D} under the policy π, +assuming the state-action pairs are independent:

\pr_\pi (\mathcal{D}) = \prod_{n=1}^{N} \pi(a_n \mid s_n)

Note that the states and actions are not, however, actually independent! A key property of interactive tasks is that the agent’s output -- the action that it takes -- may influence its next observation. +We want to find a policy under which the training dataset D\mathcal{D} is the most likely. +This is called the maximum likelihood estimate of the policy that generated the dataset:

\widetilde{\pi} = \arg\max_{\pi \in \Pi} \pr_{\pi}(\mathcal{D})

This is also equivalent to picking the negative log likelihood as the loss function:

\begin{align*}
\widetilde{\pi} &= \arg\min_{\pi \in \Pi} - \log \pr_\pi(\mathcal{D}) \\
&= \arg\min_{\pi \in \Pi} \sum_{n=1}^N - \log \pi(a_n \mid s_n)
\end{align*}
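To make this concrete, below is a minimal sketch of behavioral cloning by maximum likelihood, assuming a small finite state and action space so that the policy can be stored as a table of softmax logits. The synthetic expert dataset, the expert's decision rule, and the learning rate are invented purely for illustration and are not part of the course code.

import numpy as np

rng = np.random.default_rng(0)
num_states, num_actions = 10, 4

# hypothetical expert demonstrations: each visited state is labeled with one expert action
expert_states = rng.integers(num_states, size=500)
expert_actions = (expert_states + 1) % num_actions

# tabular stochastic policy: π(a | s) = softmax(logits[s])[a]
logits = np.zeros((num_states, num_actions))

def action_probs(logits, s):
    z = np.exp(logits[s] - logits[s].max())
    return z / z.sum()

def neg_log_likelihood(logits):
    probs = [action_probs(logits, s)[a] for s, a in zip(expert_states, expert_actions)]
    return -np.mean(np.log(probs))

# gradient descent on the negative log likelihood (maximum likelihood estimation)
learning_rate = 1.0
for _ in range(200):
    grad = np.zeros_like(logits)
    for s, a in zip(expert_states, expert_actions):
        p = action_probs(logits, s)
        grad[s] += p        # d(-log π(a|s)) / d logits[s] = π(· | s) - one_hot(a)
        grad[s, a] -= 1.0
    logits -= learning_rate * grad / len(expert_states)

print(neg_log_likelihood(logits))  # decreases toward 0 for a deterministic expert

With a deterministic expert and a tabular policy class, the training likelihood can be driven close to 1; the more interesting question is how such a policy behaves on states the expert never visited, which the following sections take up.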

7.2.1 Performance of behavioral cloning

Can we quantify how well this algorithm works? +For simplicity, let’s consider the case where the action space is finite and both the expert policy and learned policy are deterministic. +Suppose the learned policy obtains ε\varepsilon classification error. That is, for trajectories drawn from the expert policy, -the learned policy chooses a different action at most ε\varepsilon of the time:

\mathbb{E}_{\tau \sim \rho_{\pi_{\text{data}}}} \left[ \frac 1 \hor \sum_{\hi=0}^{\hor-1} \ind{ \tilde \pi(s_\hi) \ne \pi_{\text{data}} (s_\hi) } \right] \le \varepsilon

Then, their value functions differ by

| V^{\pi_{\text{data}}} - V^{\tilde \pi} | \le H^2 \varepsilon

where H is the horizon. Intuitively, the errors compound: the learned policy deviates from the expert on roughly an ε fraction of steps, and each deviation can push it into states the expert never demonstrated, where it may keep making mistakes for the remaining (up to H) steps.
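As a rough illustration of how a quadratic dependence on the horizon arises, consider a toy model (our own construction, not from the notes): the expert collects reward 1 at every step, while the cloned policy makes a mistake with probability ε at each step and, once it has drifted off the expert's state distribution, collects no further reward. For εH ≪ 1, the value gap in this model grows roughly like εH²/2:

def value_gap(H: int, eps: float) -> float:
    """Exact expert-minus-clone value in the toy model: the clone earns reward 1
    at step h only if it has made no mistake in steps 1, ..., h."""
    clone_value = sum((1 - eps) ** h for h in range(1, H + 1))
    return H - clone_value

for H in (10, 50, 100):
    print(H, round(value_gap(H, eps=0.001), 3), 0.001 * H**2 / 2)  # empirical gap vs. εH²/2

The H²ε statement above is a worst-case bound, so this toy computation should be read only as intuition for why a small per-step classification error can translate into a much larger loss in value.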

7.3 Distribution shift

Let us return to the driving analogy. Suppose you have taken some driving lessons and now feel comfortable in your neighbourhood. But today you have to travel to an area you haven’t visited before, such as a highway, where it would be dangerous to try and apply the techniques you’ve already learned. +This is the issue of distribution shift: a policy learned under a certain distribution of states may not perform well if this distribution changes.

This is already a common issue in supervised learning, where the training dataset for a model might not resemble the environment where it gets deployed. +In interactive environments, this issue is further exacerbated by the dependency between the observations and the agent’s behavior; if you take a wrong turn early on, it may be difficult or impossible to recover in that trajectory.

How could you learn a strategy for these new settings? In the driving example, you might decide to install a dashcam to record the car’s surroundings. That way, once you make it back to safety, you can show the recording to an expert, who can provide feedback at each step of the way. Then the next time you go for a drive, you can remember the expert’s advice, and take a safer route. You could then repeat this training as many times as desired, thereby collecting the expert’s feedback over a diverse range of locations. -This is the key idea behind dataset aggregation.

7.4 Dataset aggregation (DAgger)

The DAgger algorithm is due to Ross et al. (2010).

def dagger_pseudocode(
    env,  # environment with horizon env.H to roll out trajectories in
    π_init: Policy,
    π_expert: Policy,
    n_dagger_iterations: int,
    n_trajectories_per_iteration: int,
):
    π = π_init
    dataset = set()

    for _ in range(n_dagger_iterations):
        for __ in range(n_trajectories_per_iteration):
            # roll out the *current* policy so we visit the states it actually reaches
            τ = collect_trajectory(π, env)
            # relabel every visited state with the action the expert would have taken
            for step in range(env.H):
                obs = τ.state[step]
                τ.action[step] = π_expert(obs)
            dataset.add(τ)

        # refit the policy on the aggregated, expert-relabeled dataset
        π = fit(dataset)

    return π

How well does DAgger perform?

References
  1. Ross, S., Gordon, G. J., & Bagnell, J. (2010, November). A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning. International Conference on Artificial Intelligence and Statistics.
\ No newline at end of file diff --git a/imitation-learning.json b/imitation-learning.json index 5714b0e..69714b8 100644 --- a/imitation-learning.json +++ b/imitation-learning.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"1e76726d66e846c6b0aed795c9cfc8b5359c0fc8bc249124a868f2881ec3941c","slug":"imitation-learning","location":"/imitation_learning.md","dependencies":[],"frontmatter":{"title":"7 Imitation Learning","numbering":{"all":{"enabled":true},"enumerator":{"template":"7.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"imitation_learning.md","url":"/build/imitation_learning-bf860cb6679fb159939c7b8b45aabd4b.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"oXh8i5tLc1"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"7.1","key":"uP4xl71ybO"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":23,"column":1}},"children":[{"type":"text","value":"Imagine you are tasked with learning how to drive. 
How do, or did, you go about it?\nAt first, this task might seem insurmountable: there are a vast array of controls, and the cost of making a single mistake could be extremely high, making it hard to explore by trial and error.\nLuckily, there are already people in the world who know how to drive who can get you started.\nIn this and many other examples, we all “stand on the shoulders of giants” and learn skills from experts who have already mastered them.","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"FhLwrFWBDc"}],"key":"vUi3Jmhvye"},{"type":"paragraph","position":{"start":{"line":25,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Now in machine learning, much of the time, we are trying to teach machines to accomplish tasks that us humans are already proficient at.\nIn such cases, the machine learning algorithm is the one learning the new skill, and humans are the “experts” that can demonstrate how to perform the task.\n","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"YBtrkvr1ux"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"Imitation learning","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"AcTbzYoL2c"}],"key":"MwYIHSbNoW"},{"type":"text","value":" is a direct application of this idea to machine learning for interactive tasks.\nWe’ll see that the most naive form of imitation learning, called ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"h9hwMUhAWv"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"behavioral cloning","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"eZLOaJVA9u"}],"key":"op1EzClOfH"},{"type":"text","value":", is really an application of supervised learning to interactive tasks.\nWe’ll then explore ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"KhdtZdKLyG"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"dataset aggregation","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"Vry7ZGQBma"}],"key":"O2fa4giLrL"},{"type":"text","value":" (DAgger) as a way to query an expert and learn even more effectively.","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"sveMZgpiXY"}],"key":"GX6ckKdf8M"},{"type":"heading","depth":2,"position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"Behavioral cloning","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"dDhbIaX9jU"}],"identifier":"behavioral-cloning","label":"Behavioral cloning","html_id":"behavioral-cloning","implicit":true,"enumerator":"7.2","key":"mOmaFJf5hh"},{"type":"paragraph","position":{"start":{"line":33,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"This notion of “learning from human-provided data” may remind you of the basic premise of ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"Ks466pQ2mN"},{"type":"link","url":"/supervised-learning","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"4 Supervised 
learning","key":"HEQfpGCu1M"}],"urlSource":"./supervised_learning.md","dataUrl":"/supervised-learning.json","internal":true,"protocol":"file","key":"ZSKRy5rXyu"},{"type":"text","value":",\nin which there is some mapping from ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"EqGgfqd0IE"},{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"inputs","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"Aq8YqIdIlP"}],"key":"LdgE3k6DHw"},{"type":"text","value":" to ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"E7Z2cqIrsy"},{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"outputs","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"SGHap08AG5"}],"key":"rvJPWDFbui"},{"type":"text","value":" that us humans can implicitly compute, such as seeing a photo and being able to recognize its constituents.\nTo teach a machine to calculate this mapping, we first collect a large ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"jPze3EXvV9"},{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"training dataset","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"vLVnalZlcM"}],"key":"f9nOvVsYOv"},{"type":"text","value":" by getting people to label a lot of inputs,\nand then use some optimization algorithm to produce a predictor that maps from the inputs to the outputs as closely as possible.\nHow does this relate to interactive tasks?\nHere, the input is the observation seen by the agent and the output is the action it selects, so the mapping is the agent’s policy.\nWhat’s stopping us from applying supervised learning techniques?\nIn practice, nothing! This is called ","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"NueeScPtUY"},{"type":"strong","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"behavioral cloning.","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"rwxQzYSm6n"}],"key":"f3bpMdCJXK"}],"key":"D3P90FE71O"},{"type":"proof","kind":"definition","label":"behavioral_cloning","identifier":"behavioral_cloning","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Behavioral cloning","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"CuoTGx7I3Y"}],"key":"SVA85mArSp"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":46,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":46,"column":1},"end":{"line":47,"column":1}},"children":[{"type":"text","value":"Collect a training dataset of trajectories generated by an expert policy ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"dFdWNpB6Z0"},{"type":"inlineMath","value":"\\pi_\\text{data}","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"πdata\\pi_\\text{data}πdata","key":"DQFdIqpqRS"},{"type":"text","value":". 
Here, we treat each state-action pair as independent, resuling in a dataset ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"OZVzizL1ox"},{"type":"inlineMath","value":"\\mathcal{D} = (s^n, a^n)_{n=1}^{N}","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"D=(sn,an)n=1N\\mathcal{D} = (s^n, a^n)_{n=1}^{N}D=(sn,an)n=1N","key":"obD0KGPTCb"},{"type":"text","value":". (For concreteness, if there are ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"cWhTonljeO"},{"type":"inlineMath","value":"M","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"MMM","key":"GrOQxW3Eii"},{"type":"text","value":" trajectories with a horizon ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"HwNFWVZKua"},{"type":"inlineMath","value":"H","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"HHH","key":"qZ263wYgE6"},{"type":"text","value":", then ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"eyxUzZtJzA"},{"type":"inlineMath","value":"N = M \\times H","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"N=M×HN = M \\times HN=M×H","key":"vZg9EVe6mv"},{"type":"text","value":".)","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"dL9vgFN1tg"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"children":[{"type":"text","value":"Note that this is an inaccurate approximation! A key property of interactive tasks is that the agent’s output -- the action that it takes -- may influence its next observation.","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"gUlotbtCDK"}],"key":"VnW2WuHsGE"}],"key":"vlVsRmZxKV"}],"key":"PgG5bLgWe9"},{"type":"listItem","spread":true,"position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"Use a SL algorithm ","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"unUAGzD0Jn"},{"type":"inlineMath","value":"\\texttt{fit} : \\mathcal{D} \\mapsto \\tilde \\pi","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"html":"fit:Dπ~\\texttt{fit} : \\mathcal{D} \\mapsto \\tilde \\pifit:Dπ~","key":"zGitxRUPVv"},{"type":"text","value":" to extract a policy ","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"AeUEesKSW3"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"html":"π~\\tilde \\piπ~","key":"DCAunLkwKC"},{"type":"text","value":" that approximates the expert policy.","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"hZaZuzOlx2"}],"key":"kuYNYLOMuR"}],"key":"lM2KdOXn8N"}],"enumerator":"7.1","html_id":"behavioral-cloning","key":"Qqv98Fxssl"},{"type":"paragraph","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"Typically, this second task can be framed as ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"b2zv6c7MY2"},{"type":"strong","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"empirical loss 
minimization","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"PpYAo0TXhv"}],"key":"wEO6r81XoP"},{"type":"text","value":":","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"vesWO6joHd"}],"key":"fxj4ue0hj7"},{"type":"math","value":"\\tilde \\pi = \\arg\\min_{\\pi \\in \\Pi} \\sum_{n=0}^{N-1} \\text{loss}(\\pi(s^n), a^n)","html":"π~=argminπΠn=0N1loss(π(sn),an)\\tilde \\pi = \\arg\\min_{\\pi \\in \\Pi} \\sum_{n=0}^{N-1} \\text{loss}(\\pi(s^n), a^n)π~=argπΠminn=0N1loss(π(sn),an)","enumerator":"7.1","key":"NEHPiVjfDp"},{"type":"paragraph","position":{"start":{"line":57,"column":1},"end":{"line":60,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"tjVBRiz8OM"},{"type":"text","value":"Π","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"cC7rJoN50G"},{"type":"text","value":" is some class of possible policies, ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"K8jlx0rw5p"},{"type":"inlineMath","value":"\\text{loss}","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"loss\\text{loss}loss","key":"WF6FZvCuxO"},{"type":"text","value":" is the loss function to measure how far off the policy’s prediction is, and the SL algorithm tells us how to compute this ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"FU8zR8Y4LK"},{"type":"inlineMath","value":"\\arg\\min","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"argmin\\arg\\minargmin","key":"gs9AEWlznE"},{"type":"text","value":".\nIf training a deterministic policy that is just a function from inputs to outputs with no randomness, we might try to minimize the ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"ekcvI4J5gE"},{"type":"strong","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"text","value":"mean squared error","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"OajRx00tRw"}],"key":"DuJEsxgDO8"},{"type":"text","value":".\nMore generally, though, we often choose the ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"yDba0XDiPb"},{"type":"strong","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"text","value":"negative log likelihood","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"IfxSTiy9jD"}],"key":"wtRPCMWLXa"},{"type":"text","value":" as our loss function, so that the optimization is equivalent to ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"spy78n0s1f"},{"type":"strong","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"text","value":"maximum likelihood estimation","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"RWpom3CdP9"}],"key":"ZUKuXlHfsm"},{"type":"text","value":":\nout of the space of all possible mappings, we search for the one according to which the training dataset is the most likely.","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"VSwAOhZxWQ"}],"key":"I9R8PK3sy6"},{"type":"math","value":"\\tilde \\pi = \\arg\\max_{\\pi \\in \\Pi} \\pr_{a^n \\sim \\pi(s^n)}(a^{0:N} \\mid s^{0:N})","html":"π~=argmaxπΠPanπ(sn)(a0:Ns0:N)\\tilde \\pi = \\arg\\max_{\\pi \\in \\Pi} \\pr_{a^n \\sim 
\\pi(s^n)}(a^{0:N} \\mid s^{0:N})π~=argπΠmaxPanπ(sn)(a0:Ns0:N)","enumerator":"7.2","key":"akh00Gigph"},{"type":"paragraph","position":{"start":{"line":66,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"Can we quantify how well this algorithm works?\nFor simplicity, let’s consider the case where the action space is discrete and both the data and trained policy are deterministic.\n(This corresponds to a classification task in SL.)\nSuppose the SL algorithm obtains ","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"key":"ZKV6A2HTDP"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"html":"ε\\varepsilonε","key":"VAOh0Tl49V"},{"type":"text","value":" classification error.\nThat is, for trajectories drawn from the expert policy,\nthe learned policy chooses a different action at most ","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"key":"gwccsB9bnj"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"html":"ε\\varepsilonε","key":"m3PhHXxSPj"},{"type":"text","value":" of the time:","position":{"start":{"line":66,"column":1},"end":{"line":66,"column":1}},"key":"gHwJBWcuAY"}],"key":"qoyCmV1ZkX"},{"type":"math","value":"\\mathbb{E}_{\\tau \\sim \\rho_{\\pi_{\\text{data}}}} \\left[ \\frac 1 \\hor \\sum_{\\hi=0}^{\\hor-1} \\ind{ \\tilde \\pi(s_\\hi) \\ne \\pi_{\\text{data}} (s_\\hi) } \\right] \\le \\varepsilon","html":"Eτρπdata[1Hh=0H11{π~(sh)πdata(sh)}]ε\\mathbb{E}_{\\tau \\sim \\rho_{\\pi_{\\text{data}}}} \\left[ \\frac 1 \\hor \\sum_{\\hi=0}^{\\hor-1} \\ind{ \\tilde \\pi(s_\\hi) \\ne \\pi_{\\text{data}} (s_\\hi) } \\right] \\le \\varepsilonEτρπdata[H1h=0H11{π~(sh)=πdata(sh)}]ε","enumerator":"7.3","key":"JapqNvqqyP"},{"type":"paragraph","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"children":[{"type":"text","value":"Then, their value functions differ by","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"rQQcvoL2dx"}],"key":"JilhKsyFmh"},{"type":"math","value":"| V^{\\pi_{\\text{data}}} - V^{\\tilde \\pi} | \\le H^2 \\varepsilon","html":"VπdataVπ~H2ε| V^{\\pi_{\\text{data}}} - V^{\\tilde \\pi} | \\le H^2 \\varepsilonVπdataVπ~H2ε","enumerator":"7.4","key":"Xh4FfHYkcc"},{"type":"paragraph","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"FmAlkedGmG"},{"type":"inlineMath","value":"H","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"HHH","key":"qLLQ1Afb9L"},{"type":"text","value":" is the horizon.","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"lwlBsMJBHb"}],"key":"b7uUNdqsH4"},{"type":"proof","kind":"theorem","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Performance of behavioral cloning","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"y8XchgvHJc"}],"key":"HwZabIwGUy"},{"type":"paragraph","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"text","value":"Recall the ","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"key":"GAzZOVwp2P"},{"type":"crossReference","kind":"proof:theorem","identifier":"pdl","label":"pdl","children":[{"type":"text","value":"Theorem 
","key":"jYgIaIx9Dg"},{"type":"text","value":"6.1","key":"TDsSgBYhdX"}],"template":"Theorem %s","enumerator":"6.1","resolved":true,"html_id":"pdl","remote":true,"url":"/pg","dataUrl":"/pg.json","key":"tfkWlO4dmH"},{"type":"text","value":" allows us to express the difference between ","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"key":"wzzzPHVcqk"},{"type":"inlineMath","value":"\\pi_{\\text{data}}","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"html":"πdata\\pi_{\\text{data}}πdata","key":"SpjYCGalyi"},{"type":"text","value":" and ","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"key":"N3iCiTdjLw"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"html":"π~\\tilde \\piπ~","key":"GP37lQfihj"},{"type":"text","value":" as","position":{"start":{"line":87,"column":1},"end":{"line":87,"column":1}},"key":"Sytr8emzRd"}],"key":"gZVuQeDraM"},{"type":"math","value":"V_0^{\\pi_{\\text{data}}}(s) - V_0^{\\tilde \\pi} (s) = \\E_{\\tau \\sim \\rho^{\\pi_{\\text{data}}} \\mid s_0 = s} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\tilde \\pi} (s_\\hi, a_\\hi) \\right].","position":{"start":{"line":89,"column":1},"end":{"line":91,"column":1}},"html":"V0πdata(s)V0π~(s)=Eτρπdatas0=s[h=0H1Ahπ~(sh,ah)].V_0^{\\pi_{\\text{data}}}(s) - V_0^{\\tilde \\pi} (s) = \\E_{\\tau \\sim \\rho^{\\pi_{\\text{data}}} \\mid s_0 = s} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\tilde \\pi} (s_\\hi, a_\\hi) \\right].V0πdata(s)V0π~(s)=Eτρπdatas0=s[h=0H1Ahπ~(sh,ah)].","enumerator":"7.5","key":"lypGqarzgg"},{"type":"paragraph","position":{"start":{"line":93,"column":1},"end":{"line":95,"column":1}},"children":[{"type":"text","value":"Now since the data policy is deterministic, we can substitute ","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"u1cKdJkW4H"},{"type":"inlineMath","value":"a_\\hi = \\pi_{\\text{data}}(s_\\hi)","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"html":"ah=πdata(sh)a_\\hi = \\pi_{\\text{data}}(s_\\hi)ah=πdata(sh)","key":"xeSJ2xQiKx"},{"type":"text","value":".\nThis allows us to make a further simplification:\nsince ","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"OHzxKksRDO"},{"type":"inlineMath","value":"\\pi_{\\text{data}}","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"html":"πdata\\pi_{\\text{data}}πdata","key":"As1VXCtcDF"},{"type":"text","value":" is deterministic, we have","position":{"start":{"line":93,"column":1},"end":{"line":93,"column":1}},"key":"cN22EzbAlk"}],"key":"LgNPFvBj7Q"},{"type":"math","value":"A^{\\pi_{\\text{data}}}(s, \\pi_{\\text{data}}(s)) = Q^{\\pi_{\\text{data}}}(s, \\pi_{\\text{data}}(s)) - V^{\\pi_{\\text{data}}}(s) = 0.","position":{"start":{"line":97,"column":1},"end":{"line":99,"column":1}},"html":"Aπdata(s,πdata(s))=Qπdata(s,πdata(s))Vπdata(s)=0.A^{\\pi_{\\text{data}}}(s, \\pi_{\\text{data}}(s)) = Q^{\\pi_{\\text{data}}}(s, \\pi_{\\text{data}}(s)) - V^{\\pi_{\\text{data}}}(s) = 0.Aπdata(s,πdata(s))=Qπdata(s,πdata(s))Vπdata(s)=0.","enumerator":"7.6","key":"I8nDk4SeFV"},{"type":"paragraph","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"text","value":"Now we can use the assumption that the SL algorithm obtains 
","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"u01FihDZbh"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"ε\\varepsilonε","key":"sSmgQtZs2E"},{"type":"text","value":" classification error. By the above, ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"r6fZI9igoh"},{"type":"inlineMath","value":"A_\\hi^{\\tilde \\pi}(s_\\hi, \\pi_{\\text{data}}(s_\\hi)) = 0","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"Ahπ~(sh,πdata(sh))=0A_\\hi^{\\tilde \\pi}(s_\\hi, \\pi_{\\text{data}}(s_\\hi)) = 0Ahπ~(sh,πdata(sh))=0","key":"X1sA3tl6Hi"},{"type":"text","value":" when ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"gRDoqQUUXh"},{"type":"inlineMath","value":"\\pi_{\\text{data}}(s_\\hi) = \\tilde \\pi(s_\\hi)","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"πdata(sh)=π~(sh)\\pi_{\\text{data}}(s_\\hi) = \\tilde \\pi(s_\\hi)πdata(sh)=π~(sh)","key":"dw8g9uJaqa"},{"type":"text","value":". In the case where the two policies differ on ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"prDADnAiHK"},{"type":"inlineMath","value":"s_\\hi","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"shs_\\hish","key":"OA3zdxo4Yd"},{"type":"text","value":", which occurs with probability ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"CYPfV12peO"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"ε\\varepsilonε","key":"qOLZ8kkTQf"},{"type":"text","value":", the advantage is naively upper bounded by ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"GxI0fUmzSu"},{"type":"inlineMath","value":"H","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"HHH","key":"uDcu3f1dwz"},{"type":"text","value":" (assuming rewards are bounded between ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"CTMMv2EcU7"},{"type":"text","value":"0","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"HAUGvAwRDD"},{"type":"text","value":" and ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"eHaHWL8Q8i"},{"type":"text","value":"1","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"rptjfJSUIq"},{"type":"text","value":"). Taking the final sum gives the desired bound.","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"siAUJpThWk"}],"key":"Gyqwv9BlHa"}],"enumerator":"7.1","key":"iYuXkfm6wm"},{"type":"comment","value":" TODO ADD DISTRIBUTION SHIFT EXAMPLE FROM SLIDES ","key":"Taw695tfSQ"},{"type":"heading","depth":2,"position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"children":[{"type":"text","value":"Distribution shift","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"nU9LKCjcVw"}],"identifier":"distribution-shift","label":"Distribution shift","html_id":"distribution-shift","implicit":true,"enumerator":"7.3","key":"IBgd17hZyD"},{"type":"paragraph","position":{"start":{"line":108,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"Let us return to the driving analogy. 
Suppose you have taken some driving lessons and now feel comfortable in your neighbourhood. But today you have to travel to an area you haven’t visited before, such as a highway, where it would be dangerous to try and apply the techniques you’ve already learned.\nThis is the issue of ","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"as8mnFXYTq"},{"type":"emphasis","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"children":[{"type":"text","value":"distribution shift","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"RmllCBxOFv"}],"key":"nSzzkXEauD"},{"type":"text","value":": a policy learned under some distribution of states may not perform well if this distribution changes.","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"FpRQHrY1Sy"}],"key":"jV0E40BIg4"},{"type":"paragraph","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"children":[{"type":"text","value":"This is already a common issue in supervised learning, where the training dataset for a model might not resemble the environment where it gets deployed. In interactive environments, this issue is further exacerbated by the dependency between the observations and the agent’s behaviour; if you take a wrong turn early on, it may be difficult or impossible to recover in that trajectory.","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"JnmoLoO9Xw"}],"key":"lHE3IlQmbo"},{"type":"paragraph","position":{"start":{"line":113,"column":1},"end":{"line":117,"column":1}},"children":[{"type":"text","value":"How could you learn a strategy for these new settings?\nIn the driving example, you might decide to install a dashcam to record the car’s surroundings. 
That way, once you make it back to safety, you can show the recording to an expert, who can provide feedback at each step of the way.\nThen the next time you go for a drive, you can remember the expert’s advice, and take a safer route.\nYou could then repeat this training as many times as desired, thereby collecting the expert’s feedback over a diverse range of locations.\nThis is the key idea behind ","position":{"start":{"line":113,"column":1},"end":{"line":113,"column":1}},"key":"goWvkPi2AU"},{"type":"emphasis","position":{"start":{"line":113,"column":1},"end":{"line":113,"column":1}},"children":[{"type":"text","value":"dataset aggregation","position":{"start":{"line":113,"column":1},"end":{"line":113,"column":1}},"key":"dIYAmmX4bo"}],"key":"zHtvhXCAgr"},{"type":"text","value":".","position":{"start":{"line":113,"column":1},"end":{"line":113,"column":1}},"key":"zV4YzP8zG8"}],"key":"mfCZpQdqpQ"},{"type":"heading","depth":2,"position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"children":[{"type":"text","value":"Dataset aggregation (DAgger)","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"key":"blZvFAU3RC"}],"identifier":"dataset-aggregation-dagger","label":"Dataset aggregation (DAgger)","html_id":"dataset-aggregation-dagger","implicit":true,"enumerator":"7.4","key":"oQXKG3nY2e"},{"type":"paragraph","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"children":[{"type":"text","value":"The DAgger algorithm is due to ","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"dWQfNx6Wie"},{"type":"cite","kind":"narrative","label":"ross_reduction_2010","identifier":"ross_reduction_2010","children":[{"type":"text","value":"Ross ","key":"w6TKNrKqtP"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"jcLVHPOIwO"}],"key":"QM2UU8engx"},{"type":"text","value":" (2010)","key":"GPIGtk8qB0"}],"enumerator":"1","key":"xr4SUNYGnC"},{"type":"text","value":".","position":{"start":{"line":121,"column":1},"end":{"line":121,"column":1}},"key":"jLC1yXGJ3P"}],"key":"dv5PJUiiv3"},{"type":"code","lang":"python","value":"def dagger_pseudocode(\n env: MAB,\n π_init: Policy,\n π_expert: Policy,\n n_dagger_iterations: int,\n n_trajectories_per_iteration: int\n):\n π = π_init\n dataset = set()\n\n for _ in range(n_dagger_iterations):\n for __ in range(n_trajectories_per_iteration):\n τ = collect_trajectory(π, env)\n for step in range(env.H):\n obs = τ.state[step]\n τ.action[step] = π_expert(obs)\n dataset.add(τ)\n \n π = fit(dataset)\n \n return π","position":{"start":{"line":123,"column":1},"end":{"line":145,"column":1}},"key":"pX8GIbEzMl"},{"type":"paragraph","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"How well does DAgger perform?","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"dg8llzYhfa"}],"key":"AYLCeONG4A"},{"type":"comment","value":" TODO ","key":"uAlI2xSmDS"}],"key":"FQmos3Rs3x"}],"key":"GsBnDRtkdy"},"references":{"cite":{"order":["ross_reduction_2010"],"data":{"ross_reduction_2010":{"label":"ross_reduction_2010","enumerator":"1","html":"Ross, S., Gordon, G. J., & Bagnell, J. (2010, November). A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning. 
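The pseudocode above leaves `collect_trajectory` and `fit` abstract. The sketch below spells out the same loop under assumed interfaces: `reset`, `step`, and `horizon` stand in for an episodic environment, and `fit` can be any supervised learning routine that maps a dataset of (state, expert action) pairs to a policy. All of these names are illustrative, not part of any particular library.

```python
from typing import Callable, List, Tuple

State = int      # placeholder state type
Action = int     # placeholder action type
Policy = Callable[[State], Action]
Dataset = List[Tuple[State, Action]]


def dagger(
    reset: Callable[[], State],              # assumed: sample an initial state
    step: Callable[[State, Action], State],  # assumed: environment transition
    horizon: int,
    pi_init: Policy,
    pi_expert: Policy,
    fit: Callable[[Dataset], Policy],        # any supervised learning routine
    n_dagger_iterations: int,
    n_trajectories_per_iteration: int,
) -> Policy:
    """Sketch of DAgger: roll out the current policy, have the expert
    relabel every visited state, aggregate, and refit."""
    pi = pi_init
    dataset: Dataset = []
    for _ in range(n_dagger_iterations):
        for _ in range(n_trajectories_per_iteration):
            s = reset()
            for _ in range(horizon):
                a = pi(s)                          # the learner acts, so we visit *its* states
                dataset.append((s, pi_expert(s)))  # but the expert provides the label
                s = step(s, a)
        pi = fit(dataset)                          # refit on the aggregated dataset
    return pi
```

The crucial difference from behavioral cloning is that the states in `dataset` are drawn from the learner's own trajectories, while the actions used as labels come from the expert.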
International Conference on Artificial Intelligence and Statistics."}}}},"footer":{"navigation":{"prev":{"title":"6 Policy Gradient Methods","url":"/pg","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"8 Tree Search Methods","url":"/planning","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"40f36ad5d7845a64bf77ab662900f54de45318fe5ce887437f4c2cb41510408a","slug":"imitation-learning","location":"/imitation_learning.md","dependencies":[],"frontmatter":{"title":"7 Imitation Learning","numbering":{"all":{"enabled":true},"enumerator":{"template":"7.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/robot-imitation-lear-8001fbb5135e7bfeebfc489e721eaabd.jpg","exports":[{"format":"md","filename":"imitation_learning.md","url":"/build/imitation_learning-bf09ff59ddcdb66b7ab3f1189910eb31.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"pbibyaLOjE"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"7.1","key":"ZbG30JLaVs"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"Imagine you are tasked with learning how to drive. 
How do, or did, you go about it?\nAt first, this task might seem insurmountable: there are a vast array of controls, and the cost of making a single mistake could be extremely high, making it hard to explore by trial and error.\nLuckily, there are already people in the world who know how to drive who can get you started.\nIn almost every challenge we face,\nwe “stand on the shoulders of giants” and learn skills from experts who have already mastered them.","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"O5BDA5hn8t"}],"key":"sxGKsIhauE"},{"type":"image","url":"/build/robot-imitation-lear-8001fbb5135e7bfeebfc489e721eaabd.jpg","alt":"a robot imitating the pose of a young child (Photo by Pavel Danilyuk: https://www.pexels.com/photo/a-robot-imitating-a-girl-s-movement-8294811/)","position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"key":"ydyk30bDIP","urlSource":"./shared/robot-imitation-learning.jpg"},{"type":"paragraph","position":{"start":{"line":28,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"Now in machine learning,\nwe are often trying to teach machines to accomplish tasks that humans are already proficient at.\nIn such cases, the machine learning algorithm is the one learning the new skill, and humans are the “experts” that can demonstrate how to perform the task.\n","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"zT6du8vwZS"},{"type":"strong","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"children":[{"type":"text","value":"Imitation learning","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"zkHy2nmPuw"}],"key":"CicZi4sObD"},{"type":"text","value":" is a strategy for getting the learner to perform at least as well as the expert.\nWe’ll see that the most naive form of imitation learning, called ","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"fDSK47p7Uu"},{"type":"strong","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"children":[{"type":"text","value":"behavioral cloning","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"X3BNcb2OQz"}],"key":"pVQFvQGknJ"},{"type":"text","value":", is really an application of supervised learning to interactive tasks.\nWe’ll then explore ","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"NtzbLji2KD"},{"type":"strong","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"children":[{"type":"text","value":"dataset aggregation","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"IOi9Ferk9q"}],"key":"wmk5uLhqbe"},{"type":"text","value":" (DAgger) as a way to query an expert and learn even more effectively.","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"nP6gUKaUqH"}],"key":"upGcUSebts"},{"type":"heading","depth":2,"position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"text","value":"Behavioral cloning","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"ImKiOCO7wP"}],"identifier":"behavioral-cloning","label":"Behavioral cloning","html_id":"behavioral-cloning","implicit":true,"enumerator":"7.2","key":"TcXXYAk0ZF"},{"type":"paragraph","position":{"start":{"line":37,"column":1},"end":{"line":44,"column":1}},"children":[{"type":"text","value":"This notion of “learning from human-provided data” may remind you of the basic premise of 
","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"key":"VyibMZZG3A"},{"type":"link","url":"/supervised-learning","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"text","value":"4 Supervised learning","key":"lMep1YyoxQ"}],"urlSource":"./supervised_learning.md","dataUrl":"/supervised-learning.json","internal":true,"protocol":"file","key":"MITlFvVfzD"},{"type":"text","value":".\nIn supervised learning,\nthere is some mapping from ","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"key":"irYw8egdrI"},{"type":"emphasis","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"text","value":"inputs","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"key":"SE5qMRma7s"}],"key":"udl54OGbDf"},{"type":"text","value":" to ","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"key":"Q1XVV8GXx1"},{"type":"emphasis","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"text","value":"outputs","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"key":"cABnNbd9L5"}],"key":"Ste75RPWDY"},{"type":"text","value":",\nsuch as the task of assigning the correct label to an image,\nthat humans can implicitly compute.\nTo teach a machine to calculate this mapping,\nwe first collect a large ","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"key":"VQBbwrXNrG"},{"type":"emphasis","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"text","value":"training dataset","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"key":"euuaq11ZlP"}],"key":"rLDFhjvkU7"},{"type":"text","value":" by getting people to label a lot of inputs,\nand then use some optimization algorithm to produce a predictor that maps from the inputs to the outputs as closely as possible.","position":{"start":{"line":37,"column":1},"end":{"line":37,"column":1}},"key":"eskulUEdmc"}],"key":"J7aOttJJ4G"},{"type":"paragraph","position":{"start":{"line":46,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"How does this relate to interactive tasks?\nHere, the input is the observation seen by the agent and the output is the action it selects,\nso the mapping is the agent’s ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"H8Vj7WUd1y"},{"type":"emphasis","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"children":[{"type":"text","value":"policy","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"zvRej16g0o"}],"key":"UfWDAkUAiX"},{"type":"text","value":".\nWhat’s stopping us from applying supervised learning techniques to mimic the expert’s policy?\nIn principle, nothing!\nThis is called ","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"qsKWN97x81"},{"type":"strong","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"children":[{"type":"text","value":"behavioral cloning.","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"YNNmsqdCs9"}],"key":"A6Un0t0J0f"}],"key":"pN8afUxvNs"},{"type":"proof","kind":"definition","label":"behavioral_cloning","identifier":"behavioral_cloning","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Behavioral 
cloning","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"Do6P2pf5Er"}],"key":"NBuiZdqxjB"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":57,"column":1},"end":{"line":58,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"text","value":"Collect a training dataset of trajectories ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"V6K0TSR6o1"},{"type":"inlineMath","value":"\\mathcal{D} = (s^n, a^n)_{n=1}^{N}","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"D=(sn,an)n=1N\\mathcal{D} = (s^n, a^n)_{n=1}^{N}D=(sn,an)n=1N","key":"pVnNtIsG6Z"},{"type":"text","value":" generated by an ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"qXqNklsh0k"},{"type":"strong","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"children":[{"type":"text","value":"expert policy","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"xQNZLJMZQs"}],"key":"RRd70xIkft"},{"type":"text","value":" ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"OwpKhBHec4"},{"type":"inlineMath","value":"\\pi_\\text{expert}","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"πexpert\\pi_\\text{expert}πexpert","key":"JJ0xUo2AHd"},{"type":"text","value":". (For example, if the dataset contains ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"MdvcWGS9Ez"},{"type":"inlineMath","value":"M","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"MMM","key":"FrmUc3KL14"},{"type":"text","value":" trajectories, each with a finite horizon ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"nYeHz8xrU4"},{"type":"inlineMath","value":"H","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"HHH","key":"WtUuASJ4rO"},{"type":"text","value":", then ","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"Oap8BdhY6i"},{"type":"inlineMath","value":"N = M \\times H","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"html":"N=M×HN = M \\times HN=M×H","key":"Z9H5WpdHW2"},{"type":"text","value":".)","position":{"start":{"line":57,"column":1},"end":{"line":57,"column":1}},"key":"yrozAzjtSJ"}],"key":"NG6rWotTg9"},{"type":"listItem","spread":true,"position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"children":[{"type":"text","value":"Use a SL algorithm ","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"key":"Y9IQguUzEv"},{"type":"inlineMath","value":"\\texttt{fit} : \\mathcal{D} \\mapsto \\widetilde{\\pi}","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"html":"fit:Dπ~\\texttt{fit} : \\mathcal{D} \\mapsto \\widetilde{\\pi}fit:Dπ","key":"Xbpug9w3YE"},{"type":"text","value":" to extract a policy ","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"key":"KckgIxamPS"},{"type":"inlineMath","value":"\\widetilde{\\pi}","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"html":"π~\\widetilde{\\pi}π","key":"MOBnUDQYn6"},{"type":"text","value":" that approximates the expert 
policy.","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"key":"Qc1b8lV0Qm"}],"key":"uMRi5GXm8c"}],"key":"YeweqtEGRl"}],"enumerator":"7.1","html_id":"behavioral-cloning","key":"TYU0TTiVXH"},{"type":"paragraph","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"children":[{"type":"text","value":"Typically, this second task can be framed as ","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"Drx5W0WiGG"},{"type":"strong","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"children":[{"type":"text","value":"empirical loss minimization","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"l1txRuQihP"}],"key":"au5QjV9huk"},{"type":"text","value":":","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"UZ8rkoZmiI"}],"key":"k3kP5CfmNj"},{"type":"math","value":"\\widetilde{\\pi} = \\arg\\min_{\\pi \\in \\Pi} \\sum_{n=0}^{N-1} \\text{loss}(\\pi(s^n), a^n)","html":"π~=argminπΠn=0N1loss(π(sn),an)\\widetilde{\\pi} = \\arg\\min_{\\pi \\in \\Pi} \\sum_{n=0}^{N-1} \\text{loss}(\\pi(s^n), a^n)π=argπΠminn=0N1loss(π(sn),an)","enumerator":"7.1","key":"c0HT3g5oSb"},{"type":"paragraph","position":{"start":{"line":67,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"key":"drCTEAUnRl"},{"type":"text","value":"Π","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"key":"jLaR7YFsSp"},{"type":"text","value":" is some class of possible policies, ","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"key":"RLWFSsrbZZ"},{"type":"inlineMath","value":"\\text{loss}","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"html":"loss\\text{loss}loss","key":"hKnKjk7QbA"},{"type":"text","value":" is the loss function to measure how different the policy’s prediction is from the true observed action,\nand the SL algorithm itself, also known as the ","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"key":"mOxUfVcZtm"},{"type":"strong","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"children":[{"type":"text","value":"fitting method","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"key":"SfHKV9ydnG"}],"key":"mWPjeC7SHJ"},{"type":"text","value":", tells us how to compute this ","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"key":"YMJsCgCSnH"},{"type":"inlineMath","value":"\\arg\\min","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"html":"argmin\\arg\\minargmin","key":"jLdgzf1WPL"},{"type":"text","value":".","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"key":"bwHVZMRZ5P"}],"key":"Wu8mSilnEf"},{"type":"paragraph","position":{"start":{"line":70,"column":1},"end":{"line":77,"column":1}},"children":[{"type":"text","value":"How should we choose the loss function?\nIn supervised learning, we saw that the ","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"UIDQxLIFaq"},{"type":"strong","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"children":[{"type":"text","value":"mean squared error","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"mwbWCFAwyG"}],"key":"MvZSqLQBuT"},{"type":"text","value":" is a good choice for continuous outputs.\nHowever, how should we measure the difference 
between two actions in a ","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"dYRYcl3l33"},{"type":"emphasis","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"children":[{"type":"text","value":"discrete","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"iTsnJyhYqs"}],"key":"K5ewf2g7S3"},{"type":"text","value":" action space?\nIn this setting, the policy acts more like a ","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"rnJe2SN2oJ"},{"type":"emphasis","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"children":[{"type":"text","value":"classifier","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"EHKcxW98lk"}],"key":"otJldix8CK"},{"type":"text","value":" that picks the best action in a given state.\nRather than considering a deterministic policy that just outputs a single action,\nwe’ll consider a stochastic policy ","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"IPvWJ4hRTg"},{"type":"text","value":"π","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"wkTNIfRjvZ"},{"type":"text","value":" that outputs a ","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"a2s6TmarRf"},{"type":"emphasis","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"children":[{"type":"text","value":"distribution","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"deDvzQZSPj"}],"key":"Gqf6lo92Ae"},{"type":"text","value":" over actions.\nThis allows us to assign a ","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"mxkEtnKL1c"},{"type":"emphasis","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"children":[{"type":"text","value":"likelihood","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"u88yXhj0yn"}],"key":"PbkQq26Veq"},{"type":"text","value":" to observing the entire dataset ","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"HgMuwGj8KL"},{"type":"inlineMath","value":"\\mathcal{D}","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"html":"D\\mathcal{D}D","key":"zEzdNQkiER"},{"type":"text","value":" under the policy ","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"WGBHiwsuKA"},{"type":"text","value":"π","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"KZM4Qo8ZCh"},{"type":"text","value":",\nassuming the state-action pairs are independent:","position":{"start":{"line":70,"column":1},"end":{"line":70,"column":1}},"key":"ITkW8vqSg9"}],"key":"K5bTIeqKbd"},{"type":"math","value":"\\pr_\\pi (\\mathcal{D}) = \\prod_{n=1}^{N} \\pi(a_n \\mid s_n)","position":{"start":{"line":79,"column":1},"end":{"line":81,"column":1}},"html":"Pπ(D)=n=1Nπ(ansn)\\pr_\\pi (\\mathcal{D}) = \\prod_{n=1}^{N} \\pi(a_n \\mid s_n)Pπ(D)=n=1Nπ(ansn)","enumerator":"7.2","key":"JBerwyIl2P"},{"type":"paragraph","position":{"start":{"line":83,"column":1},"end":{"line":85,"column":1}},"children":[{"type":"text","value":"Note that the states and actions are 
","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"FvCbE9IlS6"},{"type":"emphasis","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"FfNx5x4tQV"}],"key":"QKtETlqHfX"},{"type":"text","value":", however, actually independent! A key property of interactive tasks is that the agent’s output -- the action that it takes -- may influence its next observation.\nWe want to find a policy under which the training dataset ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"vRyL04NM4L"},{"type":"inlineMath","value":"\\mathcal{D}","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"D\\mathcal{D}D","key":"j1wUj4qKez"},{"type":"text","value":" is the most likely.\nThis is called the ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"Vl0TPONKEA"},{"type":"strong","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"maximum likelihood estimate","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"UcBQ2Grgyz"}],"key":"R9O5tvBDRD"},{"type":"text","value":" of the policy that generated the dataset:","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"PesJP2nxdd"}],"key":"afLeoIjyFm"},{"type":"math","value":"\\widetilde{\\pi} = \\arg\\max_{\\pi \\in \\Pi} \\pr_{\\pi}(\\mathcal{D})","html":"π~=argmaxπΠPπ(D)\\widetilde{\\pi} = \\arg\\max_{\\pi \\in \\Pi} \\pr_{\\pi}(\\mathcal{D})π=argπΠmaxPπ(D)","enumerator":"7.3","key":"flMGvJAaOI"},{"type":"paragraph","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"children":[{"type":"text","value":"This is also equivalent to picking the ","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"vEz8ONSMtv"},{"type":"strong","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"children":[{"type":"text","value":"negative log likelihood","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"Kf2c54WZLS"}],"key":"xDyqr2KrPL"},{"type":"text","value":" as the loss function:","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"XOh2YQoyGU"}],"key":"HtUFP0Bxnr"},{"type":"math","value":"\\begin{align*}\n\\widetilde{\\pi} &= \\arg\\min_{\\pi \\in \\Pi} - \\log \\pr_\\pi(\\mathcal{D}) \\\\\n&= \\arg\\min_{\\pi \\in \\Pi} \\sum_{n=1}^N - \\log \\pi(a_n \\mid s_n)\n\\end{align*}","html":"π~=argminπΠlogPπ(D)=argminπΠn=1Nlogπ(ansn)\\begin{align*}\n\\widetilde{\\pi} &= \\arg\\min_{\\pi \\in \\Pi} - \\log \\pr_\\pi(\\mathcal{D}) \\\\\n&= \\arg\\min_{\\pi \\in \\Pi} \\sum_{n=1}^N - \\log \\pi(a_n \\mid s_n)\n\\end{align*}π=argπΠminlogPπ(D)=argπΠminn=1Nlogπ(ansn)","enumerator":"7.4","key":"AN1HSsAhjq"},{"type":"heading","depth":3,"position":{"start":{"line":100,"column":1},"end":{"line":100,"column":1}},"children":[{"type":"text","value":"Performance of behavioral cloning","position":{"start":{"line":100,"column":1},"end":{"line":100,"column":1}},"key":"IgtY4jSAwd"}],"identifier":"performance-of-behavioral-cloning","label":"Performance of behavioral cloning","html_id":"performance-of-behavioral-cloning","implicit":true,"enumerator":"7.2.1","key":"nbDHJMRPnF"},{"type":"paragraph","position":{"start":{"line":102,"column":1},"end":{"line":106,"column":1}},"children":[{"type":"text","value":"Can we quantify how 
well this algorithm works?\nFor simplicity, let’s consider the case where the action space is ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"do1BSxYA4w"},{"type":"emphasis","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"finite","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"XX9wZuGIBk"}],"key":"bnvOOAja9V"},{"type":"text","value":" and both the expert policy and learned policy are deterministic.\nSuppose the learned policy obtains ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"VsPWmd023K"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"html":"ε\\varepsilonε","key":"DFxWTIsKb5"},{"type":"text","value":" ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"h7Qp8VcYDK"},{"type":"emphasis","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"classification error","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"BndVVJRd4P"}],"key":"OV4wcHlwpf"},{"type":"text","value":".\nThat is, for trajectories drawn from the expert policy,\nthe learned policy chooses a different action at most ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"PFnJnCczGx"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"html":"ε\\varepsilonε","key":"aZR75xXbm8"},{"type":"text","value":" of the time:","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"tCniDHUuVY"}],"key":"JJ7vBB5YqW"},{"type":"math","value":"\\mathbb{E}_{\\tau \\sim \\rho_{\\pi_{\\text{expert}}}} \\left[ \\frac 1 \\hor \\sum_{\\hi=0}^{\\hor-1} \\ind{ \\widetilde{\\pi}(s_\\hi) \\ne \\pi_{\\text{expert}} (s_\\hi) } \\right] \\le \\varepsilon","html":"Eτρπexpert[1Hh=0H11{π~(sh)πexpert(sh)}]ε\\mathbb{E}_{\\tau \\sim \\rho_{\\pi_{\\text{expert}}}} \\left[ \\frac 1 \\hor \\sum_{\\hi=0}^{\\hor-1} \\ind{ \\widetilde{\\pi}(s_\\hi) \\ne \\pi_{\\text{expert}} (s_\\hi) } \\right] \\le \\varepsilonEτρπexpert[H1h=0H11{π(sh)=πexpert(sh)}]ε","enumerator":"7.5","key":"SSVZTHzWJ5"},{"type":"paragraph","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"children":[{"type":"text","value":"Then, their value functions differ by","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"c7NuoRYp6Y"}],"key":"nyX3GJfHXg"},{"type":"math","value":"| V^{\\pi_{\\text{expert}}} - V^{\\widetilde{\\pi}} | \\le H^2 \\varepsilon","html":"VπexpertVπ~H2ε| V^{\\pi_{\\text{expert}}} - V^{\\widetilde{\\pi}} | \\le H^2 \\varepsilonVπexpertVπH2ε","enumerator":"7.6","key":"AE7d8IDZBu"},{"type":"paragraph","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"qvp9hbBEhi"},{"type":"inlineMath","value":"H","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"html":"HHH","key":"vS2Xo2WnUm"},{"type":"text","value":" is the horizon.","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"Ha4I21UnqZ"}],"key":"XqPr57hxAc"},{"type":"proof","kind":"theorem","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Performance of behavioral 
cloning

Recall that the performance difference lemma (Theorem 1) allows us to express the difference between $\pi_{\text{expert}}$ and $\widetilde{\pi}$ as

$$V_0^{\pi_{\text{expert}}}(s) - V_0^{\widetilde{\pi}}(s) = \mathbb{E}_{\tau \sim \rho^{\pi_{\text{expert}}} \mid s_0 = s} \left[ \sum_{h=0}^{H-1} A_h^{\widetilde{\pi}}(s_h, a_h) \right]. \qquad (7.7)$$

Now since the expert policy is deterministic, we can substitute $a_h = \pi_{\text{expert}}(s_h)$. This allows us to make a further simplification: since $\pi_{\text{expert}}$ is deterministic, the advantage of the chosen action is exactly zero:

$$A^{\pi_{\text{expert}}}(s, \pi_{\text{expert}}(s)) = Q^{\pi_{\text{expert}}}(s, \pi_{\text{expert}}(s)) - V^{\pi_{\text{expert}}}(s) = 0. \qquad (7.8)$$

But the right-hand side of (7.7) uses $A^{\widetilde{\pi}}$, not $A^{\pi_{\text{expert}}}$. To bridge this gap, we now use the assumption that $\widetilde{\pi}$ obtains $\varepsilon$ classification error. Note that $A_h^{\widetilde{\pi}}(s_h, \pi_{\text{expert}}(s_h)) = 0$ when $\pi_{\text{expert}}(s_h) = \widetilde{\pi}(s_h)$. In the case where the two policies differ on $s_h$, which occurs with probability $\varepsilon$, the advantage is naively upper bounded by $H$ (assuming rewards are bounded between 0 and 1). Taking the final sum gives the desired bound.
7.3 Distribution shift

Let us return to the driving analogy. Suppose you have taken some driving lessons and now feel comfortable in your neighbourhood. But today you have to travel to an area you haven't visited before, such as a highway, where it would be dangerous to try and apply the techniques you've already learned. This is the issue of distribution shift: a policy learned under a certain distribution of states may not perform well if this distribution changes.

This is already a common issue in supervised learning, where the training dataset for a model might not resemble the environment where it gets deployed. In interactive environments, this issue is further exacerbated by the dependency between the observations and the agent's behavior; if you take a wrong turn early on, it may be difficult or impossible to recover in that trajectory.

How could you learn a strategy for these new settings? In the driving example, you might decide to install a dashcam to record the car's surroundings. That way, once you make it back to safety, you can show the recording to an expert, who can provide feedback at each step of the way. Then the next time you go for a drive, you can remember the expert's advice, and take a safer route. You could then repeat this training as many times as desired, thereby collecting the expert's feedback over a diverse range of locations. This is the key idea behind dataset aggregation.

7.4 Dataset aggregation (DAgger)
The DAgger algorithm is due to Ross et al. (2010). It assumes that we have query access to the expert policy. That is, for a given state $s$, we can ask for the expert's action $\pi_{\text{expert}}(s)$ in that state. We also need access to the environment for rolling out policies. This makes DAgger an online algorithm, as opposed to pure behavioral cloning, which is offline since we don't need to act in the environment at all.

You can think of DAgger as a specific way of collecting the dataset $\mathcal{D}$.

Algorithm 7.1 (DAgger)

Inputs: $\pi_{\text{expert}}$, an initial policy $\pi_{\text{init}}$, the number of iterations $T$, and the number of trajectories $N$ to collect per iteration.

  1. Initialize $\mathcal{D} = \{\}$ (the empty set) and $\pi = \pi_{\text{init}}$.
  2. For $t = 1, \dots, T$:
       • Collect $N$ trajectories $\tau_1, \dots, \tau_N$ using the current policy $\pi$.
       • For each trajectory $\tau_n$: replace each action $a_h$ in $\tau_n$ with the expert action $\pi_{\text{expert}}(s_h)$, and call the resulting trajectory $\tau_n^{\text{expert}}$.
       • $\mathcal{D} \gets \mathcal{D} \cup \{ \tau_1^{\text{expert}}, \dots, \tau_N^{\text{expert}} \}$.
       • Let $\pi \gets \texttt{fit}(\mathcal{D})$, where $\texttt{fit}$ is a behavioral cloning algorithm.
  3. Return $\pi$.
How well does DAgger perform? We omit a proof here, but under certain assumptions, the DAgger algorithm can better approximate the expert policy:

$$\left| V^{\pi_{\text{expert}}} - V^{\pi_{\text{DAgger}}} \right| \le H \varepsilon \qquad (7.9)$$

where $\varepsilon$ is the "classification error" guaranteed by the supervised learning algorithm.

7.5 Summary

For tasks where it is too difficult or expensive to learn from scratch, we can instead start off with a collection of expert demonstrations. Then we can use supervised learning techniques to find a policy that imitates the expert demonstrations.

The simplest way to do this is to apply a supervised learning algorithm to an already-collected dataset of expert state-action pairs. This is called behavioral cloning. However, given query access to the expert policy, we can do better by integrating its feedback in an online loop. The DAgger algorithm is one way of doing this, where we use the expert policy to augment trajectories and then learn from this augmented dataset using behavioral cloning.

References

  1. Ross, S., Gordon, G. J., & Bagnell, J. (2010). A Reduction of Imitation Learning and Structured Prediction to No-Regret Online Learning. International Conference on Artificial Intelligence and Statistics.

CS/STAT 184: Introduction to Reinforcement Learning

Introduction

Welcome to the study of reinforcement learning! This textbook accompanies the undergraduate course CS 1840/STAT 184 taught at Harvard. It is intended to be a friendly yet rigorous introduction to this active subfield of machine learning.

1 Prerequisites

This book assumes the same prerequisites as the course: You should be familiar with multivariable calculus, linear algebra, and probability. For Harvard undergraduates, this is fulfilled by Math 21a, Math 21b, and Stat 110, or their equivalents. Stat 111 is strongly recommended but not required. Specifically, we will assume that you know the following topics. The italicized terms have brief re-introductions in the text or in the Appendix: Background:

  • Linear Algebra: Vectors and matrices, matrix multiplication, matrix inversion, eigenvalues and eigenvectors.
  • Multivariable Calculus: Partial derivatives, the chain rule, Taylor series, gradients, directional derivatives, Lagrange multipliers.
  • Probability: Random variables, probability distributions, expectation and variance, the law of iterated expectations (Adam’s rule), covariance, conditional probability, Bayes’s rule, and the law of total probability.

You should also be comfortable with programming in Python. See Section 6 for more about this textbook's philosophy regarding programming.

2 Reinforcement learning in a nutshell

Broadly speaking, RL studies sequential decision-making in dynamic environments. An RL algorithm finds a strategy, called a policy, that maximizes the reward it obtains from the environment.

RL provides a powerful framework for attacking a wide variety of problems, including robotic control, video games and board games, resource management, language modelling, and more. It also provides an interdisciplinary paradigm for studying animal and human behavior. Many of the most stunning results in machine learning, ranging from AlphaGo to ChatGPT, are built using RL algorithms.

How does RL compare to the other two core machine learning paradigms, supervised learning and unsupervised learning?

  • Supervised learning (SL) concerns itself with learning a mapping from inputs to outputs. Typically the data takes the form of statistically independent input-output pairs. In RL, however, the data is generated by the agent interacting with the environment, meaning the sequential observations of the state are not independent from each other.

    Conversely, SL is a well-studied field that provides many useful tools for RL.

  • Unsupervised learning concerns itself with learning the structure of data without the use of outside feedback or labels. In RL, though, the agent receives a reward signal from the environment, which can be thought of as a sort of feedback.

    Unsupervised learning is crucial in many real-world applications of RL for dimensionality reduction and other purposes.

3 Core tasks of reinforcement learning

What tasks, exactly, does RL comprise? An RL algorithm must typically solve two main subtasks:

  • Policy evaluation (prediction): How ‘good’ is a specific state, or state-action pair (under a given policy)? That is, how much reward does it lead to in the long run?

  • Policy optimization (control): Suppose we fully understand how the environment behaves. What is the best action to take in every scenario? (A toy code sketch of the first of these two tasks follows this list.)
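As a taste of what policy evaluation looks like in code, here is a small, self-contained sketch: estimating the value of a fixed policy in a toy two-state MDP by averaging total rewards over sampled rollouts. The MDP, the policy, and all the numbers are made up purely for illustration; nothing here comes from the course material.

```python
import random

# A toy MDP: transitions[state][action] = (next_state, reward).
transitions = {
    "A": {"stay": ("A", 0.0), "go": ("B", 1.0)},
    "B": {"stay": ("B", 2.0), "go": ("A", 0.0)},
}

def policy(state):
    """A fixed stochastic policy: mostly 'go' in A, mostly 'stay' in B."""
    if state == "A":
        return "go" if random.random() < 0.9 else "stay"
    return "stay" if random.random() < 0.9 else "go"

def evaluate(policy, start_state, horizon=10, num_rollouts=1000):
    """Monte Carlo policy evaluation: average total reward over rollouts."""
    total = 0.0
    for _ in range(num_rollouts):
        state, ret = start_state, 0.0
        for _ in range(horizon):
            action = policy(state)
            state, reward = transitions[state][action]
            ret += reward
        total += ret
    return total / num_rollouts

print(evaluate(policy, "A"))  # estimated value of state "A" under this policy
```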

4 Course overview

The course will progress through the following units:

1 Markov Decision Processes introduces Markov Decision Processes, the core mathematical framework for describing a large class of interactive environments.

2 Linear Quadratic Regulators is a standalone chapter on the linear quadratic regulator (LQR), an important tool for continuous control, in which the state and action spaces are no longer finite but rather continuous. This has widespread applications in robotics.

3 Multi-Armed Bandits introduces the multi-armed bandit (MAB) model for stateless sequential decision-making tasks. In exploring a number of algorithms, we will see how each of them strikes a different balance between exploring new options and exploiting known options. This exploration-exploitation tradeoff is a core consideration in RL algorithm design.

4 Supervised learning is a standalone crash course on some tools from supervised learning that we will use in later chapters.

5 Fitted Dynamic Programming Algorithms introduces fitted dynamic programming (fitted DP) algorithms for solving MDPs. These algorithms use supervised learning to approximately evaluate policies when they cannot be evaluated exactly.

6 Policy Gradient Methods explores an important class of algorithms based on iteratively improving a policy. We will also encounter the use of deep neural networks to express more complicated policies and approximate complicated functions.

7 Imitation Learning attempts to learn a good policy from expert demonstrations. At its most basic, this is an application of supervised learning to RL tasks.

8 Tree Search Methods looks at ways to explicitly plan ahead when the environment’s dynamics are known. We will study the Monte Carlo Tree Search heuristic, which has been used to great success in the famous AlphaGo algorithm and its successors.

9 Exploration in MDPs continues to investigate the exploration-exploitation tradeoff. We will extend ideas from multi-armed bandits to the MDP setting.

Appendix: Background contains an overview of selected background mathematical content and programming content.

5 Notation

We will use the following notation throughout the book. This notation is inspired by Sutton & Barto (2018) and Agarwal et al. (2022). We use $[N]$ as shorthand for the set $\{ 0, 1, \dots, N-1 \}$.

| Element | Space | Definition (of element) |
| --- | --- | --- |
| $s$ | $\mathcal{S}$ | A state. |
| $a$ | $\mathcal{A}$ | An action. |
| $r$ | | A reward. |
| $\gamma$ | | A discount factor. |
| $\tau$ | $\mathcal{T}$ | A trajectory. |
| $\pi$ | $\Pi$ | A policy. |
| $V^\pi$ | $\mathcal{S} \to \mathbb{R}$ | The value function of policy $\pi$. |
| $Q^\pi$ | $\mathcal{S} \times \mathcal{A} \to \mathbb{R}$ | The action-value function (a.k.a. Q-function) of policy $\pi$. |
| $A^\pi$ | $\mathcal{S} \times \mathcal{A} \to \mathbb{R}$ | The advantage function of policy $\pi$. |
| | $\triangle(\mathcal{X})$ | A distribution supported on $\mathcal{X}$. |
| $h$ | $[H]$ | Time horizon index of an MDP (subscript). |
| $k$ | $[K]$ | Arm index of a multi-armed bandit (superscript). |
| $t$ | $[T]$ | Iteration index of an algorithm (subscript). |
| $\theta$ | $\Theta$ | A set of parameters. |

Note that throughout the text, certain symbols will stand for either random variables or fixed values. We aim to clarify in ambiguous settings. Be warned that

6 Programming

Why include code in a textbook? We believe that implementing an algorithm is a strong test of your understanding of it; mathematical notation can often abstract away details, while a computer must be given every single instruction.
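Only fragments of the section's setup cell survive here: an `import latexify` annotated "print functions as latex" and a call to `plt.style.use("fivethirtyeight")`. A minimal reconstruction that runs on its own might look like the following; the matplotlib import is an added assumption, and the original cell likely contains more than this.

```python
import matplotlib.pyplot as plt  # assumed import; needed for plt.style below

# print functions as latex
import latexify

plt.style.use("fivethirtyeight")
```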

6Programming

Why include code in a textbook? We believe that implementing an algorithm is a strong test of your understanding of it; mathematical notation can often abstract away details, while a computer must be given every single instruction. @@ -92,9 +92,9 @@ # print functions as latex import latexify -plt.style.use("fivethirtyeight")

References
  1. Sutton, R. S., & Barto, A. G. (2018). Reinforcement Learning: An Introduction (Second edition). The MIT Press.
  2. Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). Reinforcement Learning: Theory and Algorithms.
  3. Sussman, G. J., Wisdom, J., & Farr, W. (2013). Functional Differential Geometry. The MIT Press.
","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"RuZUjrN4vV"},{"type":"inlineMath","value":"[N]","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"[N][N][N]","key":"cH7n4TwqsV"},{"type":"text","value":" as shorthand for the set ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"izl0lDnIjd"},{"type":"inlineMath","value":"\\{ 0, 1, \\dots, N-1 \\}","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"{0,1,,N1}\\{ 0, 1, \\dots, N-1 \\}{0,1,,N1}","key":"XWmwQUvbvI"},{"type":"text","value":".","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"k4Gf2QuKJA"}],"key":"BTsC8sj9sA"},{"type":"table","position":{"start":{"line":144,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"tableRow","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Element","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"pdODWy2Hze"}],"key":"ECsXN1H3eD"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Space","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"PpEqn0dhkO"}],"key":"JLARZmR93y"},{"type":"tableCell","header":true,"align":"left","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Definition (of element)","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"pdEkeijIzB"}],"key":"F6iwuiNTzP"}],"key":"GaRS2fqhTA"},{"type":"tableRow","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"inlineMath","value":"s","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"html":"sss","key":"QEydzkP8qb"}],"key":"Dmq7QzVIkU"},{"type":"tableCell","align":"center","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"html":"S\\mathcal{S}S","key":"TyeHcacX1Y"}],"key":"nePQC8VvfI"},{"type":"tableCell","align":"left","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"text","value":"A 
state.","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"key":"BTX6YgkpU6"}],"key":"NlwElIIk8a"}],"key":"el0MNOG4ko"},{"type":"tableRow","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"inlineMath","value":"a","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"html":"aaa","key":"w01awgxYMw"}],"key":"YLXwnL4nvL"},{"type":"tableCell","align":"center","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"html":"A\\mathcal{A}A","key":"wbynRBXDFS"}],"key":"cpg0VYB2i5"},{"type":"tableCell","align":"left","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"An action.","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"dQIRmoM37s"}],"key":"I8WVgcdcGl"}],"key":"kRicxd2yXa"},{"type":"tableRow","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"inlineMath","value":"r","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"html":"rrr","key":"CNVunJFk9j"}],"key":"NQPhHmMy6X"},{"type":"tableCell","align":"center","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[],"key":"TADRCd4Gtl"},{"type":"tableCell","align":"left","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"text","value":"A reward.","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"JA0kFMC0ia"}],"key":"r1V3IRa83A"}],"key":"i3o7MDV8FS"},{"type":"tableRow","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"γ","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"G9OU8IFAnh"}],"key":"sP1TeCvtZw"},{"type":"tableCell","align":"center","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[],"key":"enn892mXzJ"},{"type":"tableCell","align":"left","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"A discount 
factor.","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"ALtkbKC66c"}],"key":"qSwbi1rl0D"}],"key":"tQQzhthnqc"},{"type":"tableRow","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"text","value":"τ","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"key":"OLGqtOjRGv"}],"key":"tQ0ENYW48b"},{"type":"tableCell","align":"center","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{T}","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"html":"T\\mathcal{T}T","key":"Q2R4HP6zJz"}],"key":"eOMP3MH9NP"},{"type":"tableCell","align":"left","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"text","value":"A trajectory.","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"key":"ujjwKmnZ32"}],"key":"etE8EK3DbP"}],"key":"qfdockbjBj"},{"type":"tableRow","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"π","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"UOluUduBW9"}],"key":"LioZGGsFsW"},{"type":"tableCell","align":"center","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"Π","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"TiPf7m3xAQ"}],"key":"N4ifV8yZXi"},{"type":"tableCell","align":"left","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"A policy.","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"gK3igbSoGp"}],"key":"iPdnlHuoNr"}],"key":"ZSYC7WyIQV"},{"type":"tableRow","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"html":"VπV^\\piVπ","key":"CP3cvWSr4W"}],"key":"xrJ4fSArRH"},{"type":"tableCell","align":"center","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"html":"SR\\mathcal{S} \\to \\mathbb{R}SR","key":"eJ85SYTdTt"}],"key":"oTagaTUQMk"},{"type":"tableCell","align":"left","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"text","value":"The value function of policy 
","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"RL4fVpFUtl"},{"type":"text","value":"π","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"hwzRdwTu7m"},{"type":"text","value":".","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"v34wqaDsD1"}],"key":"ZaNzjl7Xxg"}],"key":"m98bnMIFPP"},{"type":"tableRow","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"inlineMath","value":"Q^\\pi","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"QπQ^\\piQπ","key":"LhYexj26Sc"}],"key":"TFNXZCv63v"},{"type":"tableCell","align":"center","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S×AR\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}S×AR","key":"qmRb1XMZTQ"}],"key":"dfintz76jq"},{"type":"tableCell","align":"left","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"text","value":"The action-value function (a.k.a. Q-function) of policy ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"CNfeUIZhUd"},{"type":"text","value":"π","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"zMMPM9rNxR"},{"type":"text","value":".","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"WXT1Ck337n"}],"key":"fFHM0D8yOH"}],"key":"OkfwzXlzSF"},{"type":"tableRow","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"inlineMath","value":"A^\\pi","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"html":"AπA^\\piAπ","key":"IHriSRc4Bo"}],"key":"WhmTtAWXgm"},{"type":"tableCell","align":"center","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"html":"S×AR\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}S×AR","key":"lyEEVYxPmH"}],"key":"OdPzwgiPvM"},{"type":"tableCell","align":"left","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"text","value":"The advantage function of policy 
","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"kydcbN84gI"},{"type":"text","value":"π","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"YTNfxp6RWz"},{"type":"text","value":".","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"x6dqMbzNtE"}],"key":"HSDV3bzyYs"}],"key":"DpgEdZXGXC"},{"type":"tableRow","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[],"key":"MSD4KAuthE"},{"type":"tableCell","align":"center","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"inlineMath","value":"\\triangle(\\mathcal{X})","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"(X)\\triangle(\\mathcal{X})(X)","key":"c4S3A7lLCW"}],"key":"aqihqGCFlY"},{"type":"tableCell","align":"left","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"text","value":"A distribution supported on ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"nphoK45B4W"},{"type":"inlineMath","value":"\\mathcal{X}","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"X\\mathcal{X}X","key":"HMYDjgYA5Z"},{"type":"text","value":".","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"LUcbJu3x19"}],"key":"w2wzv3waGb"}],"key":"jVRq4eNE3E"},{"type":"tableRow","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"inlineMath","value":"\\hi","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"html":"h\\hih","key":"rtoKnClvUt"}],"key":"RUnCHp7fkO"},{"type":"tableCell","align":"center","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"inlineMath","value":"[\\hor]","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"html":"[H][\\hor][H]","key":"MZgZWlUo8G"}],"key":"N3czgzItzj"},{"type":"tableCell","align":"left","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"text","value":"Time horizon index of an MDP (subscript).","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"key":"E4yuowaWdL"}],"key":"rAbvnmcfb2"}],"key":"AuEIzr4uIo"},{"type":"tableRow","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"inlineMath","value":"k","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"html":"kkk","key":"KIA4igHtkH"}],"key":"WYpGa5PFR5"},{"type":"tableCell","align":"center","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"inlineMath","value":"[K]","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"html":"[K][K][K]","key":"ugL7F5WPyp"}],"key":"xrAzoz75r8"},{"type":"tableCell","align":"left","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"text","value":"Arm index of a multi-armed bandit 
(superscript).","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"key":"Vwsu2uUW15"}],"key":"WOTYLXiqSh"}],"key":"SFZxWbKcp2"},{"type":"tableRow","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"inlineMath","value":"t","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"html":"ttt","key":"rNuHdNKc5Q"}],"key":"uGQEzlz4LQ"},{"type":"tableCell","align":"center","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"inlineMath","value":"[T]","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"html":"[T][T][T]","key":"d1TS7wczSf"}],"key":"B0WP09C4Ll"},{"type":"tableCell","align":"left","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"text","value":"Iteration index of an algorithm (subscript).","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"key":"twhCZuZZBG"}],"key":"iuMPlBX2SH"}],"key":"hOThgLpiSE"},{"type":"tableRow","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"θ","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"CiF0FTQbwo"}],"key":"UnRjP1iUYn"},{"type":"tableCell","align":"center","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"Θ","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"sc8UQWYUSa"}],"key":"MueJpYAa4s"},{"type":"tableCell","align":"left","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"A set of parameters.","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"UYlO6Wg6sg"}],"key":"wfI2gat0d2"}],"key":"zyBZmjsaRF"}],"key":"PUlxxyxGEE"},{"type":"paragraph","position":{"start":{"line":161,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"Note that throughout the text, certain symbols will stand for either random variables or fixed values.\nWe aim to clarify in ambiguous settings.\nBe warned that","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"azWQPDDSPk"}],"key":"mEJbFq29t9"}],"key":"xAC2mzqycs"},{"type":"block","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"children":[{"type":"text","value":"Programming","position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"key":"MFX4otGzx2"}],"label":"programming","identifier":"programming","html_id":"programming","enumerator":"6","key":"E2mtA8gAj5"},{"type":"paragraph","position":{"start":{"line":170,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"Why include code in a textbook?\nWe believe that implementing an algorithm is a strong test of your understanding of it;\nmathematical notation can often abstract away details,\nwhile a computer must be given every single instruction.\nWe have sought to write readable Python code that is self-contained within each file.\nThis approach is inspired by 
","position":{"start":{"line":170,"column":1},"end":{"line":170,"column":1}},"key":"UcTGTNFAmv"},{"type":"cite","kind":"narrative","label":"sussman_functional_2013","identifier":"sussman_functional_2013","children":[{"type":"text","value":"Sussman ","key":"ZyRxy8tJsB"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"HtxR0blNIv"}],"key":"gBNGGJrVUn"},{"type":"text","value":" (2013)","key":"BOXaPfprgS"}],"enumerator":"3","key":"Isw087zmVB"},{"type":"text","value":".\nThere are some ways in which the code style differs from typical software projects:","position":{"start":{"line":170,"column":1},"end":{"line":170,"column":1}},"key":"IK07cbsPOY"}],"key":"a0sX6u2r2G"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":178,"column":1},"end":{"line":182,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":178,"column":1},"end":{"line":179,"column":1}},"children":[{"type":"text","value":"We keep use of language features to a minimum,\neven if it leads to code that could otherwise be more concisely or idiomatically expressed.","position":{"start":{"line":178,"column":1},"end":{"line":178,"column":1}},"key":"Fv1o8BAzlA"}],"key":"bpoQmrNIyN"},{"type":"listItem","spread":true,"position":{"start":{"line":180,"column":1},"end":{"line":182,"column":1}},"children":[{"type":"text","value":"The variable names used in the code match those used in the main text.\nFor example, the variable ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"jqWMRfkYj3"},{"type":"inlineCode","value":"s","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"Aw4MM0yRCM"},{"type":"text","value":" will be used instead of the more explicit ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"Hu3FdOmVJb"},{"type":"inlineCode","value":"state","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"K0j3S1KWeX"},{"type":"text","value":".","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"dkVLkAohKD"}],"key":"ykn8XL20xq"}],"key":"Tap98fMntg"},{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"We also make extensive use of Python ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"YNjUNAPc7E"},{"type":"emphasis","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"type annotations","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"evYQIn1OX6"}],"key":"g68uf109FT"},{"type":"text","value":" to explicitly specify variable types, including shapes of vectors and matrices using the ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"efCfK5ufsd"},{"type":"link","url":"https://github.com/patrick-kidger/jaxtyping","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"jaxtyping","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"eXs5j7Jw6P"}],"urlSource":"https://github.com/patrick-kidger/jaxtyping","error":true,"key":"XvL3Gld8mX"},{"type":"text","value":" 
library.","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"ZRFdC5sXnf"}],"key":"SJyY3TtRrb"},{"type":"paragraph","position":{"start":{"line":185,"column":1},"end":{"line":190,"column":1}},"children":[{"type":"text","value":"This is an interactive book built with ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"pEoZNTTH51"},{"type":"link","url":"https://jupyterbook.org/en/stable/intro.html","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"Jupyter Book","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"tul3RhP9SJ"}],"urlSource":"https://jupyterbook.org/en/stable/intro.html","key":"NYfae6xRUi"},{"type":"text","value":".\nIt uses ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"TfXJ0SLDDb"},{"type":"link","url":"https://docs.python.org/3.11/contents.html","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"Python 3.11","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"hOk5PyFJyM"}],"urlSource":"https://docs.python.org/3.11/contents.html","key":"mQsvxMfH6c"},{"type":"text","value":".\nIt uses the ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"QTLDA2Yind"},{"type":"link","url":"https://jax.readthedocs.io/en/latest/index.html","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"JAX","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"eyH0NgddU2"}],"urlSource":"https://jax.readthedocs.io/en/latest/index.html","key":"jvwYMclqR5"},{"type":"text","value":" library for numerical computing.\nJAX was chosen for the clarity of its functional style and due to its mature RL ecosystem,\nsustained in large part by the Google DeepMind research group and a large body of open-source contributors.\nWe use the standard ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"tOnlXbqump"},{"type":"link","url":"https://gymnasium.farama.org/","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"Gymnasium","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"kigOOppYWT"}],"urlSource":"https://gymnasium.farama.org/","key":"xzD4CzpxwD"},{"type":"text","value":" library for interfacing with RL environments.","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"WHyRITRcht"}],"key":"h3ex4XxV5n"},{"type":"paragraph","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"The following names are exported from the ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"SlKnPDkpmZ"},{"type":"inlineCode","value":"utils","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"eFsosDWUev"},{"type":"text","value":" module:","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"Wr0PaiNoCG"}],"key":"Jq2scvz7cC"},{"type":"code","lang":"python","value":"import matplotlib.pyplot as plt\n\n# convenient class builder\nfrom typing import NamedTuple\n\n# function typings\nfrom collections.abc import Callable\n\n# array typings\nfrom jaxtyping import Float, Array\n\n# convenient function composition\nfrom functools import partial\n\n# numerical computing and 
linear algebra\nimport jax\nimport jax.numpy as jnp\n\n# print functions as latex\nimport latexify\n\nplt.style.use(\"fivethirtyeight\")","position":{"start":{"line":194,"column":1},"end":{"line":217,"column":1}},"key":"kjqBKIYtfL"}],"key":"TxNpnPxA1V"}],"key":"t6kBEsBLDo"},"references":{"cite":{"order":["sutton_reinforcement_2018","agarwal_reinforcement_2022","sussman_functional_2013"],"data":{"sutton_reinforcement_2018":{"label":"sutton_reinforcement_2018","enumerator":"1","html":"Sutton, R. S., & Barto, A. G. (2018). Reinforcement Learning: An Introduction (Second edition). The MIT Press."},"agarwal_reinforcement_2022":{"label":"agarwal_reinforcement_2022","enumerator":"2","html":"Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). Reinforcement Learning: Theory and Algorithms."},"sussman_functional_2013":{"label":"sussman_functional_2013","enumerator":"3","html":"Sussman, G. J., Wisdom, J., & Farr, W. (2013). Functional Differential Geometry. The MIT Press."}}}},"footer":{"navigation":{"next":{"title":"1 Markov Decision Processes","url":"/mdps","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"2cdeee9bc604ea0150aa2ba9d0d7b73c09784f007761496df1c2715f83d28614","slug":"index","location":"/index.md","dependencies":[],"frontmatter":{"title":"Introduction","kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","numbering":{"all":{"enabled":true}},"math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"index.md","url":"/build/index-b84d1d5a6390c0b2f1723ee4aeac02d1.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":16,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Welcome to the study of reinforcement learning!\nThis textbook accompanies the undergraduate course ","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"key":"HbotXNG5JI"},{"type":"link","url":"http://lucasjanson.fas.harvard.edu/courses/CS_Stat_184_0.html","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"children":[{"type":"text","value":"CS 1840/STAT 184","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"key":"mvcDZ1c4Ib"}],"urlSource":"http://lucasjanson.fas.harvard.edu/courses/CS_Stat_184_0.html","key":"ySaOySh5vX"},{"type":"text","value":" taught at Harvard.\nIt is intended to be a friendly yet rigorous introduction to this active subfield of machine 
learning.","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"key":"jKGWcLRKx6"}],"key":"ef18hAHgRM"}],"key":"OW3ekbaDkT"},{"type":"block","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"Prerequisites","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"tSqVoML0fE"}],"identifier":"prerequisites","label":"Prerequisites","html_id":"prerequisites","implicit":true,"enumerator":"1","key":"WWNAMElyg4"},{"type":"paragraph","position":{"start":{"line":24,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"This book assumes the same prerequisites as the course: You should be familiar with multivariable calculus, linear algebra, and probability.\nFor Harvard undergraduates, this is fulfilled by Math 21a, Math 21b, and Stat 110, or their equivalents.\nStat 111 is strongly recommended but not required.\nSpecifically, we will assume that you know the following topics. The ","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"sbJOVt1dU8"},{"type":"emphasis","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"italicized terms","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"x1uHvhsJth"}],"key":"C2mIw0D689"},{"type":"text","value":" have brief re-introductions in the text or in the ","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"dsQw4GvsVr"},{"type":"link","url":"/background","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"Appendix: Background","key":"R8F3rsQCiR"}],"urlSource":"./background.md","dataUrl":"/background.json","internal":true,"protocol":"file","key":"YIwyFZC1Zm"},{"type":"text","value":":","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"SdZwwMeG3F"}],"key":"fPTX4sxz9V"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":29,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":29,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"strong","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Linear Algebra:","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"rltH968TIx"}],"key":"foDM79NVka"},{"type":"text","value":" Vectors and matrices, matrix multiplication, matrix\ninversion, eigenvalues and eigenvectors.","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"OonfUZ5mgc"}],"key":"FBY9DyPRU4"},{"type":"listItem","spread":true,"position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"strong","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"Multivariable Calculus:","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"HF5PzjKZO0"}],"key":"DkKdJ9BBJR"},{"type":"text","value":" Partial derivatives, the chain rule, Taylor series, ","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"rIEeoxBa25"},{"type":"emphasis","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"gradients, directional derivatives, 
Lagrange multipliers.","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"wHBlxWuKE0"}],"key":"nhIt4XRNMi"}],"key":"PglYXOLWWN"},{"type":"listItem","spread":true,"position":{"start":{"line":32,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"strong","position":{"start":{"line":32,"column":1},"end":{"line":32,"column":1}},"children":[{"type":"text","value":"Probability:","position":{"start":{"line":32,"column":1},"end":{"line":32,"column":1}},"key":"leR4oSwAfW"}],"key":"nSfAYxBbGb"},{"type":"text","value":" Random variables, probability distributions,\nexpectation and variance, the law of iterated expectations (Adam’s rule), covariance, conditional probability, Bayes’s rule, and the law of total probability.","position":{"start":{"line":32,"column":1},"end":{"line":32,"column":1}},"key":"oDRKLWkSfy"}],"key":"Z9q6lvKnAy"}],"key":"fQYszkqSyS"},{"type":"paragraph","position":{"start":{"line":35,"column":1},"end":{"line":36,"column":1}},"children":[{"type":"text","value":"You should also be comfortable with programming in Python.\nSee ","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"HXgOjUIk0P"},{"type":"crossReference","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"text","value":"Section ","key":"ZVObZ4rFhC"},{"type":"text","value":"6","key":"G79uGcAEz5"}],"identifier":"programming","label":"programming","kind":"heading","template":"Section %s","enumerator":"6","resolved":true,"html_id":"programming","key":"LbF033iDji"},{"type":"text","value":" for more about this textbook’s philosophy regarding programming.","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"EucHy6CJYH"}],"key":"oGadAa1qvi"}],"key":"d89NdZIULy"},{"type":"block","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"Reinforcement learning in a nutshell","position":{"start":{"line":40,"column":1},"end":{"line":40,"column":1}},"key":"WzsXZ4ouyf"}],"identifier":"reinforcement-learning-in-a-nutshell","label":"Reinforcement learning in a nutshell","html_id":"reinforcement-learning-in-a-nutshell","implicit":true,"enumerator":"2","key":"MqNE9OVwlf"},{"type":"paragraph","position":{"start":{"line":42,"column":1},"end":{"line":44,"column":1}},"children":[{"type":"text","value":"Broadly speaking,\nRL studies ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"KMj7xvm1el"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"sequential decision-making","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"kOJ8sc93Nz"}],"key":"oYSHvTNiA2"},{"type":"text","value":" in ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"Ndp9P0rqUG"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"dynamic environments.","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"Hab3zcVHYL"}],"key":"FrHaq6WbMr"},{"type":"text","value":"\nAn RL algorithm finds a strategy, called a 
","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"vOgjdIwUy4"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"policy,","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"VKb2Vz8wnJ"}],"key":"iyU5T0H9a8"},{"type":"text","value":" that maximizes the ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"UoOESvBWGh"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"reward","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"tbA50KopS3"}],"key":"tvhOJjYmVK"},{"type":"text","value":" it obtains from the environment.","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"Ue2WnirnBV"}],"key":"hHCMDH3ljk"},{"type":"paragraph","position":{"start":{"line":46,"column":1},"end":{"line":49,"column":1}},"children":[{"type":"text","value":"RL provides a powerful framework for attacking a wide variety of problems,\nincluding robotic control, video games and board games, resource management, language modelling, and more.\nIt also provides an interdisciplinary paradigm for studying animal and human behavior.\nMany of the most stunning results in machine learning, ranging from AlphaGo to ChatGPT, are built using RL algorithms.","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"key":"YrB4FzD7HO"}],"key":"JrR7J653ch"}],"key":"UfkTVwkSSx"},{"type":"block","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":53,"column":1},"end":{"line":54,"column":1}},"children":[{"type":"text","value":"How does RL compare to the other two core machine learning paradigms,\n","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"xjqsgb3EmE"},{"type":"strong","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"lQNgwCRpm7"}],"key":"DYHRNxGL4d"},{"type":"text","value":" and ","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"Z2AIiIq2gv"},{"type":"strong","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"children":[{"type":"text","value":"unsupervised learning?","position":{"start":{"line":53,"column":1},"end":{"line":53,"column":1}},"key":"znW0QcmzT3"}],"key":"b41QnbmXPf"}],"key":"AwNTDVE0Pk"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":56,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":56,"column":1},"end":{"line":62,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":56,"column":1},"end":{"line":59,"column":1}},"children":[{"type":"strong","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"Supervised learning","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"mJ4yGWFtEn"}],"key":"ZceCPWmX6X"},{"type":"text","value":" (SL) concerns itself with learning a mapping from inputs to outputs.\nTypically the data takes the form of 
","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"VXwHz8gRBU"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"statistically independent","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"kTZeTAxUvH"}],"key":"RoQkng57Sw"},{"type":"text","value":" input-output pairs.\nIn RL, however, the data is generated by the agent interacting with the environment,\nmeaning the sequential observations of the state are ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"xbbkaz6sBa"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"not independent","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"mCzpwaxcDf"}],"key":"bZWYnLVDRF"},{"type":"text","value":" from each other.","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"xkNm7HcHar"}],"key":"sszcE9zMzL"},{"type":"paragraph","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"children":[{"type":"text","value":"Conversely, SL is a well-studied field that provides many useful tools for RL.","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"kml8BWBUno"}],"key":"x50WqdC7jW"}],"key":"tCbsnOsrRG"},{"type":"listItem","spread":true,"position":{"start":{"line":63,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":63,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"strong","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"Unsupervised learning","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"XzZdbmz8zB"}],"key":"okUTdRCNLL"},{"type":"text","value":" concerns itself with learning the ","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"AEB8M4iLis"},{"type":"emphasis","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"structure","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"GZpIss5Gmu"}],"key":"A5GGrHt40n"},{"type":"text","value":" of data without the use of outside feedback or labels.\nIn RL, though, the agent receives a ","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"ucNBbQOETx"},{"type":"strong","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"reward signal","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"cbYC7Rmpxn"}],"key":"XQWIez9tro"},{"type":"text","value":" from the environment,\nwhich can be thought of as a sort of feedback.","position":{"start":{"line":63,"column":1},"end":{"line":63,"column":1}},"key":"zbZYVcjq7g"}],"key":"JOdT14ZAgO"},{"type":"paragraph","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"children":[{"type":"text","value":"Unsupervised learning is crucial in many real-world applications of RL for dimensionality reduction and other 
purposes.","position":{"start":{"line":67,"column":1},"end":{"line":67,"column":1}},"key":"WBPEy9uInD"}],"key":"F9ROBsi4dq"}],"key":"AUAYBMbkav"}],"key":"zuN3XNJ5Bi"}],"key":"uijqZukjvw"},{"type":"block","position":{"start":{"line":69,"column":1},"end":{"line":69,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"children":[{"type":"text","value":"Core tasks of reinforcement learning","position":{"start":{"line":71,"column":1},"end":{"line":71,"column":1}},"key":"BB94KXf9mx"}],"identifier":"core-tasks-of-reinforcement-learning","label":"Core tasks of reinforcement learning","html_id":"core-tasks-of-reinforcement-learning","implicit":true,"enumerator":"3","key":"KU8Mcwxqwz"},{"type":"paragraph","position":{"start":{"line":73,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"text","value":"What tasks, exactly, does RL comprise?\nAn RL algorithm must typically solve two main subtasks:","position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"key":"YlRESIy657"}],"key":"wpZetFHtUE"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":76,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":76,"column":1},"end":{"line":79,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":76,"column":1},"end":{"line":78,"column":1}},"children":[{"type":"strong","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"Policy evaluation (prediction):","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"WgDaVYvzsW"}],"key":"gIusu0DKGJ"},{"type":"text","value":"\nHow ‘good’ is a specific state, or state-action pair (under a given policy)?\nThat is, how much reward does it lead to in the long run?","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"EUM0roIVSx"}],"key":"AtoZPsE2GO"}],"key":"sZpriUCIKY"},{"type":"listItem","spread":true,"position":{"start":{"line":80,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":80,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"strong","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"Policy optimization (control):","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"YDK4A0RGgm"}],"key":"uFoQgtymUj"},{"type":"text","value":"\nSuppose we fully understand how the environment behaves.\nWhat is the best action to take in every scenario?","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"bTF66Jh7Fl"}],"key":"GLttTVwe4E"}],"key":"RmsYb4sM2S"}],"key":"A2Uwm5MPwn"},{"type":"comment","value":" **Recursion (bootstrapping):** How can we \"reuse\" our current predictions to generate new information? ","key":"BwElze7157"},{"type":"comment","value":" **Exploration-exploitation tradeoff:** Should we try new actions, or capitalize on actions that we currently believe to be good? 
","key":"cSRAkvx9aV"}],"key":"hFOkBTc6q0"},{"type":"block","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"Course overview","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"bRiBGbAnFH"}],"identifier":"course-overview","label":"Course overview","html_id":"course-overview","implicit":true,"enumerator":"4","key":"O7nUofQZib"},{"type":"paragraph","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"The course will progress through the following units:","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"NyJ5In5sV3"}],"key":"i8M7Ydm1Ma"},{"type":"paragraph","position":{"start":{"line":94,"column":1},"end":{"line":95,"column":1}},"children":[{"type":"link","url":"/mdps","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"1 Markov Decision Processes","key":"SW89NVeBiW"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"h5sGZlfFzq"},{"type":"text","value":" introduces ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"HbhxroTx0d"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"Markov Decision Processes,","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"dsaxuNUjLB"}],"key":"v2n2n1XelT"},{"type":"text","value":"\nthe core mathematical framework for describing a large class of interactive environments.","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"rvSvXGPSBP"}],"key":"UL6alhhPPr"},{"type":"paragraph","position":{"start":{"line":97,"column":1},"end":{"line":100,"column":1}},"children":[{"type":"link","url":"/control","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"2 Linear Quadratic Regulators","key":"VMcOd2my75"}],"urlSource":"./control.md","dataUrl":"/control.json","internal":true,"protocol":"file","key":"Z8oVG0JIz3"},{"type":"text","value":" is a standalone chapter on the ","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"TzcgqMwPIA"},{"type":"strong","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"linear quadratic regulator","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"CSanAwt6Qm"}],"key":"xqkx2VCend"},{"type":"text","value":" (LQR),\nan important tool for ","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"AF0x5N2hgf"},{"type":"emphasis","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"continuous control","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"hoJQ3IymBJ"}],"key":"sHilaKnRlC"},{"type":"text","value":",\nin which the state and action spaces are no longer ","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"pCuFA55tK0"},{"type":"emphasis","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"finite","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"bL5xHXMsRm"}],"key":"Y1fg8dms1l"},{"type":"text","value":" but rather 
","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"GEg0wtuAgT"},{"type":"emphasis","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"continuous","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"EORfRcRgyF"}],"key":"RFML3XYLnY"},{"type":"text","value":".\nThis has widespread applications in robotics.","position":{"start":{"line":97,"column":1},"end":{"line":97,"column":1}},"key":"RLyhbNjr34"}],"key":"r9LDRc7MCi"},{"type":"paragraph","position":{"start":{"line":102,"column":1},"end":{"line":105,"column":1}},"children":[{"type":"link","url":"/bandits","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"3 Multi-Armed Bandits","key":"Y8nK2fdtpz"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"ufEUPhxa6h"},{"type":"text","value":" introduces the ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"BMxZsI2l5M"},{"type":"strong","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"multi-armed bandit","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"NhZ93F96Cf"}],"key":"YXfln6SRAW"},{"type":"text","value":" (MAB) model for ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"MPj6x07rUm"},{"type":"emphasis","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"stateless","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"T9eWuGB6Yz"}],"key":"NTa5PXiM2b"},{"type":"text","value":" sequential decision-making tasks.\nIn exploring a number of algorithms,\nwe will see how each of them strikes a different balance between ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"MOfibLGa6n"},{"type":"emphasis","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"exploring","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"zFbLhBDCqz"}],"key":"BWeDZS4C82"},{"type":"text","value":" new options and ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"W8v27oYOWq"},{"type":"emphasis","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"exploiting","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"RkJ1T8XbgV"}],"key":"MyGOH90hgh"},{"type":"text","value":" known options.\nThis ","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"SKz34R021C"},{"type":"strong","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"children":[{"type":"text","value":"exploration-exploitation tradeoff","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"mhl2vEb9UD"}],"key":"Ct3omAWH7N"},{"type":"text","value":" is a core consideration in RL algorithm design.","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"bD2LUtdsvU"}],"key":"W0j5lfxeKD"},{"type":"paragraph","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"link","url":"/supervised-learning","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"text","value":"4 Supervised 
learning","key":"v4paZVGmA3"}],"urlSource":"./supervised_learning.md","dataUrl":"/supervised-learning.json","internal":true,"protocol":"file","key":"Zap401rHAH"},{"type":"text","value":" is a standalone crash course on some tools from supervised learning that we will use in later chapters.","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"kAQjRqD05P"}],"key":"vSjPVwPIRS"},{"type":"paragraph","position":{"start":{"line":109,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"link","url":"/fitted-dp","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"5 Fitted Dynamic Programming Algorithms","key":"ZYJA7UIGW1"}],"urlSource":"./fitted_dp.md","dataUrl":"/fitted-dp.json","internal":true,"protocol":"file","key":"JthPKex1jn"},{"type":"text","value":" introduces ","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"ebVyQ9BFa6"},{"type":"strong","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"fitted dynamic programming","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"pnfdqViY1Q"}],"key":"zTg0o88DLo"},{"type":"text","value":" (fitted DP) algorithms for solving MDPs.\nThese algorithms use supervised learning to approximately evaluate policies when they cannot be evaluated exactly.","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"dTP87oNWBJ"}],"key":"RugASJTVa4"},{"type":"paragraph","position":{"start":{"line":112,"column":1},"end":{"line":113,"column":1}},"children":[{"type":"link","url":"/pg","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"children":[{"type":"text","value":"6 Policy Gradient Methods","key":"VLp0vOnZtR"}],"urlSource":"./pg.md","dataUrl":"/pg.json","internal":true,"protocol":"file","key":"ubuq78JZNF"},{"type":"text","value":" explores an important class of algorithms based on iteratively improving a policy.\nWe will also encounter the use of ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"QeUxAbxw7r"},{"type":"emphasis","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"children":[{"type":"text","value":"deep neural networks","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"Zlvl0yZDF6"}],"key":"zBUbIyAwyE"},{"type":"text","value":" to express more complicated policies and approximate complicated functions.","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"Y1FyK9F6J3"}],"key":"VHLf1icYh5"},{"type":"paragraph","position":{"start":{"line":115,"column":1},"end":{"line":116,"column":1}},"children":[{"type":"link","url":"/imitation-learning","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"7 Imitation Learning","key":"EJWwJ3eJmC"}],"urlSource":"./imitation_learning.md","dataUrl":"/imitation-learning.json","internal":true,"protocol":"file","key":"jLqUFNDsoA"},{"type":"text","value":" attempts to learn a good policy from expert demonstrations.\nAt its most basic, this is an application of supervised learning to RL 
tasks.","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"KmsEUMORnK"}],"key":"gafhDMizMm"},{"type":"paragraph","position":{"start":{"line":118,"column":1},"end":{"line":120,"column":1}},"children":[{"type":"link","url":"/planning","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"8 Tree Search Methods","key":"dZ3CIz8zVH"}],"urlSource":"./planning.md","dataUrl":"/planning.json","internal":true,"protocol":"file","key":"UFsAXle3QK"},{"type":"text","value":" looks at ways to ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"Re2OZUWQFl"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"explicitly","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"IYJ20vwg9X"}],"key":"GgJvGXIG6w"},{"type":"text","value":" plan ahead when the environment’s dynamics are known.\nWe will study the ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"Y5S1uGOBQm"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"gTpk2i33gd"}],"key":"xKmfVhUy2N"},{"type":"text","value":" heuristic,\nwhich has been used to great success in the famous AlphaGo algorithm and its successors.","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"eoaAypaebw"}],"key":"PaXixMYHIr"},{"type":"paragraph","position":{"start":{"line":122,"column":1},"end":{"line":123,"column":1}},"children":[{"type":"link","url":"/exploration","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"children":[{"type":"text","value":"9 Exploration in MDPs","key":"V0bSrQKByx"}],"urlSource":"./exploration.md","dataUrl":"/exploration.json","internal":true,"protocol":"file","key":"TC4Ya7mYci"},{"type":"text","value":" continues to investigate the exploration-exploitation tradeoff.\nWe will extend ideas from multi-armed bandits to the MDP setting.","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"gVktCK6qIT"}],"key":"D6B44b5L2e"},{"type":"paragraph","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"link","url":"/background","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"text","value":"Appendix: Background","key":"xT2Co8hdfk"}],"urlSource":"./background.md","dataUrl":"/background.json","internal":true,"protocol":"file","key":"Cc6mDOkvAx"},{"type":"text","value":" contains an overview of selected background mathematical content and programming content.","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"key":"gKrKEXh8J9"}],"key":"VW9Ah8blLM"},{"type":"comment","value":" \n| Chapter | States | Actions | Rewards (or costs) |\n|:-------:|:------:|:-------:|:-------:|\n| [](#bandits) | N/A | Finite | Stochastic |\n| [](#mdps) | Finite | Finite | Deterministic |\n| [](#fitted_dp) | Large or continuous | Finite | Deterministic |\n| [](#lqr) | Continuous | Continuous | Deterministic 
|\n","key":"iyh8AQmgnC"}],"key":"rLPqvaTLfM"},{"type":"block","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"children":[{"type":"text","value":"Notation","position":{"start":{"line":138,"column":1},"end":{"line":138,"column":1}},"key":"gXqLPQp5OP"}],"identifier":"notation","label":"Notation","html_id":"notation","implicit":true,"enumerator":"5","key":"zpl02dHuqb"},{"type":"paragraph","position":{"start":{"line":140,"column":1},"end":{"line":142,"column":1}},"children":[{"type":"text","value":"We will use the following notation throughout the book.\nThis notation is inspired by ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"lhL7t220vf"},{"type":"cite","kind":"narrative","label":"sutton_reinforcement_2018","identifier":"sutton_reinforcement_2018","children":[{"type":"text","value":"Sutton & Barto (2018)","key":"FJ5qiBMGtY"}],"enumerator":"1","key":"fF8T9sgcBH"},{"type":"text","value":" and ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"bAG8pL5n6d"},{"type":"cite","kind":"narrative","label":"agarwal_reinforcement_2022","identifier":"agarwal_reinforcement_2022","children":[{"type":"text","value":"Agarwal ","key":"m557pLl5q4"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"DynjeVGV76"}],"key":"i6BAR5XFtK"},{"type":"text","value":" (2022)","key":"p2SC1gcXxt"}],"enumerator":"2","key":"fCXggPT3Hd"},{"type":"text","value":".\nWe use ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"OSGMJtySHw"},{"type":"inlineMath","value":"[N]","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"[N][N][N]","key":"xPmccrmefJ"},{"type":"text","value":" as shorthand for the set ","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"A6rIiIZfcM"},{"type":"inlineMath","value":"\\{ 0, 1, \\dots, N-1 \\}","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"html":"{0,1,,N1}\\{ 0, 1, \\dots, N-1 \\}{0,1,,N1}","key":"NvYdO9bFki"},{"type":"text","value":".","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"RKSDxp6XwM"}],"key":"gPaXy7Ov0o"},{"type":"table","position":{"start":{"line":144,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"tableRow","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Element","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"ahsB9O0P0m"}],"key":"UnuGySMzxj"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Space","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"aBsWfMtZQ9"}],"key":"pewsKJt7x4"},{"type":"tableCell","header":true,"align":"left","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"children":[{"type":"text","value":"Definition (of 
element)","position":{"start":{"line":144,"column":1},"end":{"line":144,"column":1}},"key":"mpD0vp8QIU"}],"key":"TwWlhw8hRI"}],"key":"JeLOmslncF"},{"type":"tableRow","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"inlineMath","value":"s","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"html":"sss","key":"doI0UBVif9"}],"key":"FjKtPLOAZh"},{"type":"tableCell","align":"center","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"html":"S\\mathcal{S}S","key":"UZFs4OPv0h"}],"key":"KC47Sxqs39"},{"type":"tableCell","align":"left","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"text","value":"A state.","position":{"start":{"line":146,"column":1},"end":{"line":146,"column":1}},"key":"l0wBFNgpyH"}],"key":"JY8rvyz2Hx"}],"key":"IQRuSFqGZY"},{"type":"tableRow","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"inlineMath","value":"a","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"html":"aaa","key":"Hmhb1rO2eM"}],"key":"mWGAxaACJs"},{"type":"tableCell","align":"center","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"html":"A\\mathcal{A}A","key":"i0YbhTalN5"}],"key":"BBXeMTV4CO"},{"type":"tableCell","align":"left","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"An action.","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"ac8jbcileg"}],"key":"dnBZrze3hD"}],"key":"x2crFTkDBZ"},{"type":"tableRow","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"inlineMath","value":"r","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"html":"rrr","key":"OkbccWvs4A"}],"key":"sAXQVOjfgG"},{"type":"tableCell","align":"center","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[],"key":"CKYFy0mKwL"},{"type":"tableCell","align":"left","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"text","value":"A 
reward.","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"uAcszeByiz"}],"key":"Q8Ybstqbup"}],"key":"XTZGmOiZxM"},{"type":"tableRow","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"γ","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"XhHqUv2zvA"}],"key":"KLxLEiex75"},{"type":"tableCell","align":"center","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[],"key":"xKnYogDFac"},{"type":"tableCell","align":"left","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"A discount factor.","position":{"start":{"line":149,"column":1},"end":{"line":149,"column":1}},"key":"yirjp3Nhs0"}],"key":"nttvVSpgfz"}],"key":"pRt8JFmAu5"},{"type":"tableRow","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"text","value":"τ","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"key":"Wq1wK8xCEn"}],"key":"WS7OlQmgz7"},{"type":"tableCell","align":"center","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{T}","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"html":"T\\mathcal{T}T","key":"YPQZaY3mJ6"}],"key":"V2zPYBnI52"},{"type":"tableCell","align":"left","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"text","value":"A trajectory.","position":{"start":{"line":150,"column":1},"end":{"line":150,"column":1}},"key":"vjvl8luru0"}],"key":"iJZIwxuXm2"}],"key":"VwSAQeaURa"},{"type":"tableRow","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"π","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"eyPhMSaQRo"}],"key":"AwpppjNNVI"},{"type":"tableCell","align":"center","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"Π","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"xMr8Y0vZ5n"}],"key":"NpjmQqUag1"},{"type":"tableCell","align":"left","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"children":[{"type":"text","value":"A policy.","position":{"start":{"line":151,"column":1},"end":{"line":151,"column":1}},"key":"GQtCI7JxHp"}],"key":"lc61NPS474"}],"key":"WChyQ4jeP1"},{"type":"tableRow","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"html":"VπV^\\piVπ","key":"iCMLQw44no"}],"key":"SVWv8q6BSN"},{"type":"tableCell","align":"center","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S} \\to 
\\mathbb{R}","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"html":"SR\\mathcal{S} \\to \\mathbb{R}SR","key":"TyGuibMr3A"}],"key":"aRupo1QB2I"},{"type":"tableCell","align":"left","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"text","value":"The value function of policy ","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"WbNGLIYpyI"},{"type":"text","value":"π","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"r8MMxZlB9y"},{"type":"text","value":".","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"YcVkdu63yC"}],"key":"QruGhimTj1"}],"key":"sgJL6zPnaV"},{"type":"tableRow","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"inlineMath","value":"Q^\\pi","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"QπQ^\\piQπ","key":"SGw11QOKok"}],"key":"Nndg7baL9X"},{"type":"tableCell","align":"center","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"html":"S×AR\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}S×AR","key":"EONbhbOnQN"}],"key":"o1IoTWjD4X"},{"type":"tableCell","align":"left","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"children":[{"type":"text","value":"The action-value function (a.k.a. Q-function) of policy ","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"M7DXXYc0cQ"},{"type":"text","value":"π","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"xtvrSfkTr2"},{"type":"text","value":".","position":{"start":{"line":153,"column":1},"end":{"line":153,"column":1}},"key":"lvRLFgyYzu"}],"key":"xmf7rVHMBg"}],"key":"wV48edL3FZ"},{"type":"tableRow","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"inlineMath","value":"A^\\pi","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"html":"AπA^\\piAπ","key":"hBgsu13CRU"}],"key":"C7WFVntTPQ"},{"type":"tableCell","align":"center","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"html":"S×AR\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}S×AR","key":"dghWW7izCp"}],"key":"OiadiciqdE"},{"type":"tableCell","align":"left","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"text","value":"The advantage function of policy 
","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"VtgBftGSPB"},{"type":"text","value":"π","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"hxLINWFy1I"},{"type":"text","value":".","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"R4EQuwOue3"}],"key":"XggHF7Zof2"}],"key":"njuojyPTYV"},{"type":"tableRow","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[],"key":"x780n58tcQ"},{"type":"tableCell","align":"center","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"inlineMath","value":"\\triangle(\\mathcal{X})","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"(X)\\triangle(\\mathcal{X})(X)","key":"cmyTH9Lihp"}],"key":"pKcVs5hQ1b"},{"type":"tableCell","align":"left","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"children":[{"type":"text","value":"A distribution supported on ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"S5Xgf8S8eO"},{"type":"inlineMath","value":"\\mathcal{X}","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"X\\mathcal{X}X","key":"Xt9pIOGOqo"},{"type":"text","value":".","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"bABUsU4tzD"}],"key":"LMeElDbpnt"}],"key":"odT4wD5nnT"},{"type":"tableRow","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"inlineMath","value":"\\hi","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"html":"h\\hih","key":"OjQhiNL05S"}],"key":"zx1OQC9YMG"},{"type":"tableCell","align":"center","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"inlineMath","value":"[\\hor]","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"html":"[H][\\hor][H]","key":"VeRFT1DavE"}],"key":"b26WOeAsW8"},{"type":"tableCell","align":"left","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"text","value":"Time horizon index of an MDP (subscript).","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"key":"BDE89DrlLK"}],"key":"jjELmbo0Jh"}],"key":"czNxtAMZYk"},{"type":"tableRow","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"inlineMath","value":"k","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"html":"kkk","key":"Yi8GDAsVfi"}],"key":"n8p3nQ7tmj"},{"type":"tableCell","align":"center","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"inlineMath","value":"[K]","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"html":"[K][K][K]","key":"prLTv3XzWF"}],"key":"wP4n4EwBXo"},{"type":"tableCell","align":"left","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"children":[{"type":"text","value":"Arm index of a multi-armed bandit 
(superscript).","position":{"start":{"line":157,"column":1},"end":{"line":157,"column":1}},"key":"Qx0NKVg4lr"}],"key":"w3QMMPSm7q"}],"key":"RbMh7DogWf"},{"type":"tableRow","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"inlineMath","value":"t","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"html":"ttt","key":"OO48P9oGzB"}],"key":"meEoxN6vMv"},{"type":"tableCell","align":"center","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"inlineMath","value":"[T]","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"html":"[T][T][T]","key":"YZX7nvhfL0"}],"key":"djvQ8LxK7M"},{"type":"tableCell","align":"left","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"text","value":"Iteration index of an algorithm (subscript).","position":{"start":{"line":158,"column":1},"end":{"line":158,"column":1}},"key":"XJlmR0UL8D"}],"key":"MJFBGLU5GD"}],"key":"JBDaQGC0E7"},{"type":"tableRow","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"θ","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"jdeB2uQGv2"}],"key":"k0HQRNoN7s"},{"type":"tableCell","align":"center","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"Θ","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"pDQi3nvrq5"}],"key":"oftTez3hPo"},{"type":"tableCell","align":"left","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"A set of parameters.","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"meOp7iNxns"}],"key":"lwtzcm6Krx"}],"key":"WtKbXcnTYj"}],"key":"mDvbElX3yF"},{"type":"paragraph","position":{"start":{"line":161,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"Note that throughout the text, certain symbols will stand for either random variables or fixed values.\nWe aim to clarify in ambiguous settings.\nBe warned that","position":{"start":{"line":161,"column":1},"end":{"line":161,"column":1}},"key":"kePXJsrYv4"}],"key":"tKameABp6Z"}],"key":"CkZGrBynwx"},{"type":"block","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"children":[{"type":"text","value":"Programming","position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"key":"udj0gqdqvM"}],"label":"programming","identifier":"programming","html_id":"programming","enumerator":"6","key":"CR1fey4Rl8"},{"type":"paragraph","position":{"start":{"line":170,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"Why include code in a textbook?\nWe believe that implementing an algorithm is a strong test of your understanding of it;\nmathematical notation can often abstract away details,\nwhile a computer must be given every single instruction.\nWe have sought to write readable Python code that is self-contained within each file.\nThis approach is inspired by 
","position":{"start":{"line":170,"column":1},"end":{"line":170,"column":1}},"key":"bdUryyYYva"},{"type":"cite","kind":"narrative","label":"sussman_functional_2013","identifier":"sussman_functional_2013","children":[{"type":"text","value":"Sussman ","key":"xU5nd5aHou"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"l4nkmnkUCp"}],"key":"oja2hESmxM"},{"type":"text","value":" (2013)","key":"ouoTslPSiT"}],"enumerator":"3","key":"X06WxIkiJh"},{"type":"text","value":".\nThere are some ways in which the code style differs from typical software projects:","position":{"start":{"line":170,"column":1},"end":{"line":170,"column":1}},"key":"j88yu5wccr"}],"key":"BQ9R7i6Kri"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":178,"column":1},"end":{"line":182,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":178,"column":1},"end":{"line":179,"column":1}},"children":[{"type":"text","value":"We keep use of language features to a minimum,\neven if it leads to code that could otherwise be more concisely or idiomatically expressed.","position":{"start":{"line":178,"column":1},"end":{"line":178,"column":1}},"key":"e7ixPe6Ur1"}],"key":"fFgXvLSxWs"},{"type":"listItem","spread":true,"position":{"start":{"line":180,"column":1},"end":{"line":182,"column":1}},"children":[{"type":"text","value":"The variable names used in the code match those used in the main text.\nFor example, the variable ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"KyDKZImXVu"},{"type":"inlineCode","value":"s","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"kg6Zgisc1S"},{"type":"text","value":" will be used instead of the more explicit ","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"UMBc4h4sUb"},{"type":"inlineCode","value":"state","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"EBbaVNHwvJ"},{"type":"text","value":".","position":{"start":{"line":180,"column":1},"end":{"line":180,"column":1}},"key":"R7YaoPG8V6"}],"key":"fzqaxL7bQ7"}],"key":"BwhbSLHann"},{"type":"paragraph","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"We also make extensive use of Python ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"fNkTPpoBaU"},{"type":"emphasis","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"type annotations","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"Cg38qplFIo"}],"key":"F5KlJrSmX1"},{"type":"text","value":" to explicitly specify variable types, including shapes of vectors and matrices using the ","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"ctrrqpodT1"},{"type":"link","url":"https://github.com/patrick-kidger/jaxtyping","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"text","value":"jaxtyping","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"l120ifKQ5G"}],"urlSource":"https://github.com/patrick-kidger/jaxtyping","error":true,"key":"f3HbX1ZNzf"},{"type":"text","value":" 
library.","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"cMSwhXWysn"}],"key":"iTjgujIOuq"},{"type":"paragraph","position":{"start":{"line":185,"column":1},"end":{"line":190,"column":1}},"children":[{"type":"text","value":"This is an interactive book built with ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"X467DqlzJS"},{"type":"link","url":"https://jupyterbook.org/en/stable/intro.html","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"Jupyter Book","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"AbUbNaE39b"}],"urlSource":"https://jupyterbook.org/en/stable/intro.html","key":"syeaBCHwH7"},{"type":"text","value":".\nIt uses ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"OsvV6crJXD"},{"type":"link","url":"https://docs.python.org/3.11/contents.html","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"Python 3.11","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"BBR1vKMWVz"}],"urlSource":"https://docs.python.org/3.11/contents.html","key":"biklgcYugL"},{"type":"text","value":".\nIt uses the ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"RBy7DYI3v0"},{"type":"link","url":"https://jax.readthedocs.io/en/latest/index.html","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"JAX","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"YJiIfGhsR7"}],"urlSource":"https://jax.readthedocs.io/en/latest/index.html","key":"h0RRm3RTeW"},{"type":"text","value":" library for numerical computing.\nJAX was chosen for the clarity of its functional style and due to its mature RL ecosystem,\nsustained in large part by the Google DeepMind research group and a large body of open-source contributors.\nWe use the standard ","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"rV7TmWJxp6"},{"type":"link","url":"https://gymnasium.farama.org/","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"children":[{"type":"text","value":"Gymnasium","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"YGeg5phz2d"}],"urlSource":"https://gymnasium.farama.org/","key":"dNH05WZYu3"},{"type":"text","value":" library for interfacing with RL environments.","position":{"start":{"line":185,"column":1},"end":{"line":185,"column":1}},"key":"UPNIrbbBg9"}],"key":"lyY6mV6GJA"},{"type":"paragraph","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"children":[{"type":"text","value":"The following names are exported from the ","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"cGQZHuPCck"},{"type":"inlineCode","value":"utils","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"hlxahsm5Tb"},{"type":"text","value":" module:","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"g4e2vmB4I5"}],"key":"k7VsoXwdQr"},{"type":"code","lang":"python","value":"import matplotlib.pyplot as plt\n\n# convenient class builder\nfrom typing import NamedTuple\n\n# function typings\nfrom collections.abc import Callable\n\n# array typings\nfrom jaxtyping import Float, Array\n\n# convenient function composition\nfrom functools import partial\n\n# numerical computing and 
linear algebra\nimport jax\nimport jax.numpy as jnp\n\n# print functions as latex\nimport latexify\n\nplt.style.use(\"fivethirtyeight\")","position":{"start":{"line":194,"column":1},"end":{"line":217,"column":1}},"key":"U9KRRrhvcA"}],"key":"S4OB0PdlCt"}],"key":"UgSlXWZUi4"},"references":{"cite":{"order":["sutton_reinforcement_2018","agarwal_reinforcement_2022","sussman_functional_2013"],"data":{"sutton_reinforcement_2018":{"label":"sutton_reinforcement_2018","enumerator":"1","html":"Sutton, R. S., & Barto, A. G. (2018). Reinforcement Learning: An Introduction (Second edition). The MIT Press."},"agarwal_reinforcement_2022":{"label":"agarwal_reinforcement_2022","enumerator":"2","html":"Agarwal, A., Jiang, N., Kakade, S. M., & Sun, W. (2022). Reinforcement Learning: Theory and Algorithms."},"sussman_functional_2013":{"label":"sussman_functional_2013","enumerator":"3","html":"Sussman, G. J., Wisdom, J., & Farr, W. (2013). Functional Differential Geometry. The MIT Press."}}}},"footer":{"navigation":{"next":{"title":"1 Markov Decision Processes","url":"/mdps","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/mdps.html b/mdps.html index 6333bf5..24bc3eb 100644 --- a/mdps.html +++ b/mdps.html @@ -1,4 +1,4 @@ -1 Markov Decision Processes - CS/STAT 184: Introduction to Reinforcement Learning

1 Markov Decision Processes

1.1 Introduction

The field of RL studies how an agent can learn to make sequential decisions in an interactive environment. This is a very general problem! How can we formalize this task in a way that is both sufficiently general yet also tractable enough for fruitful analysis?

Let’s consider some examples of sequential decision problems to identify the key common properties we’d like to capture:

  • Board games and video games, where a player takes actions in a virtual environment.
  • Inventory management, where a company must efficiently move resources from producers to consumers.
  • Robotic control, where a robot can move and interact with the real world to complete some task.

In these environments and many others, the state transitions (the “rules” of the environment) depend only on the most recent state and action; the past history of moves doesn’t matter (generally speaking). This is called the Markov property.

Environments that satisfy the Markov property are called Markov decision processes (MDPs). This chapter will focus on introducing core vocabulary for MDPs that will be useful throughout the book.

class MDP(NamedTuple):
    """A description of a Markov decision process with finitely many states and actions."""
    S: int  # number of states
    A: int  # number of actions
    μ: Float[Array, " S"]  # initial state distribution
    P: Float[Array, "S A S"]  # "current" state, "current" action, "next" state
    r: Float[Array, "S A"]
    H: int
    γ: float = 1.0  # discount factor (used later)

Consider a time horizon of H = 7 days (one interaction per day). Let t = 0 correspond to Monday and t = 6 correspond to Sunday.

tidy_mdp = MDP(
    S=2,  # 0 = orderly, 1 = messy
    A=2,  # 0 = ignore, 1 = tidy
    μ=jnp.array([1.0, 0.0]),  # start in orderly state
    # (the transition matrix P and reward matrix r are elided in this excerpt)
    H=7,
)

1.2.2 Policies

A policy describes how the agent chooses an action in each state. In the finite-horizon setting we allow the policy to depend on the timestep, i.e. π = { π_0, …, π_{H−1} }.

Note that for finite state and action spaces, we can represent a randomized mapping \mathcal{S} \to \Delta(\mathcal{A}) as a matrix \pi \in [0, 1]^{\mathcal{S} \times \mathcal{A}} where each row describes the policy’s distribution over actions for the corresponding state.

A fascinating result is that every finite-horizon MDP has an optimal deterministic time-dependent policy! Intuitively, the Markov property implies that the current state contains all the information we need to make the optimal decision. We’ll prove this result constructively later in the chapter.

For example, the “tidy when messy” policy sets π_h(messy) = tidy and π_h(orderly) = ignore for all h.

# arrays of shape (H, S, A) represent time-dependent policies
tidy_policy_always_tidy = (
    jnp.zeros((7, 2, 2))
    .at[:, :, 1].set(1.0)
)
# (another example policy is elided in this excerpt)
tidy_policy_messy_only = (
    jnp.zeros((7, 2, 2))
    .at[:, 1, 1].set(1.0)
    .at[:, 0, 0].set(1.0)
)

JAX arrays are immutable, which is why the policies above are built with the .at[...].set(...) idiom; immutability makes code much easier to reason about.

1.2.3 Trajectories

class Transition(NamedTuple):
    """A single state-action-reward interaction with the environment.

    A trajectory comprises a sequence of transitions.
    """
    s: int
    a: int
    r: float


Once we’ve chosen a policy, we can sample trajectories by repeatedly choosing actions according to the policy, transitioning according to the state transitions, and observing the rewards.

That is, a policy induces a distribution \rho^{\pi} over trajectories. (We assume that μ and P are clear from context.)

Note that for a state-dependent policy, using the Markov property Definition 1.1, we can write down the likelihood function of this probability distribution in an autoregressive way (i.e. one timestep at a time):


def trajectory_log_likelihood(
    mdp: MDP,
    τ: list[Transition],
    π: Float[Array, "S A"],
) -> float:
    """Compute the log-likelihood of a trajectory under a given policy."""
    # initial state and first action
    total = jnp.log(mdp.μ[τ[0].s])
    total += jnp.log(π[τ[0].s, τ[0].a])

    # remaining transitions and actions
    for i in range(1, mdp.H):
        total += jnp.log(mdp.P[τ[i - 1].s, τ[i - 1].a, τ[i].s])
        total += jnp.log(π[τ[i].s, τ[i].a])

    return total


For a deterministic policy π, we have that \pi_\hi(a \mid s) = \mathbb{I}[a = \pi_\hi(s)]; that is, the probability of taking an action is 1 if it’s the unique action prescribed by the policy for that state and 0 otherwise. In this case, the only randomness in sampling trajectories comes from the initial state distribution μ and the state transitions P.
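To make the sampling process concrete, here is a minimal sketch of a rollout helper. It is not part of the book’s utils module; the function name and the use of jax.random for sampling are illustrative assumptions.

def sample_trajectory(key, mdp: MDP, policy: Float[Array, "H S A"], H: int) -> list[Transition]:
    """Roll out a time-dependent policy for H steps (illustrative sketch, not the book's code)."""
    τ = []
    key, subkey = jax.random.split(key)
    s = jax.random.choice(subkey, mdp.S, p=mdp.μ)  # sample the initial state
    for h in range(H):
        key, k_a, k_s = jax.random.split(key, 3)
        a = jax.random.choice(k_a, mdp.A, p=policy[h, s])  # sample an action from the policy
        τ.append(Transition(s=int(s), a=int(a), r=float(mdp.r[s, a])))  # record the interaction
        s = jax.random.choice(k_s, mdp.S, p=mdp.P[s, a])  # sample the next state
    return τ

For instance, something like sample_trajectory(jax.random.PRNGKey(0), tidy_mdp, tidy_policy_messy_only, 7) would generate one week of interactions in the tidying example.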

1.2.4 Value functions

The main goal of RL is to find a policy that maximizes the expected total reward \E[r_0 + \cdots + r_{\hor-1}].

Let’s introduce some notation for analyzing this quantity.

A policy’s value function at time h is its expected remaining reward from a given state:
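Explicitly, one standard way to write this definition in the notation above is

V_\hi^\pi(s) := \E \left[ r_\hi + r_{\hi+1} + \cdots + r_{\hor-1} \mid s_\hi = s \right],

where the expectation is over trajectories generated by following π from state s at time h.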

Similarly, we can define the action-value function (aka the Q-function) at time h as the expected remaining reward from a given state and taking a given action:
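Written out in the same way,

Q_\hi^\pi(s, a) := \E \left[ r_\hi + r_{\hi+1} + \cdots + r_{\hor-1} \mid s_\hi = s, a_\hi = a \right].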

1.2.4.1 Relating the value function and action-value function

Note that the value function is just the expected action-value over actions drawn from the policy:

V_\hi^\pi(s) = \E_{a \sim \pi_\hi(s)} [Q_\hi^\pi(s, a)]
def q_to_v(
    policy: Float[Array, "S A"],
    q: Float[Array, "S A"],
) -> Float[Array, " S"]:
    """
    Compute the value function for a given policy in a known finite MDP
    at a single timestep from its action-value function.
    """
    return jnp.average(q, weights=policy, axis=1)

and the action-value is the sum of the immediate reward and the expected value of the following state:

Q_\hi^\pi(s, a) = r(s, a) + \E_{s' \sim P(s, a)} [V_{\hi+1}^\pi(s')]
def v_to_q(
    mdp: MDP,
    v_next: Float[Array, " S"],
) -> Float[Array, "S A"]:
    """
    Compute the action-value function in a known finite MDP
    at a single timestep from the next timestep's value function.
    """
    # expected next-state value, weighted by the transition probabilities
    return mdp.r + mdp.γ * mdp.P @ v_next


# convert a list of v functions to a list of q functions
v_ary_to_q_ary = jax.vmap(v_to_q, in_axes=(None, 0))

1.2.4.2 Greedy policies

For any given Q \in \mathbb{R}^{|\mathcal{S}| \times |\mathcal{A}|}, we can define the greedy policy \hat{\pi}_Q as the deterministic policy that selects the action with the highest Q-value at each state:

\hat{\pi}_Q(s) = \arg\max_{a} Q_{sa}
def q_to_greedy(q: Float[Array, "S A"]) -> Float[Array, "S A"]:
    """
    Get the (deterministic) greedy policy with respect to an action-value function.
    Return the policy as a matrix of shape (S, A) where each row is a one-hot vector.
    """
    # one-hot encode the argmax action in each state
    return jnp.eye(q.shape[1])[jnp.argmax(q, axis=1)]


def v_to_greedy(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, "S A"]:
    """Get the (deterministic) greedy policy with respect to a value function."""
    return q_to_greedy(v_to_q(mdp, v))

1.2.5 The one-step (Bellman) consistency equation

Note that by simply considering the cumulative reward as the sum of the current reward and the future cumulative reward, we can describe the value function recursively (in terms of itself). This is named the Bellman consistency equation after Richard Bellman (1920--1984), who is credited with introducing dynamic programming in 1953.
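For a policy π in the finite-horizon setting, the consistency equation can be stated as

V_\hi^\pi(s) = \E_{\substack{a \sim \pi_\hi(s) \\ s' \sim P(s, a)}} \left[ r(s, a) + V_{\hi+1}^\pi(s') \right],

with the convention that V_\hor^\pi(s) = 0.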

def check_bellman_consistency_v(
    mdp: MDP,
    policy: Float[Array, "H S A"],
    v_ary: Float[Array, "H S"],
) -> bool:
    """Check that the value functions satisfy the Bellman consistency equation at every timestep."""
    return all(
        jnp.allclose(
            # lhs: the value function at timestep h
            v_ary[h],
            # rhs: one-step lookahead using the value function at timestep h + 1
            jnp.sum(policy[h] * (mdp.r + mdp.γ * mdp.P @ v_ary[h + 1]), axis=1),
        )
        for h in range(mdp.H - 1)
    )

One can analogously derive the Bellman consistency equation for the action-value function:
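In the same notation, it reads

Q_\hi^\pi(s, a) = r(s, a) + \E_{s' \sim P(s, a)} \left[ \E_{a' \sim \pi_{\hi+1}(s')} [Q_{\hi+1}^\pi(s', a')] \right].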

1.2.6 The one-step Bellman operator

Fix a policy π. Consider the higher-order operator that takes in a “value function” v : \mathcal{S} \to \mathbb{R} and returns the r.h.s. of the Bellman equation for that “value function”:
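Concretely, this operator acts as

[\mathcal{J}^{\pi}(v)](s) := \E_{\substack{a \sim \pi(s) \\ s' \sim P(s, a)}} \left[ r(s, a) + \gamma v(s') \right],

where γ = 1 in the undiscounted finite-horizon setting.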

def bellman_operator_looping(
    mdp: MDP,
    policy: Float[Array, "S A"],
    v: Float[Array, " S"],
) -> Float[Array, " S"]:
    """Apply the Bellman operator to a "value function" using explicit loops."""
    v_new = jnp.zeros(mdp.S)
    for s in range(mdp.S):
        for a in range(mdp.A):
            for s_next in range(mdp.S):
                v_new = v_new.at[s].add(
                    policy[s, a]
                    * mdp.P[s, a, s_next]
                    * (mdp.r[s, a] + mdp.γ * v[s_next])
                )
    return v_new

Note that we can concisely implement this using the q_to_v and v_to_q utilities from above:

def bellman_operator(
    mdp: MDP,
    policy: Float[Array, "S A"],
    v: Float[Array, " S"],
) -> Float[Array, " S"]:
    """For a known finite MDP, the Bellman operator can be exactly evaluated."""
    # return q_to_v(policy, v_to_q(mdp, v))  # equivalent
    return jnp.sum(policy * (mdp.r + mdp.γ * mdp.P @ v), axis=1)


We’ll call \mathcal{J}^\pi : \mathbb{R}^\mathcal{S} \to \mathbb{R}^\mathcal{S} the Bellman operator of π. Note that it’s defined on any “value function” mapping states to real numbers; v doesn’t have to be a well-defined value function for some policy (hence the lowercase notation). The Bellman operator also gives us a concise way to express Theorem 1.1 for the value function:

V_\hi^\pi = \mathcal{J}^{\pi}(V_{\hi+1}^\pi)

Intuitively, the output of the Bellman operator, a new “value function”, evaluates states as follows: from a given state, take one action according to π, observe the reward, and then evaluate the next state using the input “value function”.

When we discuss infinite-horizon MDPs, the Bellman operator will turn out to be even more important.

The Bellman consistency equation gives us the value function at timestep h as a function of the value function at timestep h+1. This means we can start at the end of the time horizon, where the value is known, and work backwards in time, using the Bellman consistency equation to compute the value function at each time step.

def dp_eval_finite(mdp: MDP, policy: Float[Array, "S A"]) -> Float[Array, "H S"]:
    """Evaluate a policy using dynamic programming."""
    V_ary = [None] * mdp.H + [jnp.zeros(mdp.S)]  # initialize to 0 at end of time horizon
    for h in range(mdp.H - 1, -1, -1):
        V_ary[h] = bellman_operator(mdp, policy[h], V_ary[h + 1])
    return jnp.stack(V_ary[:-1])

This runs in time O(H \cdot |\mathcal{S}|^2 \cdot |\mathcal{A}|) by counting the loops.

You may wish to repeat this computation for the other policies to get a better sense of this algorithm.

V_messy = dp_eval_finite(tidy_mdp, tidy_policy_messy_only)
V_messy
Array([[5.5621696, 4.7927704],
       [4.7927704, 4.0241003],
       [4.0241003, 3.253    ],
       [3.253    , 2.49     ],
       [2.49     , 1.7      ],
       [1.7      , 1.       ],
       [1.       , 0.       ]], dtype=float32)

1.3.2 Optimal policies in finite-horizon MDPs

We’ve just seen how to evaluate a given policy. But how can we find the optimal policy for a given environment?

Convince yourself that all optimal policies must have the same value function. We call this the optimal value function and denote it by V_\hi^\star(s); the corresponding optimal action-value function is denoted Q_\hi^\star(s, a).

It is a stunning fact that every finite-horizon MDP has an optimal policy that is time-dependent and deterministic. In particular, we can construct such a policy by acting greedily with respect to the optimal action-value function:
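That is,

\pi_\hi^\star(s) = \arg\max_a Q_\hi^\star(s, a).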

Note that this also gives simplified forms of the Bellman consistency equations for the optimal policy:
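Namely,

\begin{aligned}
V_\hi^\star(s) &= \max_a Q_\hi^\star(s, a), \\
Q_\hi^\star(s, a) &= r(s, a) + \E_{s' \sim P(s, a)} [V_{\hi+1}^\star(s')].
\end{aligned}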

Now that we’ve shown this particular greedy policy is optimal, all we need to do is compute the optimal value function and optimal policy. We can do this by working backwards in time using dynamic programming (DP).

def find_optimal_policy(mdp: MDP):
    Q = [None] * mdp.H
    pi = [None] * mdp.H
    V = [None] * mdp.H + [jnp.zeros(mdp.S)]  # initialize to 0 at end of time horizon

    # ... (the backwards-induction loop over timesteps is elided in this excerpt)

    pi = jnp.stack(pi)
    V = jnp.stack(V[:-1])

    return pi, V, Q
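The elided part of find_optimal_policy performs backwards induction over timesteps. A minimal sketch of the full procedure, using the q_to_greedy helper from above (the book’s exact implementation may differ in details):

def find_optimal_policy_sketch(mdp: MDP):
    """Compute the optimal policy, values, and action-values by backwards induction (sketch)."""
    Q = [None] * mdp.H
    pi = [None] * mdp.H
    V = [None] * mdp.H + [jnp.zeros(mdp.S)]  # zero value at the end of the time horizon
    for h in range(mdp.H - 1, -1, -1):
        Q[h] = mdp.r + mdp.γ * mdp.P @ V[h + 1]  # optimal action-values at timestep h
        pi[h] = q_to_greedy(Q[h])                # act greedily with respect to Q[h]
        V[h] = jnp.max(Q[h], axis=1)             # optimal values at timestep h
    return jnp.stack(pi), jnp.stack(V[:-1]), jnp.stack(Q)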

At each of the H timesteps, we must compute Q^{\star} for each of the |\mathcal{S}| |\mathcal{A}| state-action pairs. Each computation takes |\mathcal{S}| operations to evaluate the average value over s'. This gives a total computation time of O(H \cdot |\mathcal{S}|^2 \cdot |\mathcal{A}|).

Note that this algorithm is identical to the policy evaluation algorithm dp_eval_finite, except that instead of averaging over the actions chosen by a policy, we instead simply take a maximum over the action-values. We’ll see this relationship between policy evaluation and optimal policy computation show up again in the infinite-horizon setting.

π_opt, V_opt, Q_opt = find_optimal_policy(tidy_mdp)
assert jnp.allclose(π_opt, tidy_policy_messy_only)
assert jnp.allclose(V_opt, V_messy)
assert jnp.allclose(Q_opt[:-1], v_ary_to_q_ary(tidy_mdp, V_messy)[1:])
"Assertions passed (the 'tidy when messy' policy is optimal)"

1.4 Infinite-horizon MDPs

What happens if a trajectory is allowed to continue forever (i.e. H = \infty)? This is the setting of infinite horizon MDPs.

In this chapter, we’ll describe the necessary adjustments from the finite-horizon case to make the problem tractable. We’ll show that the Bellman operator in the discounted reward setting is a contraction mapping for any policy.

First, note that maximizing the cumulative sum of rewards r_\hi + r_{\hi+1} + r_{\hi+2} + \cdots is no longer a good idea, since it might blow up to infinity. Instead of a time horizon H, we now need a discount factor \gamma \in [0, 1) such that rewards become less valuable the further into the future they are:

r_\hi + \gamma r_{\hi+1} + \gamma^2 r_{\hi+2} + \cdots = \sum_{k=0}^\infty \gamma^k r_{\hi+k}.

We can think of γ as measuring how much we care about the future: if it’s close to 0, we only care about the near-term rewards; if it’s close to 1, we put more weight into future rewards.
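(If you haven’t seen it in a while, it may be useful to review geometric series.) In particular, if each reward lies in [0, 1], the discounted return is bounded:

\sum_{k=0}^\infty \gamma^k r_{\hi+k} \le \sum_{k=0}^\infty \gamma^k = \frac{1}{1-\gamma}.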

You can also analyze γ as the probability of continuing the trajectory at each time step. (This is equivalent to H being a geometric random variable.) If γ is close to 0, the trajectory will likely end quickly, while if γ is close to 1, the trajectory will likely continue for a long time.

The other components of the MDP remain the same:

M = (\mathcal{S}, \mathcal{A}, \mu, P, r, \gamma).

Code-wise, we can reuse the MDP class from before Definition 1.2 and set mdp.H = float('inf').

tidy_mdp_inf = tidy_mdp._replace(H=float("inf"), γ=0.95)

1.4.2 Stationary policies

The time-dependent policies from the finite-horizon case become difficult to handle in the infinite-horizon case. In particular, many of the DP approaches we saw required us to start at the end of the trajectory, which is no longer possible. We’ll shift to stationary policies \pi : \mathcal{S} \to \Delta(\mathcal{A}) (or \mathcal{S} \to \mathcal{A} when deterministic), which do not depend on the timestep. With an infinite horizon, does it even matter which time step we condition on when defining the value function?

1.5 Solving infinite-horizon MDPs

1.5.1 The Bellman operator is a contraction mapping

Recall from Definition 1.8 that the Bellman operator \mathcal{J}^{\pi} for a policy π takes in a “value function” v : \mathcal{S} \to \mathbb{R} and returns the r.h.s. of the Bellman equation for that “value function”. In the infinite-horizon setting, this is

[\mathcal{J}^{\pi}(v)](s) := \E_{\substack{a \sim \pi(s) \\ s' \sim P(s, a)}} [r(s, a) + \gamma v(s')].

The crucial property of the Bellman operator is that it is a contraction mapping for any policy. Intuitively, if we start with two “value functions” v, u : \mathcal{S} \to \mathbb{R} and repeatedly apply the Bellman operator to each of them, they will get closer and closer together at an exponential rate.

It is a powerful fact (known as the Banach fixed-point theorem) that every contraction mapping has a unique fixed point x^\star such that f(x^\star) = x^\star. Moreover, if we repeatedly apply f to any starting point, we will eventually converge to x^\star:

\|f^{(t)}(x) - x^\star\| \le \gamma^t \|x - x^\star\|.

Let’s return to the RL setting and apply this result to the Bellman operator. How can we measure the distance between two “value functions” v, u : \mathcal{S} \to \mathbb{R}? We’ll take the supremum norm as our distance metric:

\| v - u \|_{\infty} := \sup_{s \in \mathcal{S}} |v(s) - u(s)|,

i.e. we compare the “value functions” on the state that causes the biggest gap between them. Then (1.36) implies that if we repeatedly apply \mathcal{J}^\pi to any starting “value function”, we will eventually converge to V^\pi:

\|(\mathcal{J}^\pi)^{(t)}(v) - V^\pi \|_{\infty} \le \gamma^{t} \| v - V^\pi\|_{\infty}.
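For completeness, here is a short sketch of why \mathcal{J}^\pi is a γ-contraction in this norm (the book’s full proof may present it differently). For any state s,

\begin{aligned}
\left| [\mathcal{J}^{\pi}(v)](s) - [\mathcal{J}^{\pi}(u)](s) \right|
&= \gamma \left| \E_{\substack{a \sim \pi(s) \\ s' \sim P(s, a)}} [v(s') - u(s')] \right| \\
&\le \gamma \, \E_{\substack{a \sim \pi(s) \\ s' \sim P(s, a)}} |v(s') - u(s')| \\
&\le \gamma \| v - u \|_{\infty},
\end{aligned}

and taking the supremum over s gives \|\mathcal{J}^{\pi}(v) - \mathcal{J}^{\pi}(u)\|_{\infty} \le \gamma \|v - u\|_{\infty}.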

We’ll use this useful fact to prove the convergence of several algorithms later on.

1.5.2 Policy evaluation in infinite-horizon MDPs

The backwards DP technique we used in the finite-horizon case no longer works since there is no “final timestep” to start from. We’ll need another approach to policy evaluation.

The Bellman consistency conditions yield a system of equations we can solve to evaluate a deterministic policy exactly. For a faster approximate solution, we can iterate the policy’s Bellman operator, since we know that it has a unique fixed point at the true value function.

1.5.2.1 Matrix inversion for deterministic policies

Note that when the policy π is deterministic, the actions can be determined from the states, and so we can chop off the action dimension for the rewards and state transitions:

\begin{aligned}
r^{\pi} &\in \mathbb{R}^{|\mathcal{S}|} & P^{\pi} &\in [0, 1]^{|\mathcal{S}| \times |\mathcal{S}|} & \mu &\in [0, 1]^{|\mathcal{S}|} \\
\pi &\in \mathcal{A}^{|\mathcal{S}|} & V^\pi &\in \mathbb{R}^{|\mathcal{S}|} & Q^\pi &\in \mathbb{R}^{|\mathcal{S}| \times |\mathcal{A}|}.
\end{aligned}

For PπP^\pi, we’ll treat the rows as the states and the +\end{aligned}

For PπP^\pi, we’ll treat the rows as the states and the columns as the next states. Then Ps,sπP^\pi_{s, s'} is the probability of transitioning from state ss to state ss' under policy π.

The Bellman consistency equation for a deterministic policy can be written in tabular notation as

V^\pi = r^\pi + \gamma P^\pi V^\pi.

(Unfortunately, this notation doesn’t simplify the expression for QπQ^\pi.) This system of equations can be solved with a matrix inversion:

V^\pi = (I - \gamma P^\pi)^{-1} r^\pi.
(Note that we have assumed the matrix I - \gamma P^\pi is invertible. This holds whenever γ < 1: if (I - \gamma P^\pi) v = 0, then v = \gamma P^\pi v, which forces \|v\|_\infty \le \gamma \|v\|_\infty and hence v = 0. In other words, I - \gamma P^\pi maps any nonzero vector to a vector with at least one nonzero element, so its null space is trivial.)

def eval_deterministic_infinite(
    mdp: MDP, policy: Float[Array, "S A"]
) -> Float[Array, " S"]:
    pi = jnp.argmax(policy, axis=1)  # un-one-hot
    P_π = mdp.P[jnp.arange(mdp.S), pi]
    r_π = mdp.r[jnp.arange(mdp.S), pi]
    return jnp.linalg.solve(jnp.eye(mdp.S) - mdp.γ * P_π, r_π)
eval_deterministic_infinite(tidy_mdp_inf, tidy_policy_messy_only[0])
Array([15.56419, 14.78598], dtype=float32)

1.5.2.2 Iterative policy evaluation

The matrix inversion above takes roughly O(|\mathcal{S}|^3) time (solving the linear system directly, as in the code above, has a somewhat lower constant, but the scaling is the same). It also only works for deterministic policies. Can we trade off the requirement of finding the exact value function for a faster approximate algorithm that will also extend to stochastic policies?

Let’s use the Bellman operator to define an iterative algorithm for computing the value function. We’ll start with an initial guess v^{(0)} with elements in [0, 1/(1-\gamma)] and then iterate the Bellman operator:

v^{(t+1)} = \mathcal{J}^{\pi}(v^{(t)}),

i.e. v^{(t)} = (\mathcal{J}^{\pi})^{(t)} (v^{(0)}). Note that each iteration takes O(|\mathcal{S}|^2) time for the matrix-vector multiplication.

def supremum_norm(v):
     return jnp.max(jnp.abs(v))  # same as jnp.linalg.norm(v, jnp.inf)
 
 
 
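The next function relies on two helpers whose definitions are not shown here: bellman_operator, which applies \mathcal{J}^{\pi} once, and loop_until_convergence, which iterates an operator until successive iterates stop changing. A minimal sketch consistent with how they are called below (and with the MDP class above) is:

def bellman_operator(mdp: MDP, pi: Float[Array, "S A"], v: Float[Array, " S"]) -> Float[Array, " S"]:
    """Apply J^π once: expected reward plus discounted expected next-state value."""
    return (pi * (mdp.r + mdp.γ * mdp.P @ v)).sum(axis=1)


def loop_until_convergence(op, x, ε: float = 1e-6):
    """Repeatedly apply op until successive iterates differ by less than ε in sup norm."""
    while True:
        x_new = op(x)
        if supremum_norm(x_new - x) < ε:
            return x_new
        x = x_new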
 def iterative_evaluation(mdp: MDP, pi: Float[Array, "S A"], ε=1e-6) -> Float[Array, " S"]:
     op = partial(bellman_operator, mdp, pi)
    return loop_until_convergence(op, jnp.zeros(mdp.S), ε)

Then, as we showed in (1.38), by the Banach fixed-point theorem:

\|v^{(t)} - V^\pi \|_{\infty} \le \gamma^{t} \| v^{(0)} - V^\pi\|_{\infty}.
iterative_evaluation(tidy_mdp_inf, tidy_policy_messy_only[0])
Array([15.564166, 14.785956], dtype=float32)

1.5.3 Optimal policies in infinite-horizon MDPs

Now let’s move on to solving for an optimal policy in the infinite-horizon case. As in the finite-horizon case, an optimal policy \pi^\star is one that does at least as well as any other policy in every state. Recall that equation (1.32) for the optimal value function doesn’t depend on any policy:

V^\star(s) = \max_a \left[ r(s, a) + \gamma \E_{s' \sim P(s, a)} V^\star(s') \right].

As before, thinking of the r.h.s. of (1.53) as an operator on value functions gives the Bellman optimality operator

[\mathcal{J}^{\star}(v)](s) = \max_a \left[ r(s, a) + \gamma \E_{s' \sim P(s, a)} v(s') \right]

def bellman_optimality_operator(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, " S"]:
     return jnp.max(mdp.r + mdp.γ * mdp.P @ v, axis=1)
 
 
def check_optimal(v: Float[Array, " S"], mdp: MDP):
    return jnp.allclose(v, bellman_optimality_operator(mdp, v))

1.5.3.1 Value iteration

Since the optimal policy is still a policy, our result that the Bellman operator is a contracting map still holds, and so we can repeatedly apply this operator to converge to the optimal value function! This algorithm is known as value iteration.

def value_iteration(mdp: MDP, ε: float = 1e-6) -> Float[Array, " S"]:
     """Iterate the Bellman optimality operator until convergence."""
     op = partial(bellman_optimality_operator, mdp)
    return loop_until_convergence(op, jnp.zeros(mdp.S), ε)

value_iteration(tidy_mdp_inf)
Array([15.564166, 14.785956], dtype=float32)

Note that the runtime analysis for an ε-optimal value function is exactly the same as iterative policy evaluation! This is because value iteration is simply the special case of applying iterative policy evaluation to the optimal value function.

As the final step of the algorithm, to return an actual policy \hat \pi, we can simply act greedily with respect to the final iteration v^{(T)} of our above algorithm:

\hat \pi(s) = \arg\max_a \left[ r(s, a) + \gamma \E_{s' \sim P(s, a)} v^{(T)}(s') \right].

We must be careful, though: the value function of this greedy policy, Vπ^V^{\hat \pi}, is not the same as v(T)v^{(T)}, which need not even be a well-defined value function for some policy!

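Acting greedily with respect to a value function is exactly what the helper v_to_greedy (used by policy iteration below) computes. Its definition is not shown here; a minimal sketch, assuming it returns a one-hot policy array of shape (S, A), is:

def v_to_greedy(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, "S A"]:
    """Return the (one-hot) policy that acts greedily with respect to v."""
    q = mdp.r + mdp.γ * mdp.P @ v  # Q-values induced by v
    return jnp.eye(mdp.A)[jnp.argmax(q, axis=1)]  # one-hot encode the argmax action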
The bound on the policy’s quality is actually quite loose: if \|v^{(T)} - V^\star\|_{\infty} \le \epsilon, then the greedy policy \hat \pi satisfies \|V^{\hat \pi} - V^\star\|_{\infty} \le \frac{2\gamma}{1-\gamma} \epsilon, which might potentially be very large.

So in order to compensate and achieve Vπ^Vϵ\|V^{\hat \pi} - V^{\star}\| \le \epsilon, we must have

\|v^{(T)} - V^\star\|_{\infty} \le \frac{1-\gamma}{2 \gamma} \epsilon.

This means, using Remark 1.2, we need to run value iteration for

T = O\left( \frac{1}{1-\gamma} \log\left(\frac{\gamma}{\epsilon (1-\gamma)^2}\right) \right)

iterations to achieve an ε-accurate estimate of the optimal value function.
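To get a feel for the scaling (ignoring the constants hidden by the O(\cdot)), we can evaluate the expression inside the bound for a couple of discount factors; the effective horizon 1/(1-\gamma) dominates:

for γ in (0.9, 0.99):
    ε = 1e-2
    T_scale = (1 / (1 - γ)) * jnp.log(γ / (ε * (1 - γ) ** 2))
    print(γ, T_scale)  # grows rapidly as γ approaches 1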

1.5.3.2 Policy iteration

Can we mitigate this “greedy worsening”? What if instead of approximating the optimal value function and then acting greedily by it at the very end, we iteratively improve the policy and value function together? This is the idea behind policy iteration. In each step, we simply set the policy to act greedily with respect to its own value function.

def policy_iteration(mdp: MDP, ε=1e-6) -> Float[Array, "S A"]:
     """Iteratively improve the policy and value function."""
     def op(pi):
         return v_to_greedy(mdp, eval_deterministic_infinite(mdp, pi))
     π_init = jnp.ones((mdp.S, mdp.A)) / mdp.A  # uniform random policy
    return loop_until_convergence(op, π_init, ε)
policy_iteration(tidy_mdp_inf)
Array([[1., 0.],
       [0., 1.]], dtype=float32)

Although PI appears more complex than VI, we’ll use the same contraction property (Theorem 1.4) to show convergence. This will give us the same runtime bound as value iteration and iterative policy evaluation for an ε-optimal value function (Remark 1.2), although in practice, PI often converges much faster.

The key step is to show that acting greedily with respect to V^{\pi^{t}} guarantees \mathcal{J}^{\star}(V^{\pi^{t}}) \le V^{\pi^{t+1}} \le V^\star pointwise. This means we can now apply the Bellman convergence result (1.38) to get

\|V^{\pi^{t+1}} - V^\star \|_{\infty} \le \|\mathcal{J}^{\star} (V^{\pi^{t}}) - V^{\star}\|_{\infty} \le \gamma \|V^{\pi^{t}} - V^\star \|_{\infty}.

1.6 Summary

  • Markov decision processes (MDPs) are a framework for sequential decision making under uncertainty. They consist of a state space \mathcal{S}, an action space \mathcal{A}, an initial state distribution \mu \in \Delta(\mathcal{S}), a transition function P(s' \mid s, a), and a reward function r(s, a).

  • The Bellman consistency equations give a system of equations we can solve to evaluate value functions exactly. Thinking of the r.h.s. of this equation as an operator on value functions gives the Bellman operator.

  • In the finite-horizon setting, we can compute the optimal policy using dynamic programming.

  • In the infinite-horizon setting, we can compute the optimal policy using value iteration or policy iteration.

\ No newline at end of file diff --git a/mdps.json b/mdps.json index 7f4f24a..263e951 100644 --- a/mdps.json +++ b/mdps.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"32c2f6fe9e96648ecf8985a4e80db115d0d6950b01e46976348cc5f4529cd76f","slug":"mdps","location":"/mdps.md","dependencies":[],"frontmatter":{"title":"1 Markov Decision Processes","numbering":{"all":{"enabled":true},"enumerator":{"template":"1.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/deterministic_policy-9d0b50d69541007293ead345d987b682.png","thumbnailOptimized":"/build/deterministic_policy-9d0b50d69541007293ead345d987b682.webp","exports":[{"format":"md","filename":"mdps.md","url":"/build/mdps-eb86bf115f025d31fd89a81ae9f29e0d.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"QAJ47NVJ2e"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"1.1","key":"YwkImtjGje"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"The field of RL studies how an agent can learn to make sequential decisions in an interactive environment.\nThis is a very general problem!\nHow can we ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"gAk5uqJbBY"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"formalize","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"ZhaIbgzD5V"}],"key":"l0VquglOiZ"},{"type":"text","value":" this task in a way that is both ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"lsg2v8KT8Q"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"sufficiently general","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"d8kSa81KyS"}],"key":"kAezd8rLgB"},{"type":"text","value":" yet also tractable enough for ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"wnCeMdJgMq"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"fruitful 
analysis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"IWpf8TNY29"}],"key":"U0oAsnunZ5"},{"type":"text","value":"?","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"bf5K3N7xvr"}],"key":"UHBjldtajv"},{"type":"paragraph","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"Let’s consider some examples of sequential decision problems to identify the key common properties we’d like to capture:","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"aoCkCjRoRr"}],"key":"FAJQfeK17E"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":26,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"strong","position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"text","value":"Board games and video games,","position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"key":"vWoOPEvRve"}],"key":"bAqv4CR5WD"},{"type":"text","value":" where a player takes actions in a virtual environment.","position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"key":"hrwpRFk4XJ"}],"key":"gd8Bh4HDsJ"},{"type":"listItem","spread":true,"position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"strong","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"Inventory management,","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"sDnXjgA0nL"}],"key":"O3mYKyeCox"},{"type":"text","value":" where a company must efficiently move resources from producers to consumers.","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"OPC3tGtCPM"}],"key":"tFRfkSpJZi"},{"type":"listItem","spread":true,"position":{"start":{"line":28,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"strong","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"children":[{"type":"text","value":"Robotic control","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"oFNXomSU5u"}],"key":"kNJKIJ6GSh"},{"type":"text","value":", where a robot can move and interact with the real world to complete some task.","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"ovLyfJE6aJ"}],"key":"qQkiBt3Upn"}],"key":"z5NEfFctaA"},{"type":"paragraph","position":{"start":{"line":30,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"text","value":"In these environments and many others, the ","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"igdPEkY5O6"},{"type":"strong","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"TPU0k8Vdv1"}],"key":"OktC737tpp"},{"type":"text","value":",\nthe “rules” of the environment,\nonly depend on the ","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"tQIaXakFG7"},{"type":"emphasis","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"text","value":"most recent","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"yejg4EusaG"}],"key":"NVpJAcNgyg"},{"type":"text","value":" state and action 
(generally speaking).\nFor example, if you want to take a break while playing a game of chess,\nyou could take a picture of the board,\nand later on reset the board to that state and continue playing;\nthe past history of moves doesn’t matter (generally speaking).\nThis is called the ","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"Rws0fvpZqo"},{"type":"strong","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"text","value":"Markov property.","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"lE03lqsBVd"}],"key":"Xq5hXgma9B"}],"key":"Rb5ZHXaB9r"},{"type":"proof","kind":"definition","label":"markov","identifier":"markov","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Markov property","position":{"start":{"line":39,"column":1},"end":{"line":39,"column":1}},"key":"DMwpY8Tw98"}],"key":"Z6YoMw57gU"},{"type":"paragraph","position":{"start":{"line":42,"column":1},"end":{"line":44,"column":1}},"children":[{"type":"text","value":"An interactive environment satisfies the ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"GcZ1s0OQMh"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"Markov property","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"wbSAIbmnBD"}],"key":"izG9JNkdOg"},{"type":"text","value":" if the\nprobability of transitioning to a new state only depends on the current\nstate and action:","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"Z1ZTEBX7CV"}],"key":"jcxB688ddN"},{"type":"math","value":"\\pr(s_{\\hi+1} \\mid s_0, a_0, \\dots, s_\\hi, a_\\hi) = P(s_{\\hi+1} \\mid s_\\hi, a_\\hi)","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"P(sh+1s0,a0,,sh,ah)=P(sh+1sh,ah)\\pr(s_{\\hi+1} \\mid s_0, a_0, \\dots, s_\\hi, a_\\hi) = P(s_{\\hi+1} \\mid s_\\hi, a_\\hi)P(sh+1s0,a0,,sh,ah)=P(sh+1sh,ah)","enumerator":"1.1","key":"p5kWz5ZHaf"},{"type":"paragraph","position":{"start":{"line":48,"column":1},"end":{"line":49,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"ve4wXXC46B"},{"type":"inlineMath","value":"P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S})","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"html":"P:S×A(S)P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S})P:S×A(S)","key":"iLJFUWhgUo"},{"type":"text","value":" describes the state transitions.\n(We’ll elaborate on this notation later in the chapter.)","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"L0LBM1AdOZ"}],"key":"noq6a4naEw"}],"enumerator":"1.1","html_id":"markov","key":"YJGh1Z5lPz"},{"type":"paragraph","position":{"start":{"line":52,"column":1},"end":{"line":53,"column":1}},"children":[{"type":"text","value":"Environments that satisfy the Markov property are called ","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"Ef6ZTcOnzI"},{"type":"strong","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"children":[{"type":"text","value":"Markov decision processes","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"HzVtI61cF2"}],"key":"mwex4J9tWD"},{"type":"text","value":" (MDPs).\nThis chapter will focus on introducing core 
vocabulary for MDPs that will be useful throughout the book.","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"Usmv9D67Xc"}],"key":"vZmHt5zoy8"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"BQzVF6zlX8"}],"key":"D3FyAnc9P5"},{"type":"paragraph","position":{"start":{"line":56,"column":1},"end":{"line":58,"column":1}},"children":[{"type":"text","value":"What information might be encoded in the ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"kSlAFwh9tF"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"state","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"ItQvIUWC7f"}],"key":"jYd1GDRwww"},{"type":"text","value":" for each of the above examples?\nWhat might the valid set of ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"NKF1JT3BQH"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"actions","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"nJN6w8PAdv"}],"key":"JQ1PEb5jiE"},{"type":"text","value":" be?\nDescribe the ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"gX1niptpY7"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"PwKyouz0ES"}],"key":"Fj9RiTt9SJ"},{"type":"text","value":" heuristically and verify that they satisfy the Markov property.","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"VyXu82mquZ"}],"key":"bcwA7nFBlv"}],"key":"rtsJ1yvCci"},{"type":"paragraph","position":{"start":{"line":61,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"MDPs are usually classified as ","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"IY2jyfFqVJ"},{"type":"strong","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"children":[{"type":"text","value":"finite-horizon","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"r1QBRrOaez"}],"key":"HMNdO6FtUR"},{"type":"text","value":", where the interactions end after some finite number of time steps,\nor ","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"jtge0JiqGy"},{"type":"strong","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"children":[{"type":"text","value":"infinite-horizon","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"Xt1mqRbRR5"}],"key":"vWbbnfkYGB"},{"type":"text","value":", where the interactions can continue indefinitely.\nWe’ll begin with the finite-horizon case and discuss the infinite-horizon case in the second half of the chapter.","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"JQF2aXMSDd"}],"key":"qU1BCkb6oP"},{"type":"paragraph","position":{"start":{"line":65,"column":1},"end":{"line":67,"column":1}},"children":[{"type":"text","value":"We’ll describe how to 
","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"uJItuS1QEO"},{"type":"emphasis","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"evaluate","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"NMNAQpoFkl"}],"key":"ERc00j0lD5"},{"type":"text","value":" different strategies, called ","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"c5QFf1MJOx"},{"type":"strong","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"policies,","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"dlPr7LRpJO"}],"key":"xtWJLJWMqo"},{"type":"text","value":" and how to compute (or approximate)\nthe ","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"KhXHGlFOiV"},{"type":"strong","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"hdelaD3iIM"}],"key":"FlsugMJ262"},{"type":"text","value":" for a given MDP.\nWe’ll introduce the ","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"yJmRBBYt7o"},{"type":"strong","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"Bellman consistency condition","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"wQEK3m50eI"}],"key":"uF8IFarWKH"},{"type":"text","value":", which allows us to analyze the whole sequence of interactions in terms of individual timesteps.","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"Uyur9rdMg6"}],"key":"k4kiYSELEE"}],"key":"hf3p76PExN"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from utils import NamedTuple, Float, Array, partial, jax, jnp, latexify","key":"h6XQoXdD0T"},{"type":"output","id":"Pk6hHeWLnMBjg3fYOQgNo","data":[],"key":"Xk3u0a4nOk"}],"data":{},"key":"RYpOHuSp5D"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"children":[{"type":"text","value":"Finite-horizon MDPs","position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"key":"uqzwzKwecE"}],"identifier":"finite-horizon-mdps","label":"Finite-horizon MDPs","html_id":"finite-horizon-mdps","implicit":true,"enumerator":"1.2","key":"iG9UCQssBl"},{"type":"heading","depth":3,"position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"children":[{"type":"text","value":"Definition","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"key":"lWiEA8uDVm"}],"identifier":"definition","label":"Definition","html_id":"definition","implicit":true,"enumerator":"1.2.1","key":"H3vHjCQ72w"},{"type":"proof","kind":"definition","label":"finite_horizon_mdp","identifier":"finite_horizon_mdp","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Finite-horizon Markov decision process","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"F18ZPqWHEh"}],"key":"TXjXhpikez"},{"type":"paragraph","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"The components of a finite-horizon Markov decision process 
are:","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"TJoqxfwClm"}],"key":"syjqAE2bmi"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":82,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":82,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":82,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"tak4cBa7pQ"},{"type":"strong","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"text","value":"state","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"TfumiyQ6pL"}],"key":"JzxFTDCIQH"},{"type":"text","value":" that the agent interacts with. We use ","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"VhuBbZk2PF"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"html":"S\\mathcal{S}S","key":"FjDnbFJqqk"},{"type":"text","value":" to denote\nthe set of possible states, called the ","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"reGUFtCfpk"},{"type":"strong","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"text","value":"state space","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"DfLlsrbkUY"}],"key":"T9Pe2TtE6H"},{"type":"text","value":".","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"E1oFMRLZrY"}],"key":"WivMjke8ZM"}],"key":"AxktF3VctA"},{"type":"listItem","spread":true,"position":{"start":{"line":85,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":85,"column":1},"end":{"line":86,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"e2hZlsHLj4"},{"type":"strong","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"children":[{"type":"text","value":"actions","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"sbg1VQzGFy"}],"key":"chp8C6Ktla"},{"type":"text","value":" that the agent can take. 
We use ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"qCYpePUXb2"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"A\\mathcal{A}A","key":"JcJstBzzrW"},{"type":"text","value":" to denote the\nset of possible actions, called the ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"HHPIraiDz8"},{"type":"strong","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"children":[{"type":"text","value":"action space","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"mgFNlzXqTV"}],"key":"W1v9OWo9nf"},{"type":"text","value":".","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"zErOVWL4tc"}],"key":"fbd7y3HXZ6"}],"key":"Tc06mnzUbc"},{"type":"listItem","spread":true,"position":{"start":{"line":88,"column":1},"end":{"line":89,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"text","value":"Some ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"VvxeeTTFqU"},{"type":"strong","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"text","value":"initial state distribution","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"sdXNLHi6UK"}],"key":"VQJLik6qS0"},{"type":"text","value":" ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"JPxV86UA5w"},{"type":"inlineMath","value":"\\mu \\in \\triangle(\\mathcal{S})","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"μ(S)\\mu \\in \\triangle(\\mathcal{S})μ(S)","key":"yO1AfWaKJp"},{"type":"text","value":".","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"kPDkT8Lz7w"}],"key":"kTZdltJwXn"}],"key":"DEfObGb8GV"},{"type":"listItem","spread":true,"position":{"start":{"line":90,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":90,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"i3H4Py2TWP"},{"type":"strong","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"k8vr8jzvig"}],"key":"XiMfRUQx37"},{"type":"text","value":" (a.k.a. 
","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"zmiYjcEmcG"},{"type":"strong","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"dynamics","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"scbNLJMDPF"}],"key":"hkd006fjc5"},{"type":"text","value":")\n","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"XKn6sNpLjR"},{"type":"inlineMath","value":"P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S})","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"html":"P:S×A(S)P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S})P:S×A(S)","key":"gr0oDxvhMw"},{"type":"text","value":" that describe what state the agent\ntransitions to after taking an action.","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"KRIKg39wQC"}],"key":"FnUDLAIi9O"}],"key":"XKyFFHLUlO"},{"type":"listItem","spread":true,"position":{"start":{"line":94,"column":1},"end":{"line":98,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":94,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"LDJZNYZ2ds"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"reward","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"jWp0JYSDyd"}],"key":"nWrgRXhnwx"},{"type":"text","value":" signal. In this course we’ll take it to be a\ndeterministic function on state-action pairs,\n","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"uoZoI1qiKn"},{"type":"inlineMath","value":"r : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"r:S×ARr : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}r:S×AR","key":"imvw0GyiKU"},{"type":"text","value":", but in general many results will\nextend to a ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"VVVaPNO9Tp"},{"type":"emphasis","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"stochastic","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"wX0vSZWc4s"}],"key":"dPTPIWBLAR"},{"type":"text","value":" reward signal.","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"W5BzKKYUFN"}],"key":"yt44wkgX9y"}],"key":"YP8o5YxHbQ"},{"type":"listItem","spread":true,"position":{"start":{"line":99,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":99,"column":1},"end":{"line":100,"column":1}},"children":[{"type":"text","value":"A time horizon ","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"key":"ypDUYv7UZe"},{"type":"inlineMath","value":"\\hor \\in \\mathbb{N}","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"html":"HN\\hor \\in \\mathbb{N}HN","key":"PK7tvj7B9H"},{"type":"text","value":" that specifies the number of\ninteractions in an 
","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"key":"D4pLnJ9AAc"},{"type":"strong","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"children":[{"type":"text","value":"episode","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"key":"ZPD8kB1wkl"}],"key":"nsgWhdqqpO"},{"type":"text","value":".","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"key":"elStbc2tUC"}],"key":"eFFObekraK"}],"key":"HrMm00H9S5"}],"key":"iiJcQfXn6R"},{"type":"paragraph","position":{"start":{"line":102,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"Combined together, these objects specify a finite-horizon Markov\ndecision process:","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"bq3mPqKJNb"}],"key":"w9xBWG5g5Q"},{"type":"math","value":"M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\hor).","position":{"start":{"line":105,"column":1},"end":{"line":105,"column":1}},"html":"M=(S,A,μ,P,r,H).M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\hor).M=(S,A,μ,P,r,H).","enumerator":"1.2","key":"eufAnvAsGi"},{"type":"paragraph","position":{"start":{"line":107,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"text","value":"When there are ","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"wKpQnUXN8R"},{"type":"strong","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"text","value":"finitely","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"UMYPBsOLuF"}],"key":"S3ctMG1LCd"},{"type":"text","value":" many states and actions, i.e.\n","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"HN28941HmQ"},{"type":"inlineMath","value":"|\\mathcal{S}|, |\\mathcal{A}| < \\infty","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"html":"S,A<|\\mathcal{S}|, |\\mathcal{A}| < \\inftyS,A<","key":"ls4DfkY9xL"},{"type":"text","value":", we can express\nthe relevant quantities as vectors and matrices (i.e. 
","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"ga5DVOS9bz"},{"type":"emphasis","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"text","value":"tables","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"IdgppkL4Hn"}],"key":"TdXbCKAVG9"},{"type":"text","value":" of\nvalues):","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"UpN0MGG8Ph"}],"key":"mspoBT8WY5"},{"type":"math","value":"\\begin{aligned}\n \\mu &\\in [0, 1]^{|\\mathcal{S}|} &\n P &\\in [0, 1]^{(|\\mathcal{S} \\times \\mathcal{A}|) \\times |\\mathcal{S}|} &\n r &\\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}\n\\end{aligned}","position":{"start":{"line":112,"column":1},"end":{"line":118,"column":1}},"html":"μ[0,1]SP[0,1](S×A)×SrRS×A\\begin{aligned}\n \\mu &\\in [0, 1]^{|\\mathcal{S}|} &\n P &\\in [0, 1]^{(|\\mathcal{S} \\times \\mathcal{A}|) \\times |\\mathcal{S}|} &\n r &\\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}\n\\end{aligned}μ[0,1]SP[0,1](S×A)×SrRS×A","enumerator":"1.3","key":"OnUm2me1nu"}],"enumerator":"1.2","html_id":"finite-horizon-mdp","key":"J5zDySeaAU"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"pHTxTU5BaH"}],"key":"iVFUs67ULy"},{"type":"paragraph","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"children":[{"type":"text","value":"Verify that the types and shapes provided above make sense!","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"LfrtoTJ7hV"}],"key":"doT7KFPLM2"}],"key":"PXLjqMd2fX"}],"key":"mqI4kCAT5E"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class MDP(NamedTuple):\n \"\"\"A description of a Markov decision process with finitely many states and actions.\"\"\"\n S: int # number of states\n A: int # number of actions\n μ: Float[Array, \" S\"]\n P: Float[Array, \"S A S\"] # \"current\" state, \"current\" action, \"next\" state\n r: Float[Array, \"S A\"]\n H: int\n γ: float = 1.0 # discount factor (used later)","key":"jJno5x1oh1"},{"type":"output","id":"mzvnUaVALY7OepolD89HX","data":[],"key":"tIoCyVKWBK"}],"data":{},"key":"AJ5v8OJXNX"},{"type":"block","children":[{"type":"proof","kind":"example","label":"tidy_mdp","identifier":"tidy_mdp","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying MDP","position":{"start":{"line":137,"column":1},"end":{"line":137,"column":1}},"key":"yYz9OHtAF0"}],"key":"Dsl4OOz5RR"},{"type":"paragraph","position":{"start":{"line":140,"column":1},"end":{"line":141,"column":1}},"children":[{"type":"text","value":"Let’s consider a simple decision problem throughout this chapter:\nthe task of keeping your room tidy!","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"l7JOeEJYMD"}],"key":"SpkVXbrvKp"},{"type":"paragraph","position":{"start":{"line":143,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"text","value":"Your room has the possible states\n","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"key":"p8cX8EOL7H"},{"type":"inlineMath","value":"\\mathcal{S} = \\{ \\text{orderly}, \\text{messy} \\}.","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"html":"S={orderly,messy}.\\mathcal{S} = \\{ \\text{orderly}, \\text{messy} 
\\}.S={orderly,messy}.","key":"dyRJHktI8k"},{"type":"text","value":"\nYou can take either of the actions ","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"key":"cY3aYbm1jM"},{"type":"inlineMath","value":"\\mathcal{A} = \\{ \\text{ignore}, \\text{tidy} \\}.","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"html":"A={ignore,tidy}.\\mathcal{A} = \\{ \\text{ignore}, \\text{tidy} \\}.A={ignore,tidy}.","key":"n7kwWdkTnh"},{"type":"text","value":"\nThe room starts off orderly.","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"key":"nX6UOiAoq6"}],"key":"reHkSrRwlq"},{"type":"paragraph","position":{"start":{"line":148,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"q51lapGqUk"},{"type":"strong","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"posxycDpCD"}],"key":"SBegE1tXMt"},{"type":"text","value":" are as follows:\nif you tidy the room, it becomes (or remains) orderly;\nif you ignore the room, it ","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"PuVtsOl29M"},{"type":"emphasis","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"text","value":"might","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"WEG4sOBC3z"}],"key":"Muwd8AbMOa"},{"type":"text","value":" become messy (see table below).","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"Fva98bDSVZ"}],"key":"f1XAbS35uB"},{"type":"paragraph","position":{"start":{"line":152,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"rdyo1qORJZ"},{"type":"strong","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"text","value":"rewards","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"OzbjKMinDS"}],"key":"phhkA3olKs"},{"type":"text","value":" are as follows: You get penalized for tidying an orderly room (a waste of time) or ignoring a messy room,\nbut you get rewarded for ignoring an orderly room (since you can enjoy your additional time).\nTidying a messy room is a chore that gives no reward.","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"FmhRsBUagr"}],"key":"SjqTASxDhK"},{"type":"paragraph","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"text","value":"These are summarized in the following table:","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"key":"Z0TBLMsESq"}],"key":"BtLZ0MHAed"},{"type":"math","value":"\\begin{array}{ccccc}\n s & a & P(\\text{orderly} \\mid s, a) & P(\\text{messy} \\mid s, a) & r(s, a) \\\\\n \\text{orderly} & \\text{ignore} & 0.7 & 0.3 & 1 \\\\\n \\text{orderly} & \\text{tidy} & 1 & 0 & -1 \\\\\n \\text{messy} & \\text{ignore} & 0 & 1 & -1 \\\\\n \\text{messy} & \\text{tidy} & 1 & 0 & 0 \\\\\n\\end{array}","position":{"start":{"line":158,"column":1},"end":{"line":164,"column":1}},"html":"saP(orderlys,a)P(messys,a)r(s,a)orderlyignore0.70.31orderlytidy101messyignore011messytidy100\\begin{array}{ccccc}\n s & a & P(\\text{orderly} \\mid s, 
a) & P(\\text{messy} \\mid s, a) & r(s, a) \\\\\n \\text{orderly} & \\text{ignore} & 0.7 & 0.3 & 1 \\\\\n \\text{orderly} & \\text{tidy} & 1 & 0 & -1 \\\\\n \\text{messy} & \\text{ignore} & 0 & 1 & -1 \\\\\n \\text{messy} & \\text{tidy} & 1 & 0 & 0 \\\\\n\\end{array}sorderlyorderlymessymessyaignoretidyignoretidyP(orderlys,a)0.7101P(messys,a)0.3010r(s,a)1110","enumerator":"1.4","key":"GY9e7ClPrO"},{"type":"paragraph","position":{"start":{"line":166,"column":1},"end":{"line":167,"column":1}},"children":[{"type":"text","value":"Consider a time horizon of ","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"key":"RTtvBgjgOz"},{"type":"inlineMath","value":"\\hor = 7","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"html":"H=7\\hor = 7H=7","key":"aU4cPbVVuz"},{"type":"text","value":" days (one interaction per day). Let\n","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"key":"WCnE5SWin9"},{"type":"inlineMath","value":"t = 0","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"html":"t=0t = 0t=0","key":"XdrYmX82yd"},{"type":"text","value":" correspond to Monday and ","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"key":"G6xgNkjShv"},{"type":"inlineMath","value":"t = 6","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"html":"t=6t = 6t=6","key":"gXnBTTh4si"},{"type":"text","value":" correspond to Sunday.","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"key":"eMurmRwzwD"}],"key":"u9yoyrdinu"}],"enumerator":"1.1","html_id":"tidy-mdp","key":"i2hpL8zfY2"}],"key":"isjCa0d84w"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"tidy_mdp = MDP(\n S=2, # 0 = orderly, 1 = messy\n A=2, # 0 = ignore, 1 = tidy\n μ=jnp.array([1.0, 0.0]), # start in orderly state\n P=jnp.array([\n [\n [0.7, 0.3], # orderly, ignore\n [1.0, 0.0], # orderly, tidy\n ],\n [\n [0.0, 1.0], # messy, ignore\n [1.0, 0.0], # messy, tidy\n ],\n ]),\n r=jnp.array([\n [\n 1.0, # orderly, ignore\n -1.0, # orderly, tidy\n ],\n [\n -1.0, # messy, ignore\n 0.0, # messy, tidy\n ]\n ]),\n H=7,\n)","key":"Lb8W0azAKb"},{"type":"output","id":"iR9w1Kad3iw4xP_WLUdM6","data":[],"key":"WuLNcoIEtH"}],"data":{},"key":"qDSv7s7wDz"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":199,"column":1},"end":{"line":199,"column":1}},"children":[{"type":"text","value":"Policies","position":{"start":{"line":199,"column":1},"end":{"line":199,"column":1}},"key":"BNMznuuKtI"}],"identifier":"policies","label":"Policies","html_id":"policies","implicit":true,"enumerator":"1.2.2","key":"WIpppZSqmv"},{"type":"proof","kind":"definition","label":"policy","identifier":"policy","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policies","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"IqvxMhL3Mn"}],"key":"t509Uhj9GN"},{"type":"paragraph","position":{"start":{"line":204,"column":1},"end":{"line":206,"column":1}},"children":[{"type":"text","value":"A ","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"Yg7HnvW68w"},{"type":"strong","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"children":[{"type":"text","value":"policy","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"AHaDxGPqUm"}],"key":"RZnAJy51nv"},{"type":"text","value":" 
","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"Qlp7ld3iq9"},{"type":"text","value":"π","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"hzaEwQLE0w"},{"type":"text","value":" describes the agent’s strategy:\nwhich actions it takes in a given situation.\nA key goal of RL is to find the ","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"gEwofpWFhJ"},{"type":"strong","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"IEhT152Bqd"}],"key":"tZzHqkMoio"},{"type":"text","value":" that maximizes the total reward on average.","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"bZZq2GlzKq"}],"key":"RE4DCskPke"},{"type":"paragraph","position":{"start":{"line":208,"column":1},"end":{"line":209,"column":1}},"children":[{"type":"text","value":"There are three axes along which policies can vary: their outputs,\ninputs, and time-dependence.","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"asH4jNICZk"}],"key":"DCwgF1DrOn"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":211,"column":1},"end":{"line":214,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":211,"column":1},"end":{"line":214,"column":1}},"children":[{"type":"strong","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"children":[{"type":"text","value":"Deterministic or stochastic.","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"j0HAV2tWfT"}],"key":"AFhkSlvOpD"},{"type":"text","value":" A deterministic policy outputs\nactions while a stochastic policy outputs ","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"rmAycrPi52"},{"type":"emphasis","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"children":[{"type":"text","value":"distributions","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"kBI4zuYMRL"}],"key":"MRCSWe9iW3"},{"type":"text","value":" over\nactions.","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"G66UJHvDjD"}],"key":"X4PSWNRI34"}],"key":"YEak12Jvzc"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/deterministic_policy-9d0b50d69541007293ead345d987b682.png","alt":"A deterministic policy.","align":"center","data":{"altTextIsAutoGenerated":true},"key":"hoDZDSn3LR","urlSource":"./shared/deterministic_policy.png","urlOptimized":"/build/deterministic_policy-9d0b50d69541007293ead345d987b682.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":218,"column":1},"end":{"line":218,"column":1}},"children":[{"type":"text","value":"A deterministic policy.","position":{"start":{"line":218,"column":1},"end":{"line":218,"column":1}},"key":"pqsKmWv9O2"}],"key":"YqwV7EsC9S"}],"key":"o2JXAAZXXm"}],"enumerator":"1.1","key":"KtFfzNewmq"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/stochastic_policy-bc720a6ff54c4a27f3c7ec4de93b5c0d.png","alt":"A stochastic 
policy.","align":"center","data":{"altTextIsAutoGenerated":true},"key":"uNHa1RCKTh","urlSource":"./shared/stochastic_policy.png","urlOptimized":"/build/stochastic_policy-bc720a6ff54c4a27f3c7ec4de93b5c0d.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"children":[{"type":"text","value":"A stochastic policy.","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"key":"vklfn2Pru4"}],"key":"u9JvoX70oO"}],"key":"zejyDXYB48"}],"enumerator":"1.2","key":"pXIoE4bom5"},{"type":"list","ordered":true,"start":2,"spread":false,"position":{"start":{"line":227,"column":1},"end":{"line":236,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":227,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":227,"column":1},"end":{"line":231,"column":1}},"children":[{"type":"strong","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"children":[{"type":"text","value":"State-dependent or history-dependent.","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"bRU6tjISMY"}],"key":"XuBJB2zeqM"},{"type":"text","value":" A state-dependent (a.k.a.\n“Markovian”) policy only depends on the current state, while a\nhistory-dependent policy depends on the sequence of past states,\nactions, and rewards. We’ll only consider state-dependent policies\nin this course.","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"omqrv2m9FN"}],"key":"mzsOQMRphw"}],"key":"b228b6Ekw0"},{"type":"listItem","spread":true,"position":{"start":{"line":233,"column":1},"end":{"line":236,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":233,"column":1},"end":{"line":236,"column":1}},"children":[{"type":"strong","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"children":[{"type":"text","value":"Stationary or time-dependent.","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"y326mjb4ve"}],"key":"blCi32feZx"},{"type":"text","value":" A stationary (a.k.a. time-homogeneous) policy\nremains the same function at all time steps, while a time-dependent policy can depend on the current timestep.\nFor consistency with states and actions, we will denote the timestep as a subscript,\ni.e. 
","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"wYg1e5UeTT"},{"type":"inlineMath","value":"\\pi = \\{ \\pi_0, \\dots, \\pi_{\\hor-1} \\}.","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"html":"π={π0,,πH1}.\\pi = \\{ \\pi_0, \\dots, \\pi_{\\hor-1} \\}.π={π0,,πH1}.","key":"miyjIE4xXe"}],"key":"j84VphrvwA"}],"key":"ZBmBlGgVNP"}],"key":"dCeFM1w1nz"}],"enumerator":"1.3","html_id":"policy","key":"p1rodBmB2K"}],"key":"vufHjOdmGA"},{"type":"block","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":241,"column":1},"end":{"line":244,"column":1}},"children":[{"type":"text","value":"Note that for finite state and action spaces,\nwe can represent a randomized mapping ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"kWnwzY8Z3V"},{"type":"inlineMath","value":"\\mathcal{S} \\to \\Delta(\\mathcal{A})","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"SΔ(A)\\mathcal{S} \\to \\Delta(\\mathcal{A})SΔ(A)","key":"w17yjY8sO0"},{"type":"text","value":"\nas a matrix ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"fbrYUrVsHq"},{"type":"inlineMath","value":"\\pi \\in [0, 1]^{\\mathcal{S} \\times \\mathcal{A}}","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"π[0,1]S×A\\pi \\in [0, 1]^{\\mathcal{S} \\times \\mathcal{A}}π[0,1]S×A","key":"mx2jd9lAzH"},{"type":"text","value":" where each row describes\nthe policy’s distribution over actions for the corresponding state.","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"bcHrkazAG8"}],"key":"EXORPvA53d"},{"type":"paragraph","position":{"start":{"line":246,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"A fascinating result is that every finite-horizon MDP has an optimal deterministic time-dependent policy!\nIntuitively, the Markov property implies that the current state contains all the information we need to make the optimal decision.\nWe’ll prove this result constructively later in the chapter.","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"GSvTE2vRgd"}],"key":"urVYK7MWTd"},{"type":"proof","kind":"example","label":"tidy_policy","identifier":"tidy_policy","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policies for the tidying MDP","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"key":"zkd0FNjf9r"}],"key":"kUrHlbYNmc"},{"type":"paragraph","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"children":[{"type":"text","value":"Here are some possible policies for the tidying MDP ","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"key":"W6wpjnuU2F"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_mdp","label":"tidy_mdp","children":[{"type":"text","value":"Example ","key":"FDR8O14Klm"},{"type":"text","value":"1.1","key":"bZrusmlBuE"}],"template":"Example 
%s","enumerator":"1.1","resolved":true,"html_id":"tidy-mdp","key":"k5dFqwDpO0"},{"type":"text","value":":","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"key":"pOjsthzsed"}],"key":"IS0vZkesxE"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":255,"column":1},"end":{"line":261,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":255,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"children":[{"type":"text","value":"Always tidy: ","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"key":"zds4L2wRPN"},{"type":"inlineMath","value":"\\pi(s) = \\text{tidy}","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"html":"π(s)=tidy\\pi(s) = \\text{tidy}π(s)=tidy","key":"Ja71AnHu2e"},{"type":"text","value":".","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"key":"BUifrGRTqu"}],"key":"qZnkPIyxyO"}],"key":"fUH5MOb0aY"},{"type":"listItem","spread":true,"position":{"start":{"line":257,"column":1},"end":{"line":259,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":257,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"Only tidy on weekends: ","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"IL1piWUzRu"},{"type":"inlineMath","value":"\\pi_\\hi(s) = \\text{tidy}","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"πh(s)=tidy\\pi_\\hi(s) = \\text{tidy}πh(s)=tidy","key":"Egni4YDPoq"},{"type":"text","value":" if\n","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"qk1aBTL9JI"},{"type":"inlineMath","value":"\\hi \\in \\{ 5, 6 \\}","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"h{5,6}\\hi \\in \\{ 5, 6 \\}h{5,6}","key":"lz2IZpwiNq"},{"type":"text","value":" and ","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"vYO45vYPzu"},{"type":"inlineMath","value":"\\pi_\\hi(s) = \\text{ignore}","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"πh(s)=ignore\\pi_\\hi(s) = \\text{ignore}πh(s)=ignore","key":"dWI38FdvVz"},{"type":"text","value":" otherwise.","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"V9OEAx4NSU"}],"key":"nekUnThMKv"}],"key":"ov8ScCwf3M"},{"type":"listItem","spread":true,"position":{"start":{"line":260,"column":1},"end":{"line":261,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":260,"column":1},"end":{"line":261,"column":1}},"children":[{"type":"text","value":"Only tidy if the room is messy: ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"levYisn6fk"},{"type":"inlineMath","value":"\\pi_\\hi(\\text{messy}) = \\text{tidy}","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"πh(messy)=tidy\\pi_\\hi(\\text{messy}) = \\text{tidy}πh(messy)=tidy","key":"T5z7xPwVsl"},{"type":"text","value":"\nand ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"Y4sDc6SnCZ"},{"type":"inlineMath","value":"\\pi_\\hi(\\text{orderly}) = \\text{ignore}","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"πh(orderly)=ignore\\pi_\\hi(\\text{orderly}) = \\text{ignore}πh(orderly)=ignore","key":"LzpbgAh4vZ"},{"type":"text","value":" for all 
","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"h7YExN5vgI"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"h\\hih","key":"QiBT3IV7gd"},{"type":"text","value":".","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"sE3tTqkH03"}],"key":"uU1ZEirMfk"}],"key":"siCy81Ztw2"}],"key":"nM5dwuU5rL"}],"enumerator":"1.2","html_id":"tidy-policy","key":"XKNzF8LnxE"}],"key":"Zv3F4CktAi"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"# arrays of shape (H, S, A) represent time-dependent policies\ntidy_policy_always_tidy = (\n jnp.zeros((7, 2, 2))\n .at[:, :, 1].set(1.0)\n)\ntidy_policy_weekends = (\n jnp.zeros((7, 2, 2))\n .at[5:7, :, 1].set(1.0)\n .at[0:5, :, 0].set(1.0)\n)\ntidy_policy_messy_only = (\n jnp.zeros((7, 2, 2))\n .at[:, 1, 1].set(1.0)\n .at[:, 0, 0].set(1.0)\n)","key":"H7OMxk4c61"},{"type":"output","id":"ml0ab07MTrMwSZ-XaKG0V","data":[],"key":"N3owY2U2KV"}],"data":{},"key":"ciEe2l1kEZ"},{"type":"block","children":[{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"EXLAGNo4DT"}],"key":"lVhQFUqJ1F"},{"type":"paragraph","position":{"start":{"line":283,"column":1},"end":{"line":285,"column":1}},"children":[{"type":"text","value":"Array objects in Jax are ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"sfvU4xw4me"},{"type":"strong","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"text","value":"immutable,","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"EAjeVohfmM"}],"key":"FMhmNAKN3m"},{"type":"text","value":" that is, they cannot be ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"skTPURrHBS"},{"type":"emphasis","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"text","value":"changed.","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"Jm7iRoAkm8"}],"key":"rPYy5anH1S"},{"type":"text","value":"\nThis might seem inconvenient, but in larger projects,\nimmutability makes code much easier to reason about.","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"ReDOVqxzim"}],"key":"ZcqOkkhRgo"}],"key":"p1EM7TJt3t"}],"key":"tnPO4osUdc"},{"type":"block","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"children":[{"type":"text","value":"Trajectories","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"aQBSVgYRN9"}],"label":"trajectories","identifier":"trajectories","html_id":"trajectories","enumerator":"1.2.3","key":"LXwacCtMlf"},{"type":"proof","kind":"definition","label":"trajectory","identifier":"trajectory","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Trajectories","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"KcYeY3Fuav"}],"key":"yxik7kpFkr"},{"type":"paragraph","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"children":[{"type":"text","value":"A sequence of states, actions, and rewards is called a 
","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"G4fkO3iqv5"},{"type":"strong","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"children":[{"type":"text","value":"trajectory","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"YJdVAObrp1"}],"key":"CcPlMa2sEB"},{"type":"text","value":":","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"IltO9RABJi"}],"key":"nGr5uGGxg8"},{"type":"math","value":"\\tau = (s_0, a_0, r_0, \\dots, s_{H-1}, a_{H-1}, r_{H-1})","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"html":"τ=(s0,a0,r0,,sH1,aH1,rH1)\\tau = (s_0, a_0, r_0, \\dots, s_{H-1}, a_{H-1}, r_{H-1})τ=(s0,a0,r0,,sH1,aH1,rH1)","enumerator":"1.5","key":"DYmmTFVhf7"},{"type":"paragraph","position":{"start":{"line":300,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"raT35Ak11w"},{"type":"inlineMath","value":"r_\\hi = r(s_\\hi, a_\\hi)","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"html":"rh=r(sh,ah)r_\\hi = r(s_\\hi, a_\\hi)rh=r(sh,ah)","key":"gLHvuudhHX"},{"type":"text","value":".\n(Note that some sources omit the reward at the final time step. This is a minor detail.)","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"a9JWh2RJ0V"}],"key":"Ug3bDh5MIt"}],"enumerator":"1.4","html_id":"trajectory","key":"IM5Yup7puL"}],"key":"O2L5H9mHBT"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class Transition(NamedTuple):\n \"\"\"A single state-action-reward interaction with the environment.\n\n A trajectory comprises a sequence of transitions.\n \"\"\"\n s: int\n a: int\n r: float","key":"a6LoRNEBnX"},{"type":"output","id":"2E7iizq9o92VpiPyHVLND","data":[],"key":"DIXTcL4cWZ"}],"data":{},"key":"LXKUzXUIPq"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":315,"column":1},"end":{"line":317,"column":1}},"children":[{"type":"text","value":"Once we’ve chosen a policy,\nwe can sample trajectories by repeatedly choosing actions according to the policy,\ntransitioning according to the state transitions, and observing the rewards.","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"ZykA1Ahipp"}],"key":"VUmVdj7hp4"},{"type":"image","url":"/build/trajectory-ea534afbae8ad1151663ff974e306d5e.png","width":"240px","align":"center","key":"EhVUmKm1Iz","urlSource":"shared/trajectory.png","urlOptimized":"/build/trajectory-ea534afbae8ad1151663ff974e306d5e.webp"},{"type":"paragraph","position":{"start":{"line":324,"column":1},"end":{"line":325,"column":1}},"children":[{"type":"text","value":"That is, a policy induces a distribution ","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"KiFBLtA534"},{"type":"inlineMath","value":"\\rho^{\\pi}","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"html":"ρπ\\rho^{\\pi}ρπ","key":"PxUHmwBdqM"},{"type":"text","value":" over trajectories.\n(We assume that ","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"jIjjwecRae"},{"type":"text","value":"μ","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"hdsNdivGQR"},{"type":"text","value":" and 
","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"FMt3tnef2v"},{"type":"inlineMath","value":"P","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"html":"PPP","key":"hZgb6vWhr5"},{"type":"text","value":" are clear from context.)","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"HwEt1W8jid"}],"key":"XcDPs7Dm6M"},{"type":"proof","kind":"example","label":"tidy_traj","identifier":"tidy_traj","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Trajectories in the tidying environment","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"GHJCRP3Sn7"}],"key":"hMlA3ayKZQ"},{"type":"paragraph","position":{"start":{"line":330,"column":1},"end":{"line":330,"column":1}},"children":[{"type":"text","value":"Here is a possible trajectory for the tidying example:","position":{"start":{"line":330,"column":1},"end":{"line":330,"column":1}},"key":"oIcrZ2j3Nq"}],"key":"ZlnaUkUbTz"},{"type":"container","kind":"table","children":[{"type":"table","position":{"start":{"line":333,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"tableRow","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"inlineMath","value":"\\hi","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"h\\hih","key":"m9BQlrSLve"}],"key":"z2xyeMugdT"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"0","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"RgeJ9mnjKu"}],"key":"zm5d2sitia"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"BNCYAIk8Ap"}],"key":"Y35EgA691k"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"2","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"AZr5iSeLzT"}],"key":"ZzknCQF1zc"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"3","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"nTI36G8hbd"}],"key":"L3yjvBFbKw"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"4","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"fbxsMNtQWf"}],"key":"R1acl3loqQ"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"5","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"TNVszDr5Op"}],"key":"CG85mKyAS0"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"6","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"MYudHrbZxE"}],"key":"dNH0
cHH4va"}],"key":"mrGQ65rxX4"},{"type":"tableRow","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"inlineMath","value":"s","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"sss","key":"u5zwfU4aSJ"}],"key":"CjxbV7La6B"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"umxT61MIKd"}],"key":"HHQwBrpZGQ"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"vpxUISFtXT"}],"key":"KqcmiVxZeM"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"XO9DgAInkq"}],"key":"cUSbRpXrVK"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"messy","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"P5Jo9thIHB"}],"key":"lGFejvI7sT"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"messy","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"sYclhZJ4Vm"}],"key":"PxffCI6l4y"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"C9r5qxwwd8"}],"key":"KPmhCIEJki"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"iiXiXsbpKs"}],"key":"i66mQOTEHL"}],"key":"aTGj5JSdDy"},{"type":"tableRow","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"inlineMath","value":"a","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"html":"aaa","key":"QveOMrayyj"}],"key":"g5DcB4TdoG"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"tidy","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"CKIZt18XaO"}],"key":"MHnajdmrCD"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"P86FrjnX4a"}],"key":"Wds8UbyKFW"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"Y7cQxjZTfm"}],"key":"c3az0RtuyA"}
,{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"Skv3NVNQdf"}],"key":"PQNoNydA6N"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"tidy","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"tN7u9e6Nvr"}],"key":"UocXlZTRGz"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"agxZ1MD4T8"}],"key":"y21LdRV4lP"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"RQhv7SwP5Y"}],"key":"PuHTY4nOqU"}],"key":"WJvNi9y9r0"},{"type":"tableRow","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"inlineMath","value":"r","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"html":"rrr","key":"dVglnkvTBS"}],"key":"RbUcodUnbt"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"-1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"ZsXojoLc8Y"}],"key":"HP0FiLQ7Qt"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"qLAiTLZvYj"}],"key":"WiBQerIvCU"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"yWJI0IuydC"}],"key":"WO5p8192Ui"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"-1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"RuPWXkGYji"}],"key":"iL97O644r5"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"0","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"UkezKlQ0zc"}],"key":"dMq6AWybt9"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"Mt5cdBK4bN"}],"key":"bGGTjhGNYX"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"alh7CCd81z"}],"key":"i7yGZsbUEu"}],"key":"lr6doU75q5"}],"key":"QBPBKmGi8I"}],"enumerator":"1.1","key":"R75yGROeMM"},{"type":"paragraph","position":{"start":{"line":340,"column":1},"end":{"line":340,"column":1}},"childr
en":[{"type":"text","value":"Could any of the policies in ","position":{"start":{"line":340,"column":1},"end":{"line":340,"column":1}},"key":"oTxyH4d685"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_policy","label":"tidy_policy","children":[{"type":"text","value":"Example ","key":"kV057X2bfL"},{"type":"text","value":"1.2","key":"xoW0KK1KXF"}],"template":"Example %s","enumerator":"1.2","resolved":true,"html_id":"tidy-policy","key":"M9AKd4UACm"},{"type":"text","value":" have generated this trajectory?","position":{"start":{"line":340,"column":1},"end":{"line":340,"column":1}},"key":"lsiGAKpLed"}],"key":"uC64f8LZwh"}],"enumerator":"1.3","html_id":"tidy-traj","key":"YPw8WXYboD"},{"type":"paragraph","position":{"start":{"line":343,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"Note that for a state-dependent policy, using the Markov property ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"hRPLnTjHTj"},{"type":"crossReference","kind":"proof:definition","identifier":"markov","label":"markov","children":[{"type":"text","value":"Definition ","key":"PQHRlNdN01"},{"type":"text","value":"1.1","key":"f4spORheBi"}],"template":"Definition %s","enumerator":"1.1","resolved":true,"html_id":"markov","key":"v5v3uGqn2W"},{"type":"text","value":",\nwe can write down the likelihood function of this probability distribution in an ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"Suvcb3YFyE"},{"type":"strong","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"text","value":"autoregressive","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"RI5yR6gALM"}],"key":"TN7M3E3I7H"},{"type":"text","value":" way (i.e. 
one timestep at a time):","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"bAGu1xIccJ"}],"key":"NvkQBZZIgz"},{"type":"proof","kind":"definition","label":"autoregressive_trajectories","identifier":"autoregressive_trajectories","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Autoregressive trajectory distribution","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"WrUnI8HMNc"}],"key":"QGAzd2QItS"},{"type":"math","value":"\\rho^{\\pi}(\\tau) := \\mu(s_0) \\pi_0(a_0 \\mid s_0) P(s_1 \\mid s_0, a_0) \\cdots P(s_{\\hor-1} \\mid s_{\\hor-2}, a_{\\hor-2}) \\pi_{\\hor-1}(a_{\\hor-1} \\mid s_{\\hor-1})","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"html":"ρπ(τ):=μ(s0)π0(a0s0)P(s1s0,a0)P(sH1sH2,aH2)πH1(aH1sH1)\\rho^{\\pi}(\\tau) := \\mu(s_0) \\pi_0(a_0 \\mid s_0) P(s_1 \\mid s_0, a_0) \\cdots P(s_{\\hor-1} \\mid s_{\\hor-2}, a_{\\hor-2}) \\pi_{\\hor-1}(a_{\\hor-1} \\mid s_{\\hor-1})ρπ(τ):=μ(s0)π0(a0s0)P(s1s0,a0)P(sH1sH2,aH2)πH1(aH1sH1)","enumerator":"1.6","key":"h58CFI8W6T"}],"enumerator":"1.5","html_id":"autoregressive-trajectories","key":"yVuWR0xAdp"}],"key":"w6v6pfQC1C"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def trajectory_log_likelihood(\n mdp: MDP,\n τ: list[Transition],\n π: Float[Array, \"S A\"],\n) -> float:\n \"\"\"Compute the log-likelihood of a trajectory under a given MDP and policy.\"\"\"\n\n # initial distribution and action\n total = jnp.log(mdp.μ[τ[0].s])\n total += jnp.log(π[τ[0].s, τ[0].a])\n\n # remaining state transitions and actions\n for i in range(1, mdp.H):\n total += jnp.log(mdp.P[τ[i - 1].s, τ[i - 1].a, τ[i].s])\n total += jnp.log(π[τ[i].s, τ[i].a])\n\n return total","key":"RenoEWvZuT"},{"type":"output","id":"dszYr90dG_2Ak092bkQxX","data":[],"key":"MX1tPhzEof"}],"data":{},"key":"vv5fhEW7EN"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"Rh2ZMvwhhx"}],"key":"Rmlc7PIi0D"},{"type":"paragraph","position":{"start":{"line":373,"column":1},"end":{"line":373,"column":1}},"children":[{"type":"text","value":"How would you modify this to include stochastic rewards?","position":{"start":{"line":373,"column":1},"end":{"line":373,"column":1}},"key":"FbpWwb35rD"}],"key":"pKYYya3MzV"}],"key":"uwjtvDjHzW"},{"type":"paragraph","position":{"start":{"line":376,"column":1},"end":{"line":378,"column":1}},"children":[{"type":"text","value":"For a deterministic policy ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"UKCwY1rJQI"},{"type":"text","value":"π","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"ECUOUpF1D2"},{"type":"text","value":", we have that ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"wgrepgvMjZ"},{"type":"inlineMath","value":"\\pi_\\hi(a \\mid s) = \\mathbb{I}[a = \\pi_\\hi(s)]","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"html":"πh(as)=I[a=πh(s)]\\pi_\\hi(a \\mid s) = \\mathbb{I}[a = \\pi_\\hi(s)]πh(as)=I[a=πh(s)]","key":"DxyRstynCn"},{"type":"text","value":";\nthat is, the probability of taking an action is 
","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"fnJbgUZBGM"},{"type":"text","value":"1","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"RafV3WM7mH"},{"type":"text","value":" if it’s the unique action prescribed by the policy for that state and ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"xNK7dMNSkU"},{"type":"text","value":"0","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"cvfBcfglio"},{"type":"text","value":" otherwise.\nIn this case, the only randomness in sampling trajectories comes from the initial state distribution ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"kZrtpqwFai"},{"type":"text","value":"μ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"R1n9hM9yGS"},{"type":"text","value":" and the state transitions ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"M7pVvXTHyk"},{"type":"inlineMath","value":"P","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"html":"PPP","key":"w3UYbi06n5"},{"type":"text","value":".","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"czK1NgdZ9j"}],"key":"tB10JdaHpg"}],"key":"bksz4UzqDT"},{"type":"block","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":382,"column":1},"end":{"line":382,"column":1}},"children":[{"type":"text","value":"Value functions","position":{"start":{"line":382,"column":1},"end":{"line":382,"column":1}},"key":"Qq05CvH7k1"}],"identifier":"value-functions","label":"Value functions","html_id":"value-functions","implicit":true,"enumerator":"1.2.4","key":"thU9jIaJiR"},{"type":"paragraph","position":{"start":{"line":384,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"The main goal of RL is to find a policy that maximizes the expected total\nreward ","position":{"start":{"line":384,"column":1},"end":{"line":384,"column":1}},"key":"Je3ejjnlZS"},{"type":"inlineMath","value":"\\E [r_0 + \\cdots + r_{\\hor-1}]","position":{"start":{"line":384,"column":1},"end":{"line":384,"column":1}},"html":"E[r0++rH1]\\E [r_0 + \\cdots + r_{\\hor-1}]E[r0++rH1]","key":"hzq1DmZr9l"},{"type":"text","value":".","position":{"start":{"line":384,"column":1},"end":{"line":384,"column":1}},"key":"IstSYQpSsZ"}],"key":"EE23LasPNL"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"wcxYTlmgZU"}],"key":"wpyEiYXMbq"},{"type":"paragraph","position":{"start":{"line":388,"column":1},"end":{"line":390,"column":1}},"children":[{"type":"text","value":"Note that ","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"h7l6iW4Fze"},{"type":"inlineMath","value":"r_0 + \\cdots + r_{\\hor-1}","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"html":"r0++rH1r_0 + \\cdots + r_{\\hor-1}r0++rH1","key":"HjTwkbbGhj"},{"type":"text","value":" is a random variable.\nWhat sources of randomness does it depend on?\nDescribe the generating process.","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"EnUg8Yurbo"}],"key":"GZpi7OucxQ"}],"key":"hSmUtQ3Egr"},{"type":"paragraph","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"children":[{"type":"text","value":"Let’s introduce some notation 
for analyzing this quantity.","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"key":"RjhWTMJpc6"}],"key":"Tws6dvIw7R"},{"type":"paragraph","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"A policy’s ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"wCcxlyUQ5g"},{"type":"strong","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"value function","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"d1CdanR105"}],"key":"G7ZegeDYqU"},{"type":"text","value":" at time ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"H5XevVf9ui"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"h\\hih","key":"MViz5Sbx2d"},{"type":"text","value":" is its expected remaining reward ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"RDDYcQ80Pt"},{"type":"emphasis","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"from a given state","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"wgMDYPqra8"}],"key":"Nr28lqf09K"},{"type":"text","value":":","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"Ztls6fGsyD"}],"key":"P0ATYXXqds"},{"type":"proof","kind":"definition","label":"value","identifier":"value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Value function","position":{"start":{"line":397,"column":1},"end":{"line":397,"column":1}},"key":"VxmImDL5OK"}],"key":"WQXxiTgei6"},{"type":"math","value":"V_\\hi^\\pi(s) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s]","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"html":"Vhπ(s):=Eτρπ[rh++rH1sh=s]V_\\hi^\\pi(s) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s]Vhπ(s):=Eτρπ[rh++rH1sh=s]","enumerator":"1.7","key":"i3Tg9pEWbz"}],"enumerator":"1.6","html_id":"value","key":"ftgVyN07RI"},{"type":"paragraph","position":{"start":{"line":403,"column":1},"end":{"line":404,"column":1}},"children":[{"type":"text","value":"Similarly, we can define the ","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"aBN5WicUZO"},{"type":"strong","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"action-value function","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"yWcNhfZDS0"}],"key":"kJVFlesCvH"},{"type":"text","value":" (aka the\n","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"CDrZhSsEjf"},{"type":"strong","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"Q-function","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"izqhN2tO72"}],"key":"ZPzcEslLxL"},{"type":"text","value":") at time ","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"Tb54TaZq1E"},{"type":"inlineMath","value":"h","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"html":"hhh","key":"arRN5rMpDS"},{"type":"text","value":" as the expected remaining reward 
","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"Kxdms1XXoR"},{"type":"emphasis","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"from a given state and taking a given action","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"hXrZlegK2L"}],"key":"H9nmW1qGbb"},{"type":"text","value":":","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"ukyZPKAfjC"}],"key":"pZXyKaZo74"},{"type":"proof","kind":"definition","label":"action_value","identifier":"action_value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Action-value function","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"u65r0pIdZH"}],"key":"s4L8jAp5yx"},{"type":"math","value":"Q_\\hi^\\pi(s, a) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s, a_\\hi = a]","position":{"start":{"line":409,"column":1},"end":{"line":409,"column":1}},"html":"Qhπ(s,a):=Eτρπ[rh++rH1sh=s,ah=a]Q_\\hi^\\pi(s, a) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s, a_\\hi = a]Qhπ(s,a):=Eτρπ[rh++rH1sh=s,ah=a]","enumerator":"1.8","key":"sOyNZarsox"}],"enumerator":"1.7","html_id":"action-value","key":"Y6akPE87DV"}],"key":"rGqXBS5x0W"},{"type":"block","position":{"start":{"line":412,"column":1},"end":{"line":412,"column":1}},"children":[{"type":"heading","depth":4,"position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"children":[{"type":"text","value":"Relating the value function and action-value function","position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"key":"flMXt6ItZ8"}],"identifier":"relating-the-value-function-and-action-value-function","label":"Relating the value function and action-value function","html_id":"relating-the-value-function-and-action-value-function","implicit":true,"enumerator":"1.2.4.1","key":"TYzhDzEUmu"},{"type":"paragraph","position":{"start":{"line":416,"column":1},"end":{"line":417,"column":1}},"children":[{"type":"text","value":"Note that the value function is just the expected action-value over\nactions drawn from the policy:","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"eoNOfER7o3"}],"key":"Vuf14ltKns"},{"type":"math","value":"V_\\hi^\\pi(s) = \\E_{a \\sim \\pi_\\hi(s)} [Q_\\hi^\\pi(s, a)]","position":{"start":{"line":419,"column":1},"end":{"line":419,"column":1}},"html":"Vhπ(s)=Eaπh(s)[Qhπ(s,a)]V_\\hi^\\pi(s) = \\E_{a \\sim \\pi_\\hi(s)} [Q_\\hi^\\pi(s, a)]Vhπ(s)=Eaπh(s)[Qhπ(s,a)]","enumerator":"1.9","key":"KTu2RGsDYB"}],"key":"rhUjhi64X2"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def q_to_v(\n policy: Float[Array, \"S A\"],\n q: Float[Array, \"S A\"],\n) -> Float[Array, \" S\"]:\n \"\"\"\n Compute the value function for a given policy in a known finite MDP\n at a single timestep from its action-value function.\n \"\"\"\n return jnp.average(q, weights=policy, axis=1)","key":"XIcz9NLBn0"},{"type":"output","id":"eDiBC3NeqfcTrHPvjw6Tb","data":[],"key":"NbrFPaOClF"}],"data":{},"key":"d4V6K8kuUT"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":433,"column":1},"end":{"line":434,"column":1}},"children":[{"type":"text","value":"and the action-value is the sum of the immediate reward and the expected value of the 
following\nstate:","position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"key":"TZBMc0stiW"}],"key":"w0yzx0UXvP"},{"type":"math","value":"Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\pi(s')]","position":{"start":{"line":436,"column":1},"end":{"line":436,"column":1}},"html":"Qhπ(s,a)=r(s,a)+EsP(s,a)[Vh+1π(s)]Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\pi(s')]Qhπ(s,a)=r(s,a)+EsP(s,a)[Vh+1π(s)]","enumerator":"1.10","key":"SJgXG5MiHV"}],"key":"Wpe2XthFhU"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def v_to_q(\n mdp: MDP,\n v_next: Float[Array, \" S\"],\n) -> Float[Array, \"S A\"]:\n \"\"\"\n Compute the action-value function in a known finite MDP\n at a single timestep from the corresponding value function.\n \"\"\"\n # the discount factor is relevant later\n return mdp.r + mdp.γ * mdp.P @ v_next\n\n\n# convert a list of v functions to a list of q functions\nv_ary_to_q_ary = jax.vmap(v_to_q, in_axes=(None, 0))","key":"VbcAjBac2s"},{"type":"output","id":"XB9p1De2paS08gkC0r2cT","data":[],"key":"bGo8MmxSfn"}],"data":{},"key":"Cf9LPJm2IW"},{"type":"block","children":[{"type":"heading","depth":4,"position":{"start":{"line":455,"column":1},"end":{"line":455,"column":1}},"children":[{"type":"text","value":"Greedy policies","position":{"start":{"line":455,"column":1},"end":{"line":455,"column":1}},"key":"oV3FsbXY05"}],"identifier":"greedy-policies","label":"Greedy policies","html_id":"greedy-policies","implicit":true,"enumerator":"1.2.4.2","key":"nAeiJ0xqPo"},{"type":"paragraph","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"For any given ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"s9pOqDGgpx"},{"type":"inlineMath","value":"Q \\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"QRS×AQ \\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}QRS×A","key":"iImjRMQl33"},{"type":"text","value":", we can define the ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"OAsJ5CMG6c"},{"type":"strong","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"greedy policy","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"AlzqpAPQ7m"}],"key":"JzSikxQfZV"},{"type":"text","value":" ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"qv3o7jyPRz"},{"type":"inlineMath","value":"\\hat \\pi_Q","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"π^Q\\hat \\pi_Qπ^Q","key":"GDgiG2qrJq"},{"type":"text","value":" as the deterministic policy that selects the action with the highest ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"a3leS1v1dQ"},{"type":"inlineMath","value":"Q","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"QQQ","key":"O9wRO5k55r"},{"type":"text","value":"-value at each state:","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"H0plZ6zkkB"}],"key":"YrrOsizF8M"},{"type":"math","value":"\\hat \\pi_Q(s) = \\arg\\max_{a} Q_{sa}","position":{"start":{"line":459,"column":1},"end":{"line":461,"column":1}},"html":"π^Q(s)=argmaxaQsa\\hat \\pi_Q(s) = \\arg\\max_{a} 
Q_{sa}π^Q(s)=argamaxQsa","enumerator":"1.11","key":"tVUZTqMa0Y"}],"key":"wEUESvhtLD"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def q_to_greedy(q: Float[Array, \"S A\"]) -> Float[Array, \"S A\"]:\n \"\"\"\n Get the (deterministic) greedy policy with respect to an action-value function.\n Return the policy as a matrix of shape (S, A) where each row is a one-hot vector.\n \"\"\"\n A = q.shape[1]\n a_ary = jnp.argmax(q, axis=1)\n return jnp.eye(A)[a_ary]\n\n\ndef v_to_greedy(mdp: MDP, v: Float[Array, \" S\"]) -> Float[Array, \"S A\"]:\n \"\"\"Get the (deterministic) greedy policy with respect to a value function.\"\"\"\n return q_to_greedy(v_to_q(mdp, v))","key":"KPZxTFtuPW"},{"type":"output","id":"usD5cW7_ONIlp9iWX1r0f","data":[],"key":"qwA3uybbWJ"}],"data":{},"key":"sD7kEKBlSY"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"children":[{"type":"text","value":"The one-step (Bellman) consistency equation","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"ABj2y23OR4"}],"identifier":"the-one-step-bellman-consistency-equation","label":"The one-step (Bellman) consistency equation","html_id":"the-one-step-bellman-consistency-equation","implicit":true,"enumerator":"1.2.5","key":"MrN9es6E4J"},{"type":"paragraph","position":{"start":{"line":481,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"Note that by simply considering the cumulative reward as the sum of the\n","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"yhaLxgTKDb"},{"type":"emphasis","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"current","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"tpTzz8Rlqj"}],"key":"z9oBV2euMN"},{"type":"text","value":" reward and the ","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"rKPcjXat0K"},{"type":"emphasis","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"future","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"tLpVflus7C"}],"key":"M8mHrpQMA9"},{"type":"text","value":" cumulative reward, we can describe the\nvalue function recursively (in terms of itself). 
This is named the\n","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"jBi0Yr3Q37"},{"type":"strong","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"Bellman consistency equation","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"yQ0PSm8rDj"}],"key":"aKNQi9icp9"},{"type":"text","value":" after ","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"n1OHdwX6zR"},{"type":"strong","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"Richard Bellman","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"pgw8vmwbiu"}],"key":"kpYuXDnD7v"},{"type":"text","value":" (1920--1984),\nwho is credited with introducing dynamic programming in 1953.","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"L5JbwG8c1L"}],"key":"ENXhctd9CG"},{"type":"proof","kind":"theorem","label":"bellman_consistency","identifier":"bellman_consistency","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman consistency equation for the value function","position":{"start":{"line":487,"column":1},"end":{"line":487,"column":1}},"key":"ZgYa1q5L4P"}],"key":"cvHQNlltbP"},{"type":"math","value":"V_\\hi^\\pi(s) = \\E_{\\substack{a \\sim \\pi_\\hi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + V_{\\hi+1}^\\pi(s')]","position":{"start":{"line":490,"column":1},"end":{"line":492,"column":1}},"html":"Vhπ(s)=Eaπh(s)sP(s,a)[r(s,a)+Vh+1π(s)]V_\\hi^\\pi(s) = \\E_{\\substack{a \\sim \\pi_\\hi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + V_{\\hi+1}^\\pi(s')]Vhπ(s)=Eaπh(s)sP(s,a)[r(s,a)+Vh+1π(s)]","enumerator":"1.12","key":"Yyy6nFU7qX"}],"enumerator":"1.1","html_id":"bellman-consistency","key":"TXjUwbBrJN"}],"key":"dICYnMAHRm"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def check_bellman_consistency_v(\n mdp: MDP,\n policy: Float[Array, \"H S A\"],\n v_ary: Float[Array, \"H S\"],\n) -> bool:\n \"\"\"\n Check that the given (time-dependent) \"value function\"\n satisfies the Bellman consistency equation.\n \"\"\"\n return all(\n jnp.allclose(\n # lhs\n v_ary[h],\n # rhs\n jnp.sum(policy[h] * (mdp.r + mdp.γ * mdp.P @ v_ary[h + 1]), axis=1),\n )\n for h in range(mdp.H - 1)\n )","key":"DIodALJ0BY"},{"type":"output","id":"JxGaMz-Db2PYuQCCGu7Kd","data":[],"key":"vI8Y3JQnhB"}],"data":{},"key":"ZLNzec954j"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"AalDu5zbE8"}],"key":"UFeyhXV6zh"},{"type":"paragraph","position":{"start":{"line":517,"column":1},"end":{"line":518,"column":1}},"children":[{"type":"text","value":"Verify that this equation holds by expanding ","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"USwMRJpDxz"},{"type":"inlineMath","value":"V_\\hi^\\pi(s)","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"html":"Vhπ(s)V_\\hi^\\pi(s)Vhπ(s)","key":"Z1qQq8h0nc"},{"type":"text","value":"\nand 
","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"QAAN7f4yme"},{"type":"inlineMath","value":"V_{\\hi+1}^\\pi(s')","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"html":"Vh+1π(s)V_{\\hi+1}^\\pi(s')Vh+1π(s)","key":"zv0c9iyKtp"},{"type":"text","value":".","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"pniLx3FyO9"}],"key":"kQP08fAUmI"}],"key":"gRyVxNsxO4"},{"type":"paragraph","position":{"start":{"line":521,"column":1},"end":{"line":522,"column":1}},"children":[{"type":"text","value":"One can analogously derive the Bellman consistency equation for the\naction-value function:","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"Z9lgte9bgb"}],"key":"sJpHsZ7vks"},{"type":"proof","kind":"theorem","label":"bellman_consistency_action","identifier":"bellman_consistency_action","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman consistency equation for action-values","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"fGoz3ZZ6qL"}],"key":"ixR2Mrl5bv"},{"type":"math","value":"Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi_{\\hi+1}(s')}} [Q_{\\hi+1}^\\pi(s', a')]","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"html":"Qhπ(s,a)=r(s,a)+EsP(s,a)aπh+1(s)[Qh+1π(s,a)]Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi_{\\hi+1}(s')}} [Q_{\\hi+1}^\\pi(s', a')]Qhπ(s,a)=r(s,a)+EsP(s,a)aπh+1(s)[Qh+1π(s,a)]","enumerator":"1.13","key":"Nqfs2ZmNiS"}],"enumerator":"1.2","html_id":"bellman-consistency-action","key":"lgE4dUlYnY"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"VpdJjj902h"}],"key":"Xmdh6OD9DG"},{"type":"paragraph","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"children":[{"type":"text","value":"Write a ","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"key":"btGeoGpNKI"},{"type":"inlineCode","value":"check_bellman_consistency_q","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"key":"BaVlX6snbg"},{"type":"text","value":" function for the action-value function.","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"key":"hy1gMduxhM"}],"key":"M0PV2TFC5C"}],"key":"WfdrPyzNoS"},{"type":"proof","kind":"remark","label":"bellman_det","identifier":"bellman_det","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The Bellman consistency equation for deterministic policies","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"key":"ieBxUnp5zo"}],"key":"n4OP8geGuK"},{"type":"paragraph","position":{"start":{"line":537,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"Note that for deterministic policies, the Bellman consistency equation\nsimplifies to","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"LAXggzvSqg"}],"key":"trorOGS4yt"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^\\pi(s) &= r(s, \\pi_\\hi(s)) + \\E_{s' \\sim P(s, \\pi_\\hi(s))} [V_{\\hi+1}^\\pi(s')] \\\\\n Q_\\hi^\\pi(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [Q_{\\hi+1}^\\pi(s', 
\\pi_{\\hi+1}(s'))]\n\\end{aligned}","position":{"start":{"line":540,"column":1},"end":{"line":545,"column":1}},"html":"Vhπ(s)=r(s,πh(s))+EsP(s,πh(s))[Vh+1π(s)]Qhπ(s,a)=r(s,a)+EsP(s,a)[Qh+1π(s,πh+1(s))]\\begin{aligned}\n V_\\hi^\\pi(s) &= r(s, \\pi_\\hi(s)) + \\E_{s' \\sim P(s, \\pi_\\hi(s))} [V_{\\hi+1}^\\pi(s')] \\\\\n Q_\\hi^\\pi(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [Q_{\\hi+1}^\\pi(s', \\pi_{\\hi+1}(s'))]\n\\end{aligned}Vhπ(s)Qhπ(s,a)=r(s,πh(s))+EsP(s,πh(s))[Vh+1π(s)]=r(s,a)+EsP(s,a)[Qh+1π(s,πh+1(s))]","enumerator":"1.14","key":"rTYdbJe6mo"}],"enumerator":"1.1","html_id":"bellman-det","key":"P157kF0o8P"}],"key":"H7KjBEJgWk"},{"type":"block","position":{"start":{"line":548,"column":1},"end":{"line":548,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"The one-step Bellman operator","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"lnxEXW1dDv"}],"identifier":"the-one-step-bellman-operator","label":"The one-step Bellman operator","html_id":"the-one-step-bellman-operator","implicit":true,"enumerator":"1.2.6","key":"FtwrR1WaVB"},{"type":"paragraph","position":{"start":{"line":552,"column":1},"end":{"line":554,"column":1}},"children":[{"type":"text","value":"Fix a policy ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"wQsu3o80PE"},{"type":"text","value":"π","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"YYma6jXSYy"},{"type":"text","value":". Consider the higher-order operator that takes in a\n“value function” ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"sAu6ilWDi1"},{"type":"inlineMath","value":"v : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"html":"v:SRv : \\mathcal{S} \\to \\mathbb{R}v:SR","key":"jlNro583DD"},{"type":"text","value":" and returns the r.h.s. 
of the Bellman\nequation for that “value function”:","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"NeIJ0tEkRL"}],"key":"xbMWIjGlPv"},{"type":"proof","kind":"definition","label":"bellman_operator","identifier":"bellman_operator","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman operator","position":{"start":{"line":556,"column":1},"end":{"line":556,"column":1}},"key":"dECMhSaiUn"}],"key":"iRTJob5o3k"},{"type":"math","value":"[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + v(s')].","position":{"start":{"line":559,"column":1},"end":{"line":559,"column":1}},"html":"[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+v(s)].[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + v(s')].[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+v(s)].","enumerator":"1.15","key":"n9YYQLr8BT"},{"type":"paragraph","position":{"start":{"line":561,"column":1},"end":{"line":564,"column":1}},"children":[{"type":"text","value":"This is a crucial tool for reasoning about MDPs.\nIntuitively, it answers the following question:\nif we evaluate the ","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"QX3MozcBnQ"},{"type":"emphasis","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"children":[{"type":"text","value":"next","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"Sqz4k53fau"}],"key":"hEtp7u2a7v"},{"type":"text","value":" state using ","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"ZZ9k14OuXC"},{"type":"inlineMath","value":"v","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"html":"vvv","key":"WAG4L4K0jH"},{"type":"text","value":",\nhow good is the ","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"xUF2SVqMiL"},{"type":"emphasis","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"children":[{"type":"text","value":"current","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"TW2eXnQKo1"}],"key":"yosI0S4j5q"},{"type":"text","value":" state, according to the given policy?","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"bj5fl2VDe0"}],"key":"PNwHBM01JL"}],"enumerator":"1.8","html_id":"bellman-operator","key":"xOHcAXLtKd"}],"key":"ZOW9azrVdd"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def bellman_operator_looping(\n    mdp: MDP,\n    policy: Float[Array, \"S A\"],\n    v: Float[Array, \" S\"],\n) -> Float[Array, \" S\"]:\n    \"\"\"\n    Looping definition of the Bellman operator.\n    A concise version is given below.\n    \"\"\"\n    v_new = jnp.zeros(mdp.S)\n    for s in range(mdp.S):\n        for a in range(mdp.A):\n            for s_next in range(mdp.S):\n                # JAX arrays are immutable, so accumulate with a functional\n                # .at[].add() update rather than in-place item assignment\n                v_new = v_new.at[s].add(\n                    policy[s, a]\n                    * mdp.P[s, a, s_next]\n                    * (mdp.r[s, a] + mdp.γ * v[s_next])\n                )\n    return v_new","visibility":"hide","key":"WCnjENVeNM"},{"type":"output","id":"dyRksKX-inE8Nzasn_pUw","data":[],"visibility":"show","key":"x9d3Gpe1Yi"}],"data":{"tags":[]},"visibility":"show","key":"akPV4sOkGm"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"Note that we can concisely implement this using the 
","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"E6AZMdFo08"},{"type":"inlineCode","value":"q_to_v","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"HvA8qa4yqD"},{"type":"text","value":" and ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"WGzCcRkyVN"},{"type":"inlineCode","value":"v_to_q","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"L8fkK961Zq"},{"type":"text","value":" utilities from above:","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"aAl4mK2GpG"}],"key":"xG4nM1uPt6"}],"key":"KwUvOpU3EU"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def bellman_operator(\n mdp: MDP,\n policy: Float[Array, \"S A\"],\n v: Float[Array, \" S\"],\n) -> Float[Array, \" S\"]:\n \"\"\"For a known finite MDP, the Bellman operator can be exactly evaluated.\"\"\"\n return q_to_v(policy, v_to_q(mdp, v)) # equivalent\n return jnp.sum(policy * (mdp.r + mdp.γ * mdp.P @ v), axis=1)","key":"CZjETtn1ha"},{"type":"output","id":"GtGBn56rqDA_cYubrW3Ss","data":[],"key":"Vmf9aCa0xm"}],"data":{},"key":"LAeZQFBXu9"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":604,"column":1},"end":{"line":608,"column":1}},"children":[{"type":"text","value":"We’ll call ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"erG89WgjLw"},{"type":"inlineMath","value":"\\mathcal{J}^\\pi : \\mathbb{R}^\\mathcal{S} \\to \\mathbb{R}^\\mathcal{S}","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"html":"Jπ:RSRS\\mathcal{J}^\\pi : \\mathbb{R}^\\mathcal{S} \\to \\mathbb{R}^\\mathcal{S}Jπ:RSRS","key":"SHQIOIAbr1"},{"type":"text","value":" the ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"s263DmPXXo"},{"type":"strong","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"children":[{"type":"text","value":"Bellman\noperator","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"BaUezrmgzZ"}],"key":"pzOuxfdN3I"},{"type":"text","value":" of ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"P4fLaBvMGv"},{"type":"text","value":"π","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"Y940t9CBp7"},{"type":"text","value":".\nNote that it’s defined on any “value function” mapping states to real numbers;\n","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"LbFDb8qFg0"},{"type":"inlineMath","value":"v","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"html":"vvv","key":"j1RFnHI1QC"},{"type":"text","value":" doesn’t have to be a well-defined value function for some policy (hence the lowercase notation).\nThe Bellman operator also gives us a concise way to express ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"ZEnta5L5ow"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Theorem ","key":"dyOCYKJFL1"},{"type":"text","value":"1.1","key":"N6iQdmCuXO"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"mkqAz0x2TA"},{"type":"text","value":" for the value 
function:","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"xhx5iLDBWa"}],"key":"MpCWvPWg8r"},{"type":"math","value":"V_\\hi^\\pi = \\mathcal{J}^{\\pi}(V_{\\hi+1}^\\pi)","position":{"start":{"line":610,"column":1},"end":{"line":610,"column":1}},"html":"Vhπ=Jπ(Vh+1π)V_\\hi^\\pi = \\mathcal{J}^{\\pi}(V_{\\hi+1}^\\pi)Vhπ=Jπ(Vh+1π)","enumerator":"1.16","key":"PP99BEzmpl"},{"type":"paragraph","position":{"start":{"line":612,"column":1},"end":{"line":615,"column":1}},"children":[{"type":"text","value":"Intuitively, the output of the Bellman operator, a new “value function”,\nevaluates states as follows: from a given state, take one action\naccording to ","position":{"start":{"line":612,"column":1},"end":{"line":612,"column":1}},"key":"GElSQrkjfA"},{"type":"text","value":"π","position":{"start":{"line":612,"column":1},"end":{"line":612,"column":1}},"key":"M99Vl3mFsM"},{"type":"text","value":", observe the reward, and then evaluate the next state\nusing the input “value function”.","position":{"start":{"line":612,"column":1},"end":{"line":612,"column":1}},"key":"B9b9aYF0bt"}],"key":"Ai7X0186hJ"},{"type":"paragraph","position":{"start":{"line":617,"column":1},"end":{"line":619,"column":1}},"children":[{"type":"text","value":"When we discuss infinite-horizon MDPs, the Bellman operator will turn\nout to be more than just a notational convenience: We’ll use it to\nconstruct algorithms for computing the optimal policy.","position":{"start":{"line":617,"column":1},"end":{"line":617,"column":1}},"key":"vrIgueMOZW"}],"key":"ZK6x6XJ8aq"},{"type":"heading","depth":2,"position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"Solving finite-horizon MDPs","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"jLC8DRq8bZ"}],"label":"finite_horizon_mdps","identifier":"finite_horizon_mdps","html_id":"finite-horizon-mdps-1","enumerator":"1.3","key":"r063IXIFqm"},{"type":"heading","depth":3,"position":{"start":{"line":626,"column":1},"end":{"line":626,"column":1}},"children":[{"type":"text","value":"Policy evaluation in finite-horizon MDPs","position":{"start":{"line":626,"column":1},"end":{"line":626,"column":1}},"key":"xbEraMTwLK"}],"label":"eval_dp","identifier":"eval_dp","html_id":"eval-dp","enumerator":"1.3.1","key":"mKfMTovyDT"},{"type":"paragraph","position":{"start":{"line":628,"column":1},"end":{"line":629,"column":1}},"children":[{"type":"text","value":"How can we actually compute the value function of a given policy? 
This\nis the task of ","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"g75XDMKzqy"},{"type":"strong","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"children":[{"type":"text","value":"policy evaluation","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"EY5vZ7bzG6"}],"key":"LHC00FdA3A"},{"type":"text","value":".","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"vFOdYXNtoa"}],"key":"zVujq13ix2"},{"type":"proof","kind":"definition","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"DP algorithm to evaluate a policy in a finite-horizon MDP","position":{"start":{"line":631,"column":1},"end":{"line":631,"column":1}},"key":"k29lvYXu31"}],"key":"ss6Edat9fz"},{"type":"paragraph","position":{"start":{"line":633,"column":1},"end":{"line":640,"column":1}},"children":[{"type":"text","value":"The Bellman consistency equation\n","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"tQ1DwWL04s"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Theorem ","key":"OQudG9a4lr"},{"type":"text","value":"1.1","key":"tkGJ74urSP"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"r71XwdwxAy"},{"type":"text","value":"\ngives us a convenient algorithm for\nevaluating stationary policies: it expresses the value function at\ntimestep ","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"z0NZvSUjrR"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"html":"h\\hih","key":"ToCHR4ZNot"},{"type":"text","value":" as a function of the value function at timestep ","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"BB2RxIhPrB"},{"type":"inlineMath","value":"\\hi+1","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"html":"h+1\\hi+1h+1","key":"eBNTPd7G0n"},{"type":"text","value":". 
This\nmeans we can start at the end of the time horizon, where the value is\nknown, and work backwards in time, using the Bellman consistency\nequation to compute the value function at each time step.","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"t9ohnr16Di"}],"key":"iOi109NtYY"}],"enumerator":"1.9","key":"f7qxJwpYbn"}],"key":"GACO3nzuiN"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def dp_eval_finite(mdp: MDP, policy: Float[Array, \"S A\"]) -> Float[Array, \"H S\"]:\n \"\"\"Evaluate a policy using dynamic programming.\"\"\"\n V_ary = [None] * mdp.H + [jnp.zeros(mdp.S)] # initialize to 0 at end of time horizon\n for h in range(mdp.H - 1, -1, -1):\n V_ary[h] = bellman_operator(mdp, policy[h], V_ary[h + 1])\n return jnp.stack(V_ary[:-1])","key":"axBUcT44ur"},{"type":"output","id":"m2KQvip3tffMMmN6xvU6R","data":[],"key":"osKhzHDQvQ"}],"data":{},"key":"CVTzpiJ0Rt"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":652,"column":1},"end":{"line":653,"column":1}},"children":[{"type":"text","value":"This runs in time ","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"EDH5gS7rPm"},{"type":"inlineMath","value":"O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"html":"O(HS2A)O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)O(HS2A)","key":"ewo9lwL48J"},{"type":"text","value":" by counting the\nloops.","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"abujKKNucA"}],"key":"Z1EmBHsoM9"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"ElonKmFFCB"}],"key":"ILQSYh9RPC"},{"type":"paragraph","position":{"start":{"line":656,"column":1},"end":{"line":657,"column":1}},"children":[{"type":"text","value":"Do you see where we compute ","position":{"start":{"line":656,"column":1},"end":{"line":656,"column":1}},"key":"HBJnl5lniw"},{"type":"inlineMath","value":"Q^\\pi_\\hi","position":{"start":{"line":656,"column":1},"end":{"line":656,"column":1}},"html":"QhπQ^\\pi_\\hiQhπ","key":"nV0ZKUqAPb"},{"type":"text","value":" along the way? Make\nthis step explicit.","position":{"start":{"line":656,"column":1},"end":{"line":656,"column":1}},"key":"LmLQusuBAy"}],"key":"uNe1ExCh8E"}],"key":"mGSwl8CWRI"},{"type":"proof","kind":"example","label":"tidy_eval_finite","identifier":"tidy_eval_finite","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying policy evaluation","position":{"start":{"line":660,"column":1},"end":{"line":660,"column":1}},"key":"VSNk2yFHFW"}],"key":"zCXeAEEieR"},{"type":"paragraph","position":{"start":{"line":663,"column":1},"end":{"line":667,"column":1}},"children":[{"type":"text","value":"Let’s evaluate the policy from\n","position":{"start":{"line":663,"column":1},"end":{"line":663,"column":1}},"key":"SPluul28HE"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_policy","label":"tidy_policy","children":[{"type":"text","value":"Example ","key":"ROsdTl224p"},{"type":"text","value":"1.2","key":"oFnK9rwVUY"}],"template":"Example %s","enumerator":"1.2","resolved":true,"html_id":"tidy-policy","key":"HdzO6cDrYr"},{"type":"text","value":" in the tidying MDP\nthat tidies if and only if the room is\nmessy. 
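One way to answer the attention box above is to make the intermediate $Q^\pi_h$ computation explicit. The following is a sketch (not part of the original notes), assuming the `v_to_q` and `q_to_v` helpers defined earlier in the chapter:

```python
def dp_eval_finite_with_q(mdp: MDP, policy: Float[Array, "H S A"]):
    """Like dp_eval_finite, but also records Q^π_h at each timestep."""
    V_ary = [None] * mdp.H + [jnp.zeros(mdp.S)]
    Q_ary = [None] * mdp.H
    for h in range(mdp.H - 1, -1, -1):
        Q_ary[h] = v_to_q(mdp, V_ary[h + 1])    # Q^π_h(s, a) = r(s, a) + γ E[V^π_{h+1}(s')]
        V_ary[h] = q_to_v(policy[h], Q_ary[h])  # average over the actions chosen by π
    return jnp.stack(V_ary[:-1]), jnp.stack(Q_ary)
```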
**Example 1.4 (Tidying policy evaluation).** Let's evaluate the policy from Example 1.2 in the tidying MDP that tidies if and only if the room is messy. We'll use the Bellman consistency equation to compute the value function at each time step.

$$
\begin{aligned}
V_{H-1}^\pi(\text{orderly}) &= r(\text{orderly}, \text{ignore}) \\
&= 1 \\
V_{H-1}^\pi(\text{messy}) &= r(\text{messy}, \text{tidy}) \\
&= 0 \\
V_{H-2}^\pi(\text{orderly}) &= r(\text{orderly}, \text{ignore}) + \mathbb{E}_{s' \sim P(\text{orderly}, \text{ignore})} [V_{H-1}^\pi(s')] \\
&= 1 + 0.7 \cdot V_{H-1}^{\pi}(\text{orderly}) + 0.3 \cdot V_{H-1}^{\pi}(\text{messy}) \\
&= 1 + 0.7 \cdot 1 + 0.3 \cdot 0 \\
&= 1.7 \\
V_{H-2}^\pi(\text{messy}) &= r(\text{messy}, \text{tidy}) + \mathbb{E}_{s' \sim P(\text{messy}, \text{tidy})} [V_{H-1}^\pi(s')] \\
&= 0 + 1 \cdot V_{H-1}^{\pi}(\text{orderly}) + 0 \cdot V_{H-1}^{\pi}(\text{messy}) \\
&= 1 \\
V_{H-3}^\pi(\text{orderly}) &= r(\text{orderly}, \text{ignore}) + \mathbb{E}_{s' \sim P(\text{orderly}, \text{ignore})} [V_{H-2}^\pi(s')] \\
&= 1 + 0.7 \cdot V_{H-2}^{\pi}(\text{orderly}) + 0.3 \cdot V_{H-2}^{\pi}(\text{messy}) \\
&= 1 + 0.7 \cdot 1.7 + 0.3 \cdot 1 \\
&= 2.49 \\
V_{H-3}^\pi(\text{messy}) &= r(\text{messy}, \text{tidy}) + \mathbb{E}_{s' \sim P(\text{messy}, \text{tidy})} [V_{H-2}^\pi(s')] \\
&= 0 + 1 \cdot V_{H-2}^{\pi}(\text{orderly}) + 0 \cdot V_{H-2}^{\pi}(\text{messy}) \\
&= 1.7
\end{aligned}
$$

etc. You may wish to repeat this computation for the other policies to get a better sense of this algorithm.

```python
V_messy = dp_eval_finite(tidy_mdp, tidy_policy_messy_only)
V_messy
```

```
Array([[5.5621696, 4.7927704],
       [4.7927704, 4.0241003],
       [4.0241003, 3.253    ],
       [3.253    , 2.49     ],
       [2.49     , 1.7      ],
       [1.7      , 1.       ],
       [1.       , 0.       ]], dtype=float32)
```

Note that the last three rows match the hand computation above.
### 1.3.2 Optimal policies in finite-horizon MDPs

We've just seen how to *evaluate* a given policy. But how can we find the **optimal policy** for a given environment?

**Definition 1.10 (Optimal policies).** We call a policy optimal, and denote it by $\pi^\star$, if it does at least as well as *any* other policy $\pi$ (including stochastic and history-dependent ones) in all situations:

$$
\begin{aligned}
    V_h^{\pi^\star}(s) &= \mathbb{E}_{\tau \sim \rho^{\pi^{\star}}}[r_h + \cdots + r_{H-1} \mid s_h = s] \\
    &\ge \mathbb{E}_{\tau \sim \rho^{\pi}}[r_h + \cdots + r_{H-1} \mid \tau_h] \quad \forall \pi, \tau_h, h \in [H]
\end{aligned}
$$

where we condition on the trajectory up to time $h$, denoted $\tau_h = (s_0, a_0, r_0, \dots, s_h)$, where $s_h = s$.

Convince yourself that all optimal policies must have the same value function. We call this the **optimal value function** and denote it by $V_h^\star(s)$. The same goes for the action-value function $Q_h^\star(s, a)$.

It is a stunning fact that **every finite-horizon MDP has an optimal policy that is time-dependent and deterministic.** In particular, we can construct such a policy by acting *greedily* with respect to the optimal action-value function:

**Theorem 1.3 (It is optimal to be greedy with respect to the optimal value function).**

$$
\pi_h^\star(s) = \arg\max_a Q_h^\star(s, a).
$$
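In code, “acting greedily” with respect to a given action-value array is just an argmax over the action dimension. A minimal sketch (not part of the original notes; `q` stands for any `[S, A]` array of action-values):

```python
def greedy_policy_from_q(q: Float[Array, "S A"]) -> Float[Array, "S A"]:
    """Deterministic greedy policy, encoded as a one-hot distribution over actions."""
    num_actions = q.shape[1]
    return jnp.eye(num_actions)[jnp.argmax(q, axis=1)]
```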
**Proof.** Let $V^{\star}$ and $Q^{\star}$ denote the optimal value and action-value functions. Consider the greedy policy

$$
\hat \pi_h(s) := \arg\max_a Q_h^{\star}(s, a).
$$

We aim to show that $\hat \pi$ is optimal; that is, $V^{\hat \pi} = V^{\star}$.

Fix an arbitrary state $s \in \mathcal{S}$ and time $h \in [H]$.

Firstly, by the definition of $V^{\star}$, we already know $V_h^{\star}(s) \ge V_h^{\hat \pi}(s)$. So for equality to hold we just need to show that $V_h^{\star}(s) \le V_h^{\hat \pi}(s)$. We'll first show that the Bellman operator $\mathcal{J}^{\hat \pi}$ never decreases $V_h^{\star}$. Then we'll apply this result recursively to show that $V^{\star} = V^{\hat \pi}$.

**Lemma 1.1 (The Bellman operator never decreases the optimal value function).** $\mathcal{J}^{\hat \pi}$ never decreases $V_h^{\star}$ (elementwise):

$$
[\mathcal{J}^{\hat \pi} (V_{h+1}^{\star})](s) \ge V_h^{\star}(s).
$$

**Proof:**

$$
\begin{aligned}
    V_h^{\star}(s) &= \max_{\pi \in \Pi} V_h^{\pi}(s) \\
    &= \max_{\pi \in \Pi} \mathbb{E}_{a \sim \pi(\dots)}\left[r(s, a) + \mathbb{E}_{s' \sim P(s, a)} V_{h+1}^\pi(s') \right] && \text{Bellman consistency} \\
    &\le \max_{\pi \in \Pi} \mathbb{E}_{a \sim \pi(\dots)}\left[r(s, a) + \mathbb{E}_{s' \sim P(s, a)} V_{h+1}^{\star}(s') \right] && \text{definition of } V^\star \\
    &= \max_{a} \left[ r(s, a) + \mathbb{E}_{s' \sim P(s, a)} V_{h+1}^{\star}(s') \right] && \text{only depends on } \pi \text{ via } a \\
    &= [\mathcal{J}^{\hat \pi}(V_{h+1}^{\star})](s).
\end{aligned}
$$

Note that the chosen action $a \sim \pi(\dots)$ above might depend on the past history; this isn't shown in the notation and doesn't affect our result (make sure you see why).

We can now apply this result recursively to get

$$
V^{\star}_h(s) \le V^{\hat \pi}_h(s)
$$

as follows. (Note that even though $\hat \pi$ is deterministic, we'll use the $a \sim \hat \pi(s)$ notation to make it explicit that we're sampling a trajectory from it.)

$$
\begin{aligned}
    V_{h}^{\star}(s) &\le [\mathcal{J}^{\hat \pi}(V_{h+1}^{\star})](s) \\
    &= \mathbb{E}_{a \sim \hat \pi(s)} \left[ r(s, a) + \mathbb{E}_{s' \sim P(s, a)} \left[ V_{h+1}^{\star}(s') \right] \right] && \text{definition of } \mathcal{J}^{\hat \pi} \\
    &\le \mathbb{E}_{a \sim \hat \pi(s)} \left[ r(s, a) + \mathbb{E}_{s' \sim P(s, a)} \left[ [ \mathcal{J}^{\hat \pi} (V_{h+2}^{\star})] (s') \right] \right] && \text{above lemma} \\
    &= \mathbb{E}_{a \sim \hat \pi(s)} \left[ r(s, a) + \mathbb{E}_{s' \sim P(s, a)} \left[ \mathbb{E}_{a' \sim \hat \pi} r(s', a') + \mathbb{E}_{s''} V_{h+2}^{\star}(s'') \right] \right] && \text{definition of } \mathcal{J}^{\hat \pi} \\
    &\le \cdots && \text{apply at all timesteps} \\
    &= \mathbb{E}_{\tau \sim \rho^{\hat \pi}} [G_{h} \mid s_h = s] && \text{rewrite expectation} \\
    &= V_{h}^{\hat \pi}(s) && \text{definition}
\end{aligned}
$$

And so we have $V^{\star} = V^{\hat \pi}$, making $\hat \pi$ optimal. ∎

Note that this also gives simplified forms of the Bellman consistency equations (Theorem 1.1) for the optimal policy:

**Corollary 1.1 (Bellman consistency equations for the optimal policy).**

$$
\begin{aligned}
    V_h^\star(s) &= \max_a Q_h^\star(s, a) \\
    Q_h^\star(s, a) &= r(s, a) + \mathbb{E}_{s' \sim P(s, a)} [V_{h+1}^\star(s')]
\end{aligned}
$$
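These equations suggest an “optimality” analogue of the Bellman operator, in which the average over the policy's actions is replaced by a maximum. A minimal sketch (not part of the original notes; it carries the discount `mdp.γ` along, as the `bellman_operator` code above does):

```python
def bellman_optimality_operator(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, " S"]:
    """Return max_a [ r(s, a) + γ E_{s' ~ P(s, a)} v(s') ] for every state s."""
    return jnp.max(mdp.r + mdp.γ * mdp.P @ v, axis=1)
```

An update of this form is the basis of the value iteration algorithm presented later for the infinite-horizon setting.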
Now that we've shown this particular greedy policy is optimal, all we need to do is compute the optimal value function and optimal policy. We can do this by working backwards in time using **dynamic programming** (DP).

**Definition 1.11 (DP algorithm to compute an optimal policy in a finite-horizon MDP).**

**Base case.** At the end of the episode (time step $H-1$), we can't take any more actions, so the $Q$-function is simply the reward that we obtain:

$$
Q^\star_{H-1}(s, a) = r(s, a)
$$

so the best thing to do is just act greedily and get as much reward as we can!

$$
\pi^\star_{H-1}(s) = \arg\max_a Q^\star_{H-1}(s, a)
$$

Then $V^\star_{H-1}(s)$, the optimal value of state $s$ at the end of the trajectory, is simply the largest reward achievable by any single action:

$$
V^\star_{H-1}(s) = \max_a Q^\star_{H-1}(s, a)
$$

**Recursion.** Then, we can work backwards in time, starting from the end, using our consistency equations! i.e. for each $h = H-2, \dots, 0$, we set

$$
\begin{aligned}
    Q^\star_{h}(s, a) &= r(s, a) + \mathbb{E}_{s' \sim P(s, a)} [V^\star_{h+1}(s')] \\
    \pi^\star_{h}(s) &= \arg\max_a Q^\star_{h}(s, a) \\
    V^\star_{h}(s) &= \max_a Q^\star_{h}(s, a)
\end{aligned}
$$

```python
def find_optimal_policy(mdp: MDP):
    Q = [None] * mdp.H
    pi = [None] * mdp.H
    V = [None] * mdp.H + [jnp.zeros(mdp.S)]  # initialize to 0 at end of time horizon

    for h in range(mdp.H - 1, -1, -1):
        Q[h] = mdp.r + mdp.P @ V[h + 1]
        pi[h] = jnp.eye(mdp.A)[jnp.argmax(Q[h], axis=1)]  # one-hot over actions
        V[h] = jnp.max(Q[h], axis=1)

    Q = jnp.stack(Q)
    pi = jnp.stack(pi)
    V = jnp.stack(V[:-1])

    return pi, V, Q
```

At each of the $H$ timesteps, we must compute $Q^{\star}$ for each of the $|\mathcal{S}| |\mathcal{A}|$ state-action pairs. Each computation takes $|\mathcal{S}|$ operations to evaluate the average value over $s'$. This gives a total computation time of $O(H \cdot |\mathcal{S}|^2 \cdot |\mathcal{A}|)$.

Note that this algorithm is identical to the policy evaluation algorithm `dp_eval_finite` (Section 1.3.1), but instead of *averaging* over the actions chosen by a policy, we simply take a *maximum* over the action-values. We'll see this relationship between **policy evaluation** and **optimal policy computation** show up again in the infinite-horizon setting.
","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"fVuOTLT8gx"},{"type":"emphasis","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"maximum","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"ScHb8uDzAb"}],"key":"gX2Ao65p4z"},{"type":"text","value":" over the\naction-values. We’ll see this relationship between ","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"GcfAxz40G2"},{"type":"strong","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"policy evaluation","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"VlOOnb8aaJ"}],"key":"uOeouPbfnB"},{"type":"text","value":"\nand ","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"xOoea3LZr8"},{"type":"strong","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"optimal policy computation","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"XEuwoeIkVz"}],"key":"KmksqBAGxg"},{"type":"text","value":" show up again in the infinite-horizon\nsetting.","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"GOoy6NZzZh"}],"key":"V1dg8VGCbv"}],"key":"ZGWSejlbsw"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"π_opt, V_opt, Q_opt = find_optimal_policy(tidy_mdp)\nassert jnp.allclose(π_opt, tidy_policy_messy_only)\nassert jnp.allclose(V_opt, V_messy)\nassert jnp.allclose(Q_opt[:-1], v_ary_to_q_ary(tidy_mdp, V_messy)[1:])\n\"Assertions passed (the 'tidy when messy' policy is optimal)\"","key":"QiziMetcku"},{"type":"output","id":"A8ZM9Be2sA7OuUs-KmPll","data":[{"output_type":"execute_result","execution_count":16,"metadata":{},"data":{"text/plain":{"content":"\"Assertions passed (the 'tidy when messy' policy is optimal)\"","content_type":"text/plain"}}}],"key":"imCT46I43p"}],"data":{},"key":"JjEIyHMuML"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":897,"column":1},"end":{"line":897,"column":1}},"children":[{"type":"text","value":"Infinite-horizon MDPs","position":{"start":{"line":897,"column":1},"end":{"line":897,"column":1}},"key":"txsVqExX9Y"}],"label":"infinite_horizon_mdps","identifier":"infinite_horizon_mdps","html_id":"infinite-horizon-mdps","enumerator":"1.4","key":"IPg8fJt8Ym"},{"type":"paragraph","position":{"start":{"line":899,"column":1},"end":{"line":900,"column":1}},"children":[{"type":"text","value":"What happens if a trajectory is allowed to continue forever (i.e.\n","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"Ziw9dduHWZ"},{"type":"inlineMath","value":"H = \\infty","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"html":"H=H = \\inftyH=","key":"CWRJ00RLZ2"},{"type":"text","value":")? 
This is the setting of ","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"y5TpCpkcCA"},{"type":"strong","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"children":[{"type":"text","value":"infinite horizon","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"i5Xj3yVgGu"}],"key":"M6vdUQL6Em"},{"type":"text","value":" MDPs.","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"IsQ1qONNtd"}],"key":"w1o3Txu5u8"},{"type":"paragraph","position":{"start":{"line":902,"column":1},"end":{"line":910,"column":1}},"children":[{"type":"text","value":"In this chapter, we’ll describe the necessary adjustments from the\nfinite-horizon case to make the problem tractable. We’ll show that the\n","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"I4CSEQeMiA"},{"type":"crossReference","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"Bellman operator","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"ownnTzCsmc"}],"identifier":"bellman_operator","label":"bellman_operator","kind":"proof:definition","template":"Definition %s","enumerator":"1.8","resolved":true,"html_id":"bellman-operator","key":"zjfZLvJnHL"},{"type":"text","value":" in the discounted reward setting is a\n","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"pF2pfooBjV"},{"type":"strong","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"contraction mapping","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"DnejkE8k8U"}],"key":"CnkC4mBAZU"},{"type":"text","value":" for any policy.\nWe’ll discuss how to evaluate\npolicies (i.e. compute their corresponding value functions). 
Finally,\nwe’ll present and analyze two iterative algorithms, based on the Bellman\noperator, for computing the optimal policy: ","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"vufibmxORJ"},{"type":"strong","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"value iteration","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"ON4YRvD8IT"}],"key":"OsYi4M9poi"},{"type":"text","value":" and\n","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"uTitzODm5Y"},{"type":"strong","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"policy iteration","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"ED92zNNiCz"}],"key":"GvBJPQftbu"},{"type":"text","value":".","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"y8eREUg3Sh"}],"key":"RUYJaNcNu6"},{"type":"heading","depth":3,"position":{"start":{"line":912,"column":1},"end":{"line":912,"column":1}},"children":[{"type":"text","value":"Discounted rewards","position":{"start":{"line":912,"column":1},"end":{"line":912,"column":1}},"key":"uPuwtuxd3m"}],"identifier":"discounted-rewards","label":"Discounted rewards","html_id":"discounted-rewards","implicit":true,"enumerator":"1.4.1","key":"s1sG5pQcbT"},{"type":"paragraph","position":{"start":{"line":914,"column":1},"end":{"line":918,"column":1}},"children":[{"type":"text","value":"First of all, note that maximizing the cumulative reward\n","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"OhcTirdjW8"},{"type":"inlineMath","value":"r_\\hi + r_{\\hi+1} + r_{\\hi+2} + \\cdots","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"html":"rh+rh+1+rh+2+r_\\hi + r_{\\hi+1} + r_{\\hi+2} + \\cdotsrh+rh+1+rh+2+","key":"SYgx33VXd0"},{"type":"text","value":" is no longer a good idea since it\nmight blow up to infinity. 
Instead of a time horizon ","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"keDE7CYfaC"},{"type":"inlineMath","value":"H","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"html":"HHH","key":"Tgoez4soHp"},{"type":"text","value":", we now need a\n","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"mz9urzqGOk"},{"type":"strong","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"children":[{"type":"text","value":"discount factor","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"wYFWNMm6V7"}],"key":"ZX9pr7xXLT"},{"type":"text","value":" ","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"eowdrtaR09"},{"type":"inlineMath","value":"\\gamma \\in [0, 1)","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"html":"γ[0,1)\\gamma \\in [0, 1)γ[0,1)","key":"ClFdOj8taP"},{"type":"text","value":" such that rewards become less\nvaluable the further into the future they are:","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"bBngRauWeN"}],"key":"RREdXKDPgo"},{"type":"math","value":"r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots = \\sum_{k=0}^\\infty \\gamma^k r_{\\hi+k}.","position":{"start":{"line":920,"column":1},"end":{"line":920,"column":1}},"html":"rh+γrh+1+γ2rh+2+=k=0γkrh+k.r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots = \\sum_{k=0}^\\infty \\gamma^k r_{\\hi+k}.rh+γrh+1+γ2rh+2+=k=0γkrh+k.","enumerator":"1.30","key":"BQQbi9AMbm"},{"type":"paragraph","position":{"start":{"line":922,"column":1},"end":{"line":924,"column":1}},"children":[{"type":"text","value":"We can think of ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"Il3FEK7rr6"},{"type":"text","value":"γ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"mVnsYeP0DJ"},{"type":"text","value":" as measuring how much we care about the future:\nif it’s close to ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"ASXrSXIyGi"},{"type":"text","value":"0","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"ILmm5Jn9Jf"},{"type":"text","value":", we only care about the near-term rewards; it’s\nclose to ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"DIqWayC0DX"},{"type":"text","value":"1","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"wiWZpj9FHb"},{"type":"text","value":", we put more weight into future rewards.","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"CFmx2Lf0qD"}],"key":"TGRqk4cGx5"},{"type":"paragraph","position":{"start":{"line":926,"column":1},"end":{"line":932,"column":1}},"children":[{"type":"text","value":"You can also analyze ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"sD8CXydl7f"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"HB1adaeYBl"},{"type":"text","value":" as the probability of 
","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"Ga02JTB4wY"},{"type":"emphasis","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"children":[{"type":"text","value":"continuing","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"rBzXJLqNVX"}],"key":"HtvZVstIJC"},{"type":"text","value":" the\ntrajectory at each time step. (This is equivalent to ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"n4zwFX83cT"},{"type":"inlineMath","value":"H","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"html":"HHH","key":"SgCnS5O2sB"},{"type":"text","value":" being\ndistributed by a First Success distribution with success probability\n","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"u24lHGds4R"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"EATRR0syhj"},{"type":"text","value":".) This accords with the above interpretation: if ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"v40ba5peEK"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"Bxm5W3PZ4M"},{"type":"text","value":" is\nclose to ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"r3qvHGbGoP"},{"type":"text","value":"0","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"jVMzjUQ90t"},{"type":"text","value":", the trajectory will likely be very short, while if\n","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"aX5xOZmTyp"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"NSmUfvyUMk"},{"type":"text","value":" is close to ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"qVuoExjngn"},{"type":"text","value":"1","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"AuVK9Iuh40"},{"type":"text","value":", the trajectory will likely continue for a long\ntime.","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"SzbqzgIvV0"}],"key":"c8yTqS1Rbi"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"UWVP512Bqk"}],"key":"WyGqYHtDTW"},{"type":"paragraph","position":{"start":{"line":935,"column":1},"end":{"line":937,"column":1}},"children":[{"type":"text","value":"Assuming that ","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"WgtqqnZ279"},{"type":"inlineMath","value":"r_\\hi \\in [0, 1]","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"html":"rh[0,1]r_\\hi \\in [0, 1]rh[0,1]","key":"xmoTqd8jbM"},{"type":"text","value":" for all ","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"gsRBLYUHPt"},{"type":"inlineMath","value":"\\hi \\in \\mathbb{N}","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"html":"hN\\hi \\in \\mathbb{N}hN","key":"GSY5BsdFsf"},{"type":"text","value":",\nwhat is the maximum 
","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"xz3HQxVVGq"},{"type":"strong","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"children":[{"type":"text","value":"discounted","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"kLUiEwrkN9"}],"key":"t9FIqi1ksc"},{"type":"text","value":" cumulative reward? You may find it\nuseful to review geometric series.","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"eiVSFQ1E7t"}],"key":"eebdfKnETX"}],"key":"PMdkaLng4v"},{"type":"paragraph","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"children":[{"type":"text","value":"The other components of the MDP remain the same:","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"key":"pZtnHv9Joi"}],"key":"HAEJBYPFsH"},{"type":"math","value":"M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\gamma).","position":{"start":{"line":942,"column":1},"end":{"line":942,"column":1}},"html":"M=(S,A,μ,P,r,γ).M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\gamma).M=(S,A,μ,P,r,γ).","enumerator":"1.31","key":"h5Wx1MLh1H"},{"type":"paragraph","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"children":[{"type":"text","value":"Code-wise, we can reuse the ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"UD8eBnM8Xa"},{"type":"inlineCode","value":"MDP","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"N33b6RvLFm"},{"type":"text","value":" class from before ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"QcNawkzAXP"},{"type":"crossReference","kind":"proof:definition","identifier":"finite_horizon_mdp","label":"finite_horizon_mdp","children":[{"type":"text","value":"Definition ","key":"dosflvOnMx"},{"type":"text","value":"1.2","key":"kbWveCrZjc"}],"template":"Definition %s","enumerator":"1.2","resolved":true,"html_id":"finite-horizon-mdp","key":"VV8jLhTKlm"},{"type":"text","value":" and set ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"mgqg8qV9so"},{"type":"inlineCode","value":"mdp.H = float('inf')","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"CmYRFvQqPP"},{"type":"text","value":".","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"ThyXyLqA54"}],"key":"ofLYkUGAtp"}],"key":"HKGIBIztIb"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"tidy_mdp_inf = tidy_mdp._replace(H=float(\"inf\"), γ=0.95)","key":"j5uwuflTGL"},{"type":"output","id":"ieIueWCVK0DtKkyT9sQDR","data":[],"key":"geuVqBWW9D"}],"data":{},"key":"and4MQFUOM"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"children":[{"type":"text","value":"Stationary policies","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"wrnGRMEyvJ"}],"identifier":"stationary-policies","label":"Stationary policies","html_id":"stationary-policies","implicit":true,"enumerator":"1.4.2","key":"QGvCpsW1pX"},{"type":"paragraph","position":{"start":{"line":952,"column":1},"end":{"line":956,"column":1}},"children":[{"type":"text","value":"The time-dependent policies from the finite-horizon case become\ndifficult to handle in the infinite-horizon case. 
In particular, many of\nthe DP approaches we saw required us to start at the end of the\ntrajectory, which is no longer possible. We’ll shift to ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"erhqBKVOWr"},{"type":"strong","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"children":[{"type":"text","value":"stationary","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"GBfaTse5IA"}],"key":"e6iLhuu2QQ"},{"type":"text","value":"\npolicies ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"hSGIFimWSI"},{"type":"inlineMath","value":"\\pi : \\mathcal{S} \\to \\mathcal{A}","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"html":"π:SA\\pi : \\mathcal{S} \\to \\mathcal{A}π:SA","key":"h1bSYu8Gcm"},{"type":"text","value":" (deterministic) or ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"jBH8mDTqti"},{"type":"inlineMath","value":"\\Delta(\\mathcal{A})","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"html":"Δ(A)\\Delta(\\mathcal{A})Δ(A)","key":"jEUzeINVXG"},{"type":"text","value":" (stochastic).","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"IdlH0tZymD"}],"key":"pyyhdkOYn6"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"kvLnk5iMuN"}],"key":"xG68OWP4xA"},{"type":"paragraph","position":{"start":{"line":959,"column":1},"end":{"line":959,"column":1}},"children":[{"type":"text","value":"Which of the policies in ","position":{"start":{"line":959,"column":1},"end":{"line":959,"column":1}},"key":"zMSClrCoJg"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_policy","label":"tidy_policy","children":[{"type":"text","value":"Example ","key":"NEi23jubsO"},{"type":"text","value":"1.2","key":"OFqc27LTXG"}],"template":"Example %s","enumerator":"1.2","resolved":true,"html_id":"tidy-policy","key":"gPRJhsJTRp"},{"type":"text","value":" are stationary?","position":{"start":{"line":959,"column":1},"end":{"line":959,"column":1}},"key":"Fhtb5Gb2Xj"}],"key":"LiiHKNeaDL"}],"key":"E31N9HDqAj"},{"type":"heading","depth":3,"position":{"start":{"line":962,"column":1},"end":{"line":962,"column":1}},"children":[{"type":"text","value":"Value functions and Bellman consistency","position":{"start":{"line":962,"column":1},"end":{"line":962,"column":1}},"key":"w9gCBVptlP"}],"identifier":"value-functions-and-bellman-consistency","label":"Value functions and Bellman consistency","html_id":"value-functions-and-bellman-consistency","implicit":true,"enumerator":"1.4.3","key":"tAt9ETf64P"},{"type":"paragraph","position":{"start":{"line":964,"column":1},"end":{"line":966,"column":1}},"children":[{"type":"text","value":"We also consider stationary value functions ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"SkyOSCeLTa"},{"type":"inlineMath","value":"V^\\pi : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"html":"Vπ:SRV^\\pi : \\mathcal{S} \\to \\mathbb{R}Vπ:SR","key":"TzzIG0QUaa"},{"type":"text","value":" and\n","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"aMtAh0Cq5l"},{"type":"inlineMath","value":"Q^\\pi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"html":"Qπ:S×ARQ^\\pi : 
\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}Qπ:S×AR","key":"Co0tKSU2CO"},{"type":"text","value":". We need to insert a factor of ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"VuaQcmglxz"},{"type":"text","value":"γ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"Zyrd9J8g0r"},{"type":"text","value":"\ninto the Bellman consistency equation ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"ZliaXCHEgE"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Theorem ","key":"Qrh6dXSwiB"},{"type":"text","value":"1.1","key":"DU04fCGHsO"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"SURQvN0FOa"},{"type":"text","value":" to account for the discounting:","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"Gqux5X5UI6"}],"key":"AuTBZMPHXl"},{"type":"math","value":"\\begin{aligned}\n V^\\pi(s) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} \\cdots \\mid s_\\hi = s] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma V^\\pi(s')]\\\\\n Q^\\pi(s, a) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s, a_\\hi = a] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= r(s, a) + \\gamma \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi(s')}} [Q^\\pi(s', a')]\n\\end{aligned}","label":"bellman_consistency_infinite","identifier":"bellman_consistency_infinite","html":"Vπ(s)=Eτρπ[rh+γrh+1+γ2rh+2sh=s]for any hN=Eaπ(s)sP(s,a)[r(s,a)+γVπ(s)]Qπ(s,a)=Eτρπ[rh+γrh+1+γ2rh+2+sh=s,ah=a]for any hN=r(s,a)+γEsP(s,a)aπ(s)[Qπ(s,a)]\\begin{aligned}\n V^\\pi(s) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} \\cdots \\mid s_\\hi = s] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma V^\\pi(s')]\\\\\n Q^\\pi(s, a) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s, a_\\hi = a] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= r(s, a) + \\gamma \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi(s')}} [Q^\\pi(s', a')]\n\\end{aligned}Vπ(s)Qπ(s,a)=Eτρπ[rh+γrh+1+γ2rh+2sh=s]=Eaπ(s)sP(s,a)[r(s,a)+γVπ(s)]=Eτρπ[rh+γrh+1+γ2rh+2+sh=s,ah=a]=r(s,a)+γEsP(s,a)aπ(s)[Qπ(s,a)]for any hNfor any hN","enumerator":"1.32","html_id":"bellman-consistency-infinite","key":"swT6b0I7Og"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"EmbBpCHPmT"}],"key":"nLTjzb8iRS"},{"type":"paragraph","position":{"start":{"line":980,"column":1},"end":{"line":981,"column":1}},"children":[{"type":"text","value":"Heuristically speaking, why does it no longer matter which\ntime step we condition on when defining the value function?","position":{"start":{"line":980,"column":1},"end":{"line":980,"column":1}},"key":"Vy8c71a2So"}],"key":"yqvQchcsdS"}],"key":"x4OS4VQGcB"},{"type":"heading","depth":2,"position":{"start":{"line":984,"column":1},"end":{"line":984,"column":1}},"children":[{"type":"text","value":"Solving infinite-horizon 
MDPs","position":{"start":{"line":984,"column":1},"end":{"line":984,"column":1}},"key":"CiZHLhHIeQ"}],"identifier":"solving-infinite-horizon-mdps","label":"Solving infinite-horizon MDPs","html_id":"solving-infinite-horizon-mdps","implicit":true,"enumerator":"1.5","key":"XyigiFqvJN"},{"type":"heading","depth":3,"position":{"start":{"line":986,"column":1},"end":{"line":986,"column":1}},"children":[{"type":"text","value":"The Bellman operator is a contraction mapping","position":{"start":{"line":986,"column":1},"end":{"line":986,"column":1}},"key":"wiMgo6KOQE"}],"identifier":"the-bellman-operator-is-a-contraction-mapping","label":"The Bellman operator is a contraction mapping","html_id":"the-bellman-operator-is-a-contraction-mapping","implicit":true,"enumerator":"1.5.1","key":"cr7T4xYFP7"},{"type":"paragraph","position":{"start":{"line":988,"column":1},"end":{"line":991,"column":1}},"children":[{"type":"text","value":"Recall from ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"QTTQP1YRgP"},{"type":"crossReference","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"children":[{"type":"text","value":"Definition ","key":"THojxedjeU"},{"type":"text","value":"1.8","key":"RF5UDGlYrP"}],"identifier":"bellman_operator","label":"bellman_operator","kind":"proof:definition","template":"Definition %s","enumerator":"1.8","resolved":true,"html_id":"bellman-operator","key":"h9pO5tdDzO"},{"type":"text","value":" that the Bellman operator ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"p2nFFLjB7q"},{"type":"inlineMath","value":"\\mathcal{J}^{\\pi}","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"html":"Jπ\\mathcal{J}^{\\pi}Jπ","key":"po42ZpGCKI"},{"type":"text","value":"\nfor a policy ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"nAScShAqU3"},{"type":"text","value":"π","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"Jjigx4qWkO"},{"type":"text","value":" takes in a “value function” ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"VWdIwFKei9"},{"type":"inlineMath","value":"v : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"html":"v:SRv : \\mathcal{S} \\to \\mathbb{R}v:SR","key":"apBCwCP9Vc"},{"type":"text","value":" and\nreturns the r.h.s. of the Bellman equation for that “value function”. 
In\nthe infinite-horizon setting, this is","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"rwAopuxGt0"}],"key":"hVkLfsLCqP"},{"type":"math","value":"[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma v(s')].","position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"html":"[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+γv(s)].[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma v(s')].[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+γv(s)].","enumerator":"1.33","key":"EyoGsLpHou"},{"type":"paragraph","position":{"start":{"line":995,"column":1},"end":{"line":999,"column":1}},"children":[{"type":"text","value":"The crucial property of the Bellman operator is that it is a\n","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"CPppWhrxEo"},{"type":"strong","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"contraction mapping","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"Gx4EO6iRdJ"}],"key":"x2Gnrk8sds"},{"type":"text","value":" for any policy. Intuitively, if we start with\ntwo “value functions” ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"olQwXmQ0an"},{"type":"inlineMath","value":"v, u : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"html":"v,u:SRv, u : \\mathcal{S} \\to \\mathbb{R}v,u:SR","key":"CygdmGBiL0"},{"type":"text","value":", if we repeatedly apply the\nBellman operator to each of them, they will get closer and closer\ntogether at an exponential rate.","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"pBRbUcynIW"}],"key":"ipvpQWQBIT"},{"type":"proof","kind":"definition","label":"contraction","identifier":"contraction","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Contraction mapping","position":{"start":{"line":1001,"column":1},"end":{"line":1001,"column":1}},"key":"WiofhdEG0J"}],"key":"XKhP5ne4xK"},{"type":"paragraph","position":{"start":{"line":1004,"column":1},"end":{"line":1005,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"H4mg3qjPQn"},{"type":"inlineMath","value":"X","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"XXX","key":"ACJsgrq2Et"},{"type":"text","value":" be some space with a norm ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"wYDrjJ0VCl"},{"type":"inlineMath","value":"\\|\\cdot\\|","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"\\|\\cdot\\|","key":"lWs0fuhWoD"},{"type":"text","value":". 
We call an operator\n","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"L7RAIUPdXa"},{"type":"inlineMath","value":"f: X \\to X","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"f:XXf: X \\to Xf:XX","key":"THeetxgXqC"},{"type":"text","value":" a ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"nls0nFpUjf"},{"type":"strong","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"children":[{"type":"text","value":"contraction mapping","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"v8LPxq3yoj"}],"key":"yJGMxBZcaf"},{"type":"text","value":" if for any ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"cJNezHz1dP"},{"type":"inlineMath","value":"x, y \\in X","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"x,yXx, y \\in Xx,yX","key":"gBIV1S9qSo"},{"type":"text","value":",","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"qd5eLIUurL"}],"key":"fOneFMhzc7"},{"type":"math","value":"\\|f(x) - f(y)\\| \\le \\gamma \\|x - y\\|","position":{"start":{"line":1007,"column":1},"end":{"line":1007,"column":1}},"html":"f(x)f(y)γxy\\|f(x) - f(y)\\| \\le \\gamma \\|x - y\\|f(x)f(y)γxy","enumerator":"1.34","key":"OQfpcC9Ds0"},{"type":"paragraph","position":{"start":{"line":1009,"column":1},"end":{"line":1011,"column":1}},"children":[{"type":"text","value":"for some fixed ","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"tggrQVF0hv"},{"type":"inlineMath","value":"\\gamma \\in (0, 1)","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"html":"γ(0,1)\\gamma \\in (0, 1)γ(0,1)","key":"yUzuWPqtVP"},{"type":"text","value":".\nIntuitively, this means that if two points are ","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"Oh6CPYqOjP"},{"type":"text","value":"δ","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"CVJMslW2B2"},{"type":"text","value":" far apart,\nafter applying the mapping,","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"bG5aGYIKeT"}],"key":"NIZixUY8EJ"}],"enumerator":"1.12","html_id":"contraction","key":"mPfov96t9d"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"iptmcdcFV0"}],"key":"evwdyttaBE"},{"type":"paragraph","position":{"start":{"line":1016,"column":1},"end":{"line":1017,"column":1}},"children":[{"type":"text","value":"Show that for a contraction mapping ","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"CGRVvuJPol"},{"type":"inlineMath","value":"f","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"html":"fff","key":"fdB2ZPj1mb"},{"type":"text","value":" with coefficient\n","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"j4OAEQfsqy"},{"type":"text","value":"γ","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"NzvRhekguR"},{"type":"text","value":", for all ","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"IUywX33z2Q"},{"type":"inlineMath","value":"t \\in \\mathbb{N}","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"html":"tNt \\in 
\\mathbb{N}tN","key":"HRFLyrbWEc"},{"type":"text","value":",","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"CIuujYKNRg"}],"key":"Nu45YeQHOj"},{"type":"math","value":"\\|f^{(t)}(x) - f^{(t)}(y)\\| \\le \\gamma^t \\|x - y\\|,","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"html":"f(t)(x)f(t)(y)γtxy,\\|f^{(t)}(x) - f^{(t)}(y)\\| \\le \\gamma^t \\|x - y\\|,f(t)(x)f(t)(y)γtxy,","enumerator":"1.35","key":"cFNx1zOaKH"},{"type":"paragraph","position":{"start":{"line":1021,"column":1},"end":{"line":1023,"column":1}},"children":[{"type":"text","value":"i.e. that any\ntwo points will be pushed closer by at least a factor of ","position":{"start":{"line":1021,"column":1},"end":{"line":1021,"column":1}},"key":"Y2WohtaOYU"},{"type":"text","value":"γ","position":{"start":{"line":1021,"column":1},"end":{"line":1021,"column":1}},"key":"UfpveUUgL6"},{"type":"text","value":" at\neach iteration.","position":{"start":{"line":1021,"column":1},"end":{"line":1021,"column":1}},"key":"leynOMQMcd"}],"key":"Rb8JmI7PIC"}],"key":"iAiy3Cmnk2"},{"type":"paragraph","position":{"start":{"line":1026,"column":1},"end":{"line":1029,"column":1}},"children":[{"type":"text","value":"It is a powerful fact (known as the ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"yv1xOtbk6N"},{"type":"strong","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"children":[{"type":"text","value":"Banach fixed-point theorem","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"F4GT7cNlEK"}],"key":"sTUxA4ijC4"},{"type":"text","value":") that\nevery contraction mapping has a unique ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"v9eWDUDhsn"},{"type":"strong","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"children":[{"type":"text","value":"fixed point","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"rikuVyPkPz"}],"key":"jHVXB3OxpU"},{"type":"text","value":" ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"BpnFNAp6bR"},{"type":"inlineMath","value":"x^\\star","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"xx^\\starx","key":"kwM1iwsSKo"},{"type":"text","value":" such\nthat ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"lsftPzLfF8"},{"type":"inlineMath","value":"f(x^\\star) = x^\\star","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"f(x)=xf(x^\\star) = x^\\starf(x)=x","key":"IvE71uaWRA"},{"type":"text","value":". 
This means that if we repeatedly apply ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"EtdVVz6KVm"},{"type":"inlineMath","value":"f","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"fff","key":"Z7vsgq6Mzo"},{"type":"text","value":"\nto any starting point, we will eventually converge to ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"fnldjlphnw"},{"type":"inlineMath","value":"x^\\star","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"xx^\\starx","key":"bmZjnBV9qb"},{"type":"text","value":":","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"SaN9PHj2Ql"}],"key":"Vd4TPwXuPt"},{"type":"math","value":"\\|f^{(t)}(x) - x^\\star\\| \\le \\gamma^t \\|x - x^\\star\\|.","label":"contraction_convergence","identifier":"contraction_convergence","html":"f(t)(x)xγtxx.\\|f^{(t)}(x) - x^\\star\\| \\le \\gamma^t \\|x - x^\\star\\|.f(t)(x)xγtxx∥.","enumerator":"1.36","html_id":"contraction-convergence","key":"Ro08MkW4Rq"},{"type":"paragraph","position":{"start":{"line":1037,"column":1},"end":{"line":1040,"column":1}},"children":[{"type":"text","value":"Let’s return to the RL setting and apply this result to the Bellman\noperator. How can we measure the distance between two “value functions”\n","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"ockcGQwcIR"},{"type":"inlineMath","value":"v, u : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"html":"v,u:SRv, u : \\mathcal{S} \\to \\mathbb{R}v,u:SR","key":"Io3Djja5x7"},{"type":"text","value":"? We’ll take the ","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"UAvJQotecT"},{"type":"strong","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"children":[{"type":"text","value":"supremum norm","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"WmqOsl1g45"}],"key":"qhve5DDdzH"},{"type":"text","value":" as our distance\nmetric:","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"zMBznxXP4V"}],"key":"JnpOjXo83m"},{"type":"math","value":"\\| v - u \\|_{\\infty} := \\sup_{s \\in \\mathcal{S}} |v(s) - u(s)|,","position":{"start":{"line":1042,"column":1},"end":{"line":1042,"column":1}},"html":"vu:=supsSv(s)u(s),\\| v - u \\|_{\\infty} := \\sup_{s \\in \\mathcal{S}} |v(s) - u(s)|,vu:=sSsupv(s)u(s),","enumerator":"1.37","key":"rAjh5QYYfN"},{"type":"paragraph","position":{"start":{"line":1044,"column":1},"end":{"line":1048,"column":1}},"children":[{"type":"text","value":"i.e.\nwe compare the “value functions” on the state that causes the biggest\ngap between them. 
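As a tiny illustration of this fixed-point iteration (a sketch, not part of the original notebook; the map f(x) = γx + c and its constants are chosen purely for demonstration), repeatedly applying a contraction on the real line converges to its unique fixed point c/(1-γ):

```python
gamma, c = 0.9, 1.0            # hypothetical contraction coefficient and offset
f = lambda x: gamma * x + c    # |f(x) - f(y)| = gamma * |x - y|, so f is a contraction
x = 0.0                        # arbitrary starting point
for _ in range(100):
    x = f(x)
print(x)  # approaches the fixed point c / (1 - gamma) = 10; the gap shrinks by gamma each step
```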
Then ","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"CXIXi9EBy0"},{"type":"crossReference","kind":"equation","identifier":"contraction_convergence","label":"contraction_convergence","children":[{"type":"text","value":"(","key":"KmvmYrS2Wd"},{"type":"text","value":"1.36","key":"Uro6dcJv0a"},{"type":"text","value":")","key":"MTzNM3GZ75"}],"template":"(%s)","enumerator":"1.36","resolved":true,"html_id":"contraction-convergence","key":"KLFImq48Zs"},{"type":"text","value":" implies that if we repeatedly\napply ","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"a3Si0x3HPj"},{"type":"inlineMath","value":"\\mathcal{J}^\\pi","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"html":"Jπ\\mathcal{J}^\\piJπ","key":"MUYtbDzeql"},{"type":"text","value":" to any starting “value function”, we will eventually\nconverge to ","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"zXTYRlJ7C7"},{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"html":"VπV^\\piVπ","key":"edoMZW9MP5"},{"type":"text","value":":","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"iXNKQpGAAA"}],"key":"wqETSU0iUD"},{"type":"math","value":"\\|(\\mathcal{J}^\\pi)^{(t)}(v) - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v - V^\\pi\\|_{\\infty}.","label":"bellman_convergence","identifier":"bellman_convergence","html":"(Jπ)(t)(v)VπγtvVπ.\\|(\\mathcal{J}^\\pi)^{(t)}(v) - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v - V^\\pi\\|_{\\infty}.(Jπ)(t)(v)VπγtvVπ.","enumerator":"1.38","html_id":"bellman-convergence","key":"GTCWHp5wXP"},{"type":"paragraph","position":{"start":{"line":1056,"column":1},"end":{"line":1057,"column":1}},"children":[{"type":"text","value":"We’ll use this useful fact to prove the convergence of several\nalgorithms later on.","position":{"start":{"line":1056,"column":1},"end":{"line":1056,"column":1}},"key":"KoRYEuvTrb"}],"key":"Iuc1cRXGYF"},{"type":"proof","kind":"theorem","label":"bellman_contraction","identifier":"bellman_contraction","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The Bellman operator is a contraction mapping","position":{"start":{"line":1059,"column":1},"end":{"line":1059,"column":1}},"key":"moDs11uTrY"}],"key":"dYSZSY1i2f"},{"type":"math","value":"\\|\\mathcal{J}^{\\pi} (v) - \\mathcal{J}^{\\pi} (u) \\|_{\\infty} \\le \\gamma \\|v - u \\|_{\\infty}.","position":{"start":{"line":1062,"column":1},"end":{"line":1064,"column":1}},"html":"Jπ(v)Jπ(u)γvu.\\|\\mathcal{J}^{\\pi} (v) - \\mathcal{J}^{\\pi} (u) \\|_{\\infty} \\le \\gamma \\|v - u \\|_{\\infty}.Jπ(v)Jπ(u)γvu.","enumerator":"1.39","key":"ulvJoGyNrp"}],"enumerator":"1.4","html_id":"bellman-contraction","key":"QehsBfOJdz"},{"type":"proof","kind":"proof","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Proof of ","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"KHed2TJ8ov"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_contraction","label":"bellman_contraction","children":[{"type":"text","value":"Theorem ","key":"gGUInGimn7"},{"type":"text","value":"1.4","key":"l4vGVAtYGI"}],"template":"Theorem 
%s","enumerator":"1.4","resolved":true,"html_id":"bellman-contraction","key":"tEvjLVxT9q"}],"key":"tNpD8Wx3h9"},{"type":"paragraph","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"children":[{"type":"text","value":"For all states ","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"key":"WxOHe3HIwe"},{"type":"inlineMath","value":"s \\in \\mathcal{S}","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"html":"sSs \\in \\mathcal{S}sS","key":"jtBAQ5QL72"},{"type":"text","value":",","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"key":"Il6E0j2LIu"}],"key":"cHPDWuWolw"},{"type":"math","value":"\\begin{aligned}\n|[\\mathcal{J}^{\\pi} (v)](s) - [\\mathcal{J}^{\\pi} (u)](s)|&= \\Big| \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[ r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} v(s') \\right] \\\\\n&\\qquad - \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} u(s') \\right] \\Big| \\\\\n&= \\gamma \\left|\\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} [v(s') - u(s')] \\right| \\\\\n&\\le \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}|v(s') - u(s')| \\qquad \\text{(Jensen's inequality)} \\\\\n&\\le \\gamma \\max_{s'} |v(s') - u(s')| \\\\\n&= \\gamma \\|v - u \\|_{\\infty}.\n\\end{aligned}","position":{"start":{"line":1071,"column":1},"end":{"line":1080,"column":1}},"html":"[Jπ(v)](s)[Jπ(u)](s)=Eaπ(s)[r(s,a)+γEsP(s,a)v(s)]Eaπ(s)[r(s,a)+γEsP(s,a)u(s)]=γEsP(s,a)[v(s)u(s)]γEsP(s,a)v(s)u(s)(Jensen’s inequality)γmaxsv(s)u(s)=γvu.\\begin{aligned}\n|[\\mathcal{J}^{\\pi} (v)](s) - [\\mathcal{J}^{\\pi} (u)](s)|&= \\Big| \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[ r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} v(s') \\right] \\\\\n&\\qquad - \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} u(s') \\right] \\Big| \\\\\n&= \\gamma \\left|\\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} [v(s') - u(s')] \\right| \\\\\n&\\le \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}|v(s') - u(s')| \\qquad \\text{(Jensen's inequality)} \\\\\n&\\le \\gamma \\max_{s'} |v(s') - u(s')| \\\\\n&= \\gamma \\|v - u \\|_{\\infty}.\n\\end{aligned}[Jπ(v)](s)[Jπ(u)](s)=Eaπ(s)[r(s,a)+γEsP(s,a)v(s)]Eaπ(s)[r(s,a)+γEsP(s,a)u(s)]=γEsP(s,a)[v(s)u(s)]γEsP(s,a)v(s)u(s)(Jensen’s inequality)γsmaxv(s)u(s)=γvu.","enumerator":"1.40","key":"YfD9SfQA0e"}],"enumerator":"1.2","key":"HTfOL3E4mR"},{"type":"heading","depth":3,"position":{"start":{"line":1083,"column":1},"end":{"line":1083,"column":1}},"children":[{"type":"text","value":"Policy evaluation in infinite-horizon MDPs","position":{"start":{"line":1083,"column":1},"end":{"line":1083,"column":1}},"key":"C8J756gOxG"}],"identifier":"policy-evaluation-in-infinite-horizon-mdps","label":"Policy evaluation in infinite-horizon MDPs","html_id":"policy-evaluation-in-infinite-horizon-mdps","implicit":true,"enumerator":"1.5.2","key":"Co9NA0vAxk"},{"type":"paragraph","position":{"start":{"line":1085,"column":1},"end":{"line":1087,"column":1}},"children":[{"type":"text","value":"The backwards DP technique we used in ","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"key":"Z1lK0mSLqK"},{"type":"crossReference","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"children":[{"type":"text","value":"the finite-horizon 
case","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"key":"pW2SonWUqt"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","key":"hcj58GhtRU"},{"type":"text","value":" no\nlonger works since there is no “final timestep” to start from. We’ll\nneed another approach to policy evaluation.","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"key":"dlxWo21N6Y"}],"key":"s0BbdsJr9X"},{"type":"paragraph","position":{"start":{"line":1089,"column":1},"end":{"line":1092,"column":1}},"children":[{"type":"text","value":"The Bellman consistency conditions yield a system of equations we can\nsolve to evaluate a deterministic policy ","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"key":"EzjHAu2CxB"},{"type":"emphasis","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"children":[{"type":"text","value":"exactly","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"key":"YSWoKWi4A8"}],"key":"sS6yShvSyJ"},{"type":"text","value":". For a faster approximate solution,\nwe can iterate the policy’s Bellman operator, since we know that it has\na unique fixed point at the true value function.","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"key":"nQR95ACcSb"}],"key":"S38INlp3IN"},{"type":"heading","depth":4,"position":{"start":{"line":1094,"column":1},"end":{"line":1094,"column":1}},"children":[{"type":"text","value":"Matrix inversion for deterministic policies","position":{"start":{"line":1094,"column":1},"end":{"line":1094,"column":1}},"key":"KktIXDnAZP"}],"identifier":"matrix-inversion-for-deterministic-policies","label":"Matrix inversion for deterministic policies","html_id":"matrix-inversion-for-deterministic-policies","implicit":true,"enumerator":"1.5.2.1","key":"Kq6JI3wwso"},{"type":"paragraph","position":{"start":{"line":1096,"column":1},"end":{"line":1098,"column":1}},"children":[{"type":"text","value":"Note that when the policy ","position":{"start":{"line":1096,"column":1},"end":{"line":1096,"column":1}},"key":"JVMysCgtlE"},{"type":"text","value":"π","position":{"start":{"line":1096,"column":1},"end":{"line":1096,"column":1}},"key":"YADNxMYKY1"},{"type":"text","value":" is deterministic, the actions can be\ndetermined from the states, and so we can chop off the action dimension\nfor the rewards and state transitions:","position":{"start":{"line":1096,"column":1},"end":{"line":1096,"column":1}},"key":"afLZaxlLnV"}],"key":"tXit5AEMXr"},{"type":"math","value":"\\begin{aligned}\n r^{\\pi} &\\in \\mathbb{R}^{|\\mathcal{S}|} & P^{\\pi} &\\in [0, 1]^{|\\mathcal{S}| \\times |\\mathcal{S}|} & \\mu &\\in [0, 1]^{|\\mathcal{S}|} \\\\\n \\pi &\\in \\mathcal{A}^{|\\mathcal{S}|} & V^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}|} & Q^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}.\n\\end{aligned}","position":{"start":{"line":1100,"column":1},"end":{"line":1105,"column":1}},"html":"rπRSPπ[0,1]S×Sμ[0,1]SπASVπRSQπRS×A.\\begin{aligned}\n r^{\\pi} &\\in \\mathbb{R}^{|\\mathcal{S}|} & P^{\\pi} &\\in [0, 1]^{|\\mathcal{S}| \\times |\\mathcal{S}|} & \\mu &\\in [0, 1]^{|\\mathcal{S}|} \\\\\n \\pi &\\in \\mathcal{A}^{|\\mathcal{S}|} & V^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}|} & Q^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}| \\times 
|\\mathcal{A}|}.\n\\end{aligned}rππRSASPπVπ[0,1]S×SRSμQπ[0,1]SRS×A.","enumerator":"1.41","key":"mKxzuJX4uD"},{"type":"paragraph","position":{"start":{"line":1107,"column":1},"end":{"line":1109,"column":1}},"children":[{"type":"text","value":"For ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"HcjgdtkQko"},{"type":"inlineMath","value":"P^\\pi","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"PπP^\\piPπ","key":"gTlhzGkbiv"},{"type":"text","value":", we’ll treat the rows as the states and the\ncolumns as the next states. Then ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"UxHEWfbE66"},{"type":"inlineMath","value":"P^\\pi_{s, s'}","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"Ps,sπP^\\pi_{s, s'}Ps,sπ","key":"zEFAeinv72"},{"type":"text","value":" is the probability of\ntransitioning from state ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"iTUrXovAue"},{"type":"inlineMath","value":"s","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"sss","key":"WpsmWF4ote"},{"type":"text","value":" to state ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"TMpCcoCeg4"},{"type":"inlineMath","value":"s'","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"ss's","key":"qkxhtCqUGB"},{"type":"text","value":" under policy ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"BacgOl7dXb"},{"type":"text","value":"π","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"sOMACHeyPt"},{"type":"text","value":".","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"YHMVCvlBVK"}],"key":"OdywGqbc4O"},{"type":"proof","kind":"example","label":"tidy_tabular","identifier":"tidy_tabular","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying MDP","position":{"start":{"line":1111,"column":1},"end":{"line":1111,"column":1}},"key":"XMoPvyW9aA"}],"key":"AcGfRJCNgv"},{"type":"paragraph","position":{"start":{"line":1114,"column":1},"end":{"line":1116,"column":1}},"children":[{"type":"text","value":"The tabular MDP from before has ","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"uhwe7f1FhB"},{"type":"inlineMath","value":"|\\mathcal{S}| = 2","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"html":"S=2|\\mathcal{S}| = 2S=2","key":"dgCXV9NGCY"},{"type":"text","value":" and ","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"iq4heP4j9w"},{"type":"inlineMath","value":"|\\mathcal{A}| = 2","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"html":"A=2|\\mathcal{A}| = 2A=2","key":"Jpb4W0JynO"},{"type":"text","value":". 
Let’s write\ndown the quantities for the policy ","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"fwUt82LvAN"},{"type":"text","value":"π","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"jR8g74nd8G"},{"type":"text","value":" that tidies if and only if the\nroom is messy:","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"yMdy52DrVQ"}],"key":"iedJCe3xHz"},{"type":"math","value":"r^{\\pi} = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}, \\quad\n P^{\\pi} = \\begin{bmatrix} 0.7 & 0.3 \\\\ 1 & 0 \\end{bmatrix}, \\quad\n \\mu = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}","position":{"start":{"line":1118,"column":1},"end":{"line":1120,"column":1}},"html":"rπ=[10],Pπ=[0.70.310],μ=[10]r^{\\pi} = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}, \\quad\n P^{\\pi} = \\begin{bmatrix} 0.7 & 0.3 \\\\ 1 & 0 \\end{bmatrix}, \\quad\n \\mu = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}rπ=[10],Pπ=[0.710.30],μ=[10]","enumerator":"1.42","key":"M8HpQzzq8d"},{"type":"paragraph","position":{"start":{"line":1122,"column":1},"end":{"line":1123,"column":1}},"children":[{"type":"text","value":"We’ll see how to\nevaluate this policy in the next section.","position":{"start":{"line":1122,"column":1},"end":{"line":1122,"column":1}},"key":"MX1fpzUf4Q"}],"key":"IQPdLS7Ri5"}],"enumerator":"1.5","html_id":"tidy-tabular","key":"Qv8iBIQj7h"},{"type":"paragraph","position":{"start":{"line":1126,"column":1},"end":{"line":1127,"column":1}},"children":[{"type":"text","value":"The Bellman consistency equation for a deterministic policy can be\nwritten in tabular notation as","position":{"start":{"line":1126,"column":1},"end":{"line":1126,"column":1}},"key":"nM26lL13iz"}],"key":"eOmietn7AX"},{"type":"math","value":"V^\\pi = r^\\pi + \\gamma P^\\pi V^\\pi.","position":{"start":{"line":1129,"column":1},"end":{"line":1129,"column":1}},"html":"Vπ=rπ+γPπVπ.V^\\pi = r^\\pi + \\gamma P^\\pi V^\\pi.Vπ=rπ+γPπVπ.","enumerator":"1.43","key":"a6Bf4iFZnA"},{"type":"paragraph","position":{"start":{"line":1131,"column":1},"end":{"line":1133,"column":1}},"children":[{"type":"text","value":"(Unfortunately, this notation doesn’t simplify the expression for\n","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"key":"nCiLiACPc1"},{"type":"inlineMath","value":"Q^\\pi","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"html":"QπQ^\\piQπ","key":"V0FzlbBNz8"},{"type":"text","value":".) 
This system of equations can be solved with a matrix\ninversion:","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"key":"pVsKtpXOcV"}],"key":"QAAGkeoZUv"},{"type":"math","value":"V^\\pi = (I - \\gamma P^\\pi)^{-1} r^\\pi.","label":"matrix_inversion_pe","identifier":"matrix_inversion_pe","html":"Vπ=(IγPπ)1rπ.V^\\pi = (I - \\gamma P^\\pi)^{-1} r^\\pi.Vπ=(IγPπ)1rπ.","enumerator":"1.44","html_id":"matrix-inversion-pe","key":"IVnZCVAKNZ"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"yLaGVmJ389"}],"key":"ockRNuOtek"},{"type":"paragraph","position":{"start":{"line":1142,"column":1},"end":{"line":1143,"column":1}},"children":[{"type":"text","value":"Note we’ve assumed that ","position":{"start":{"line":1142,"column":1},"end":{"line":1142,"column":1}},"key":"DQquqxIPTm"},{"type":"inlineMath","value":"I - \\gamma P^\\pi","position":{"start":{"line":1142,"column":1},"end":{"line":1142,"column":1}},"html":"IγPπI - \\gamma P^\\piIγPπ","key":"pO4Pt5c9Mu"},{"type":"text","value":" is invertible. Can you see\nwhy this is the case?","position":{"start":{"line":1142,"column":1},"end":{"line":1142,"column":1}},"key":"gRijrtGEkK"}],"key":"Af0I80QW95"},{"type":"paragraph","position":{"start":{"line":1145,"column":1},"end":{"line":1149,"column":1}},"children":[{"type":"text","value":"(Recall that a linear operator, i.e. a square matrix, is invertible if\nand only if its null space is trivial; that is, it doesn’t map any\nnonzero vector to zero. In this case, we can see that ","position":{"start":{"line":1145,"column":1},"end":{"line":1145,"column":1}},"key":"xxeyhgyp9P"},{"type":"inlineMath","value":"I - \\gamma P^\\pi","position":{"start":{"line":1145,"column":1},"end":{"line":1145,"column":1}},"html":"IγPπI - \\gamma P^\\piIγPπ","key":"JKDWPFZKSU"},{"type":"text","value":"\nis invertible because it maps any nonzero vector to a vector with at\nleast one nonzero element.)","position":{"start":{"line":1145,"column":1},"end":{"line":1145,"column":1}},"key":"tasLd218aG"}],"key":"If070nZ9DF"}],"key":"ObFHKdkGbQ"}],"key":"FpxCEnKeLr"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def eval_deterministic_infinite(\n mdp: MDP, policy: Float[Array, \"S A\"]\n) -> Float[Array, \" S\"]:\n pi = jnp.argmax(policy, axis=1) # un-one-hot\n P_π = mdp.P[jnp.arange(mdp.S), pi]\n r_π = mdp.r[jnp.arange(mdp.S), pi]\n return jnp.linalg.solve(jnp.eye(mdp.S) - mdp.γ * P_π, r_π)","key":"kPTXzoVkjP"},{"type":"output","id":"lR3IMnfeh6ceeBjRS-hp8","data":[],"key":"vnJ2mohdEf"}],"data":{},"key":"i8ZGFIxzlB"},{"type":"block","children":[{"type":"proof","kind":"example","label":"tidy_eval_infinite","identifier":"tidy_eval_infinite","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying policy evaluation","position":{"start":{"line":1162,"column":1},"end":{"line":1162,"column":1}},"key":"S5k5uQGZ3n"}],"key":"bkK7dKODuj"},{"type":"paragraph","position":{"start":{"line":1165,"column":1},"end":{"line":1166,"column":1}},"children":[{"type":"text","value":"Let’s use the same policy ","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"tPkaRi2RLc"},{"type":"text","value":"π","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"qwawNw7TgY"},{"type":"text","value":" that tidies if and only if the room is\nmessy. 
Setting ","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"wBzFuAApt8"},{"type":"inlineMath","value":"\\gamma = 0.95","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"html":"γ=0.95\\gamma = 0.95γ=0.95","key":"CbDNzJHtrx"},{"type":"text","value":", we must invert","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"V3CvTq3D6i"}],"key":"YXTGaeCEWY"},{"type":"math","value":"I - \\gamma P^{\\pi} = \\begin{bmatrix} 1 - 0.95 \\times 0.7 & - 0.95 \\times 0.3 \\\\ - 0.95 \\times 1 & 1 - 0.95 \\times 0 \\end{bmatrix} = \\begin{bmatrix} 0.335 & -0.285 \\\\ -0.95 & 1 \\end{bmatrix}.","position":{"start":{"line":1168,"column":1},"end":{"line":1168,"column":1}},"html":"IγPπ=[10.95×0.70.95×0.30.95×110.95×0]=[0.3350.2850.951].I - \\gamma P^{\\pi} = \\begin{bmatrix} 1 - 0.95 \\times 0.7 & - 0.95 \\times 0.3 \\\\ - 0.95 \\times 1 & 1 - 0.95 \\times 0 \\end{bmatrix} = \\begin{bmatrix} 0.335 & -0.285 \\\\ -0.95 & 1 \\end{bmatrix}.IγPπ=[10.95×0.70.95×10.95×0.310.95×0]=[0.3350.950.2851].","enumerator":"1.45","key":"tasrJoEOIn"},{"type":"paragraph","position":{"start":{"line":1170,"column":1},"end":{"line":1170,"column":1}},"children":[{"type":"text","value":"The inverse to two decimal points is","position":{"start":{"line":1170,"column":1},"end":{"line":1170,"column":1}},"key":"kZWniX1e2g"}],"key":"i6uHJnUmPc"},{"type":"math","value":"(I - \\gamma P^{\\pi})^{-1} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix}.","position":{"start":{"line":1172,"column":1},"end":{"line":1172,"column":1}},"html":"(IγPπ)1=[15.564.4414.795.21].(I - \\gamma P^{\\pi})^{-1} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix}.(IγPπ)1=[15.5614.794.445.21].","enumerator":"1.46","key":"wjXCgx14ke"},{"type":"paragraph","position":{"start":{"line":1174,"column":1},"end":{"line":1174,"column":1}},"children":[{"type":"text","value":"Thus the value function is","position":{"start":{"line":1174,"column":1},"end":{"line":1174,"column":1}},"key":"HRtjY5nMt3"}],"key":"wODe0cmsev"},{"type":"math","value":"V^{\\pi} = (I - \\gamma P^{\\pi})^{-1} r^{\\pi} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix} \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix} = \\begin{bmatrix} 15.56 \\\\ 14.79 \\end{bmatrix}.","position":{"start":{"line":1176,"column":1},"end":{"line":1176,"column":1}},"html":"Vπ=(IγPπ)1rπ=[15.564.4414.795.21][10]=[15.5614.79].V^{\\pi} = (I - \\gamma P^{\\pi})^{-1} r^{\\pi} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix} \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix} = \\begin{bmatrix} 15.56 \\\\ 14.79 \\end{bmatrix}.Vπ=(IγPπ)1rπ=[15.5614.794.445.21][10]=[15.5614.79].","enumerator":"1.47","key":"BXfGD9iaVT"},{"type":"paragraph","position":{"start":{"line":1178,"column":1},"end":{"line":1181,"column":1}},"children":[{"type":"text","value":"Let’s sanity-check this result. 
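As a quick numerical cross-check of the hand calculation above (a sketch, not part of the original notebook, assuming only `jax.numpy`, which the surrounding cells already use), we can solve the same 2×2 linear system directly:

```python
import jax.numpy as jnp

# Quantities from the tidying example: reward vector r^π, transition matrix P^π, discount γ.
r_pi = jnp.array([1.0, 0.0])
P_pi = jnp.array([[0.7, 0.3],
                  [1.0, 0.0]])
gamma = 0.95

# Solve (I - γ P^π) V^π = r^π rather than forming the inverse explicitly.
V_pi = jnp.linalg.solve(jnp.eye(2) - gamma * P_pi, r_pi)
print(V_pi)  # roughly [15.56, 14.79], matching the hand computation
```

Solving the linear system is numerically preferable to forming the inverse, although for a 2×2 example either approach works.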
Since rewards are at most ","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"IpHCKZQxbr"},{"type":"text","value":"1","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"XpOwJwV4WV"},{"type":"text","value":", the\nmaximum cumulative return of a trajectory is at most\n","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"WNp3xdseHg"},{"type":"inlineMath","value":"1/(1-\\gamma) = 20","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"html":"1/(1γ)=201/(1-\\gamma) = 201/(1γ)=20","key":"w9O9ighFVC"},{"type":"text","value":". We see that the value function is indeed slightly\nlower than this.","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"wRZq20WDDj"}],"key":"er8NUTOwyR"}],"enumerator":"1.6","html_id":"tidy-eval-infinite","key":"r3uzYwNogw"}],"key":"xTErSz1mtM"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"eval_deterministic_infinite(tidy_mdp_inf, tidy_policy_messy_only[0])","key":"WpAZJi0yZ1"},{"type":"output","id":"dVXQnKoDUfx14cYpjTGaf","data":[{"output_type":"execute_result","execution_count":19,"metadata":{},"data":{"text/plain":{"content":"Array([15.56419, 14.78598], dtype=float32)","content_type":"text/plain"}}}],"key":"L3ypiFP93G"}],"data":{},"key":"Y0v9LipI2R"},{"type":"block","children":[{"type":"heading","depth":4,"position":{"start":{"line":1189,"column":1},"end":{"line":1189,"column":1}},"children":[{"type":"text","value":"Iterative policy evaluation","position":{"start":{"line":1189,"column":1},"end":{"line":1189,"column":1}},"key":"dSpwlulXai"}],"label":"iterative_pe","identifier":"iterative_pe","html_id":"iterative-pe","enumerator":"1.5.2.2","key":"E7TskrYI2Y"},{"type":"paragraph","position":{"start":{"line":1191,"column":1},"end":{"line":1194,"column":1}},"children":[{"type":"text","value":"The matrix inversion above takes roughly ","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"Z5yHYL3GeG"},{"type":"inlineMath","value":"O(|\\mathcal{S}|^3)","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"html":"O(S3)O(|\\mathcal{S}|^3)O(S3)","key":"az1AaHbVon"},{"type":"text","value":" time.\nIt also only works for deterministic policies.\nCan we trade off the requirement of finding the ","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"X9LmoYItLM"},{"type":"emphasis","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"children":[{"type":"text","value":"exact","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"WiyVt0AR3t"}],"key":"PqMl70YGQI"},{"type":"text","value":" value function for a faster\n","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"AuVwiqAodP"},{"type":"emphasis","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"WvYeYb6kmM"}],"key":"LYN7yhlhrC"},{"type":"text","value":" algorithm that will also extend to stochastic policies?","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"cDl0YHAkyT"}],"key":"eXW3uvqoK6"},{"type":"paragraph","position":{"start":{"line":1196,"column":1},"end":{"line":1199,"column":1}},"children":[{"type":"text","value":"Let’s use the Bellman operator to 
define an iterative algorithm for computing the value function. We'll start with an initial guess $v^{(0)}$ with elements in $[0, 1/(1-\gamma)]$ and then iterate the Bellman operator:

$$
v^{(t+1)} = \mathcal{J}^{\pi}(v^{(t)}),
$$ (1.48)

i.e. $v^{(t)} = (\mathcal{J}^{\pi})^{(t)} (v^{(0)})$. Note that each iteration takes $O(|\mathcal{S}|^2)$ time for the matrix-vector multiplication.

```python
def supremum_norm(v):
    return jnp.max(jnp.abs(v))  # same as jnp.linalg.norm(v, jnp.inf)


def loop_until_convergence(op, v, ε=1e-6):
    """Repeatedly apply op to v until convergence (in supremum norm)."""
    while True:
        v_new = op(v)
        if supremum_norm(v_new - v) < ε:
            return v_new
        v = v_new


def iterative_evaluation(mdp: MDP, pi: Float[Array, "S A"], ε=1e-6) -> Float[Array, " S"]:
    op = partial(bellman_operator, mdp, pi)
    return loop_until_convergence(op, jnp.zeros(mdp.S), ε)
```

Then, as we showed in (1.38), by the Banach fixed-point theorem:

$$
\|v^{(t)} - V^\pi \|_{\infty} \le \gamma^{t} \| v^{(0)} - V^\pi\|_{\infty}.
$$ (1.49)

```python
iterative_evaluation(tidy_mdp_inf, tidy_policy_messy_only[0])
```

```
Array([15.564166, 14.785956], dtype=float32)
```

**Remark 1.2** (Convergence of iterative policy evaluation). How many iterations do we need for an ε-accurate estimate? We can work backwards to solve for $t$:

$$
\begin{aligned}
    \gamma^t \|v^{(0)} - V^\pi\|_{\infty} &\le \epsilon \\
    t &\ge \frac{\log (\epsilon / \|v^{(0)} - V^\pi\|_{\infty})}{\log \gamma} \\
    &= \frac{\log (\|v^{(0)} - V^\pi\|_{\infty} / \epsilon)}{\log (1 / \gamma)},
\end{aligned}
$$ (1.50)

and so the number of iterations required for an ε-accurate estimate is

$$
T = O\left( \frac{1}{1-\gamma} \log\left(\frac{1}{\epsilon (1-\gamma)}\right) \right).
$$ (1.51)

Note that we've applied the inequalities $\|v^{(0)} - V^\pi\|_{\infty} \le 1/(1-\gamma)$ and $\log (1/x) \ge 1-x$.
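As a quick illustration of Remark 1.2, here is a small sketch (not part of the original notebook; the helper name is hypothetical) that computes the iteration count suggested by the bound, assuming the crude initialization bound $\|v^{(0)} - V^\pi\|_{\infty} \le 1/(1-\gamma)$:

```python
import math


def evaluation_iterations(γ: float, ε: float) -> int:
    """Hypothetical helper: number of Bellman-operator iterations sufficient for an
    ε-accurate value estimate, solving γ^t / (1 - γ) ≤ ε for t."""
    return math.ceil(math.log(1 / (ε * (1 - γ))) / math.log(1 / γ))


evaluation_iterations(γ=0.95, ε=1e-6)  # ≈ 328 iterations under these assumptions
```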
MDPs","position":{"start":{"line":1259,"column":1},"end":{"line":1259,"column":1}},"key":"rR0tXQ37g0"}],"identifier":"optimal-policies-in-infinite-horizon-mdps","label":"Optimal policies in infinite-horizon MDPs","html_id":"optimal-policies-in-infinite-horizon-mdps","implicit":true,"enumerator":"1.5.3","key":"rmgLCS5WYv"},{"type":"paragraph","position":{"start":{"line":1261,"column":1},"end":{"line":1266,"column":1}},"children":[{"type":"text","value":"Now let’s move on to solving for an optimal policy in the\ninfinite-horizon case. As in ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"VDCJs4q9sj"},{"type":"crossReference","kind":"proof:definition","identifier":"optimal_policy_finite","label":"optimal_policy_finite","children":[{"type":"text","value":"the finite-horizon case","key":"Caa1wzMSFN"}],"template":"Definition %s","enumerator":"1.10","resolved":true,"html_id":"optimal-policy-finite","key":"FqVz0rpilf"},{"type":"text","value":", an ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"GASkshhiq5"},{"type":"strong","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"bfL9iuqmi2"}],"key":"aG3sdGIhqd"},{"type":"text","value":" ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"nFQmo0Xo2a"},{"type":"inlineMath","value":"\\pi^\\star","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"π\\pi^\\starπ","key":"BgRaFsz5DD"},{"type":"text","value":"\nis one that does at least as well as any other policy in all situations.\nThat is, for all policies ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"LSdU6fq1bc"},{"type":"text","value":"π","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"WNS6hlb4uU"},{"type":"text","value":", states ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"Ertv187Yd9"},{"type":"inlineMath","value":"s \\in \\mathcal{S}","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"sSs \\in \\mathcal{S}sS","key":"Mn3RtgWjaa"},{"type":"text","value":", times\n","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"XZVuZJ48Me"},{"type":"inlineMath","value":"\\hi \\in \\mathbb{N}","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"hN\\hi \\in \\mathbb{N}hN","key":"lZW4Kd0kcM"},{"type":"text","value":", and initial trajectories\n","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"g5Lv6ryLD9"},{"type":"inlineMath","value":"\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi)","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"τh=(s0,a0,r0,,sh)\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi)τh=(s0,a0,r0,,sh)","key":"VB6Gb1r7OD"},{"type":"text","value":" where ","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"ZUkDCetbEu"},{"type":"inlineMath","value":"s_\\hi = s","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"html":"sh=ss_\\hi = ssh=s","key":"HEgKXIJXek"},{"type":"text","value":",","position":{"start":{"line":1261,"column":1},"end":{"line":1261,"column":1}},"key":"EgxBqAi5rf"}],"key":"oxmKFGrsNM"},{"type":"math","value":"\\begin{aligned}\n V^{\\pi^\\star}(s) &= \\E_{\\tau 
\\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid \\tau_\\hi]\n\\end{aligned}","label":"optimal_policy_infinite","identifier":"optimal_policy_infinite","html":"Vπ(s)=Eτρπ[rh+γrh+1+γ2rh+2+sh=s]Eτρπ[rh+γrh+1+γ2rh+2+τh]\\begin{aligned}\n V^{\\pi^\\star}(s) &= \\E_{\\tau \\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid \\tau_\\hi]\n\\end{aligned}Vπ(s)=Eτρπ[rh+γrh+1+γ2rh+2+sh=s]Eτρπ[rh+γrh+1+γ2rh+2+τh]","enumerator":"1.52","html_id":"optimal-policy-infinite","key":"ZMMBwnL3u4"},{"type":"paragraph","position":{"start":{"line":1278,"column":1},"end":{"line":1279,"column":1}},"children":[{"type":"text","value":"Once again, all optimal policies share the same ","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"key":"iC7A7z18em"},{"type":"strong","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"children":[{"type":"text","value":"optimal value function","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"key":"DVSyvHSUIN"}],"key":"lZXKkRcqnk"},{"type":"text","value":" ","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"key":"wEGTckKLdz"},{"type":"inlineMath","value":"V^\\star","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"html":"VV^\\starV","key":"fleKkBhDF2"},{"type":"text","value":", and the greedy policy with respect to this value function\nis optimal.","position":{"start":{"line":1278,"column":1},"end":{"line":1278,"column":1}},"key":"wKcnJL6GnI"}],"key":"OK8JYg66Od"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"zCxmgufPX7"}],"key":"bFTS0xUwCY"},{"type":"paragraph","position":{"start":{"line":1282,"column":1},"end":{"line":1282,"column":1}},"children":[{"type":"text","value":"Verify this by modifying the proof ","position":{"start":{"line":1282,"column":1},"end":{"line":1282,"column":1}},"key":"P0rGKhmmTW"},{"type":"crossReference","kind":"proof:theorem","identifier":"optimal_greedy","label":"optimal_greedy","children":[{"type":"text","value":"Theorem ","key":"pFVtpIHU4Z"},{"type":"text","value":"1.3","key":"w2zHu6wt69"}],"template":"Theorem %s","enumerator":"1.3","resolved":true,"html_id":"optimal-greedy","key":"VUFLdablgK"},{"type":"text","value":" from the finite-horizon case.","position":{"start":{"line":1282,"column":1},"end":{"line":1282,"column":1}},"key":"cLzBZicUJp"}],"key":"dAFfXeaEZE"}],"key":"oyIAhR3Pib"},{"type":"paragraph","position":{"start":{"line":1285,"column":1},"end":{"line":1289,"column":1}},"children":[{"type":"text","value":"So how can we compute such an optimal policy? 
We can’t use the backwards\nDP approach from the finite-horizon case ","position":{"start":{"line":1285,"column":1},"end":{"line":1285,"column":1}},"key":"EBKxoHoXJI"},{"type":"crossReference","kind":"proof:definition","identifier":"pi_star_dp","label":"pi_star_dp","children":[{"type":"text","value":"Definition ","key":"nNoHz9sNOk"},{"type":"text","value":"1.11","key":"DHJIVewtfo"}],"template":"Definition %s","enumerator":"1.11","resolved":true,"html_id":"pi-star-dp","key":"pd97uDlMgY"},{"type":"text","value":" since there’s no “final timestep” to start\nfrom. Instead, we’ll exploit the fact that the Bellman consistency\nequation ","position":{"start":{"line":1285,"column":1},"end":{"line":1285,"column":1}},"key":"hHF5rKQZIT"},{"type":"crossReference","kind":"equation","identifier":"bellman_consistency_infinite","label":"bellman_consistency_infinite","children":[{"type":"text","value":"(","key":"m9sMi2O2qA"},{"type":"text","value":"1.32","key":"q9c1qIhml2"},{"type":"text","value":")","key":"PlFiLd1SIl"}],"template":"(%s)","enumerator":"1.32","resolved":true,"html_id":"bellman-consistency-infinite","key":"GxK7F4z60W"},{"type":"text","value":" for the optimal value\nfunction doesn’t depend on any policy:","position":{"start":{"line":1285,"column":1},"end":{"line":1285,"column":1}},"key":"Z6XpXSDdPO"}],"key":"PS9CPOySUs"},{"type":"math","value":"V^\\star(s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^\\star(s'). \\right]","label":"bellman_optimality","identifier":"bellman_optimality","html":"V(s)=maxa[r(s,a)+γEsP(s,a)V(s).]V^\\star(s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^\\star(s'). \\right]V(s)=amax[r(s,a)+γEsP(s,a)V(s).]","enumerator":"1.53","html_id":"bellman-optimality","key":"X6Kx5LBtxn"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"lVHkqTRyLS"}],"key":"ytDwpbuffU"},{"type":"paragraph","position":{"start":{"line":1298,"column":1},"end":{"line":1299,"column":1}},"children":[{"type":"text","value":"Verify this by substituting the greedy policy into the\nBellman consistency equation.","position":{"start":{"line":1298,"column":1},"end":{"line":1298,"column":1}},"key":"hnVrFyBAkU"}],"key":"BOCaVazTwM"}],"key":"puGpkqYJDA"},{"type":"paragraph","position":{"start":{"line":1302,"column":1},"end":{"line":1303,"column":1}},"children":[{"type":"text","value":"As before, thinking of the r.h.s. 
of ","position":{"start":{"line":1302,"column":1},"end":{"line":1302,"column":1}},"key":"lMiL6z6en5"},{"type":"crossReference","kind":"equation","identifier":"bellman_optimality","label":"bellman_optimality","children":[{"type":"text","value":"(","key":"caHl5JVNni"},{"type":"text","value":"1.53","key":"H30Vg4vmPE"},{"type":"text","value":")","key":"QCu04tAbls"}],"template":"(%s)","enumerator":"1.53","resolved":true,"html_id":"bellman-optimality","key":"fpbTwTQj0N"},{"type":"text","value":" as an operator on value functions\ngives the ","position":{"start":{"line":1302,"column":1},"end":{"line":1302,"column":1}},"key":"UqN3JPePVu"},{"type":"strong","position":{"start":{"line":1302,"column":1},"end":{"line":1302,"column":1}},"children":[{"type":"text","value":"Bellman optimality operator","position":{"start":{"line":1302,"column":1},"end":{"line":1302,"column":1}},"key":"yIdOAUGIsH"}],"key":"VEvqBmfsYM"}],"key":"FA9eNzf0tE"},{"type":"math","value":"[\\mathcal{J}^{\\star}(v)](s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v(s') \\right]","label":"bellman_optimality_operator","identifier":"bellman_optimality_operator","html":"[J(v)](s)=maxa[r(s,a)+γEsP(s,a)v(s)][\\mathcal{J}^{\\star}(v)](s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v(s') \\right][J(v)](s)=amax[r(s,a)+γEsP(s,a)v(s)]","enumerator":"1.54","html_id":"bellman-optimality-operator","key":"TRmPxm6HvD"}],"key":"PZsM7GvEvn"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def bellman_optimality_operator(mdp: MDP, v: Float[Array, \" S\"]) -> Float[Array, \" S\"]:\n return jnp.max(mdp.r + mdp.γ * mdp.P @ v, axis=1)\n\n\ndef check_optimal(v: Float[Array, \" S\"], mdp: MDP):\n return jnp.allclose(v, bellman_optimality_operator(v, mdp))","key":"gLwFDv7NuW"},{"type":"output","id":"XzEoiYoOYIzM_jpa8U6CJ","data":[],"key":"QRinva7u61"}],"data":{},"key":"qUw0M5E0EA"},{"type":"block","children":[{"type":"heading","depth":4,"position":{"start":{"line":1321,"column":1},"end":{"line":1321,"column":1}},"children":[{"type":"text","value":"Value iteration","position":{"start":{"line":1321,"column":1},"end":{"line":1321,"column":1}},"key":"L0lykyIuea"}],"label":"value_iteration","identifier":"value_iteration","html_id":"value-iteration","enumerator":"1.5.3.1","key":"jflBhhfkZB"},{"type":"paragraph","position":{"start":{"line":1323,"column":1},"end":{"line":1326,"column":1}},"children":[{"type":"text","value":"Since the optimal policy is still a policy, our result that the Bellman\noperator is a contracting map still holds, and so we can repeatedly\napply this operator to converge to the optimal value function! 
#### Value iteration

Since the optimal policy is still a policy, our result that the Bellman operator is a contracting map still holds, and so we can repeatedly apply this operator to converge to the optimal value function! This algorithm is known as **value iteration**.

```python
def value_iteration(mdp: MDP, ε: float = 1e-6) -> Float[Array, " S"]:
    """Iterate the Bellman optimality operator until convergence."""
    op = partial(bellman_optimality_operator, mdp)
    return loop_until_convergence(op, jnp.zeros(mdp.S), ε)
```

```python
value_iteration(tidy_mdp_inf)
```

```
Array([15.564166, 14.785956], dtype=float32)
```

Note that the runtime analysis for an ε-optimal value function is exactly the same as iterative policy evaluation (Section 1.5.2.2)! This is because value iteration is simply the special case of applying iterative policy evaluation to the *optimal* value function.

As the final step of the algorithm, to return an actual policy $\hat \pi$, we can simply act greedily with respect to the final iteration $v^{(T)}$ of our above algorithm:

$$
\hat \pi(s) = \arg\max_a \left[ r(s, a) + \gamma \E_{s' \sim P(s, a)} v^{(T)}(s') \right].
$$ (1.55)

We must be careful, though: the value function of this greedy policy, $V^{\hat \pi}$, is *not* the same as $v^{(T)}$, which need not even be a well-defined value function for some policy!
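For concreteness, here is one way the greedy extraction step (1.55) could be implemented for the tabular MDP class used above. This is a hypothetical helper written in the same style as the notebook's code; the `v_to_greedy` helper used in the policy iteration cell below plays this role in the chapter itself.

```python
def greedy_policy_from_v(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, "S A"]:
    """Hypothetical helper: act greedily with respect to a value estimate v."""
    q = mdp.r + mdp.γ * mdp.P @ v                   # Q estimate induced by v, shape (S, A)
    return jnp.eye(mdp.A)[jnp.argmax(q, axis=1)]    # one-hot deterministic policy


greedy_policy_from_v(tidy_mdp_inf, value_iteration(tidy_mdp_inf))
```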
The bound on the policy's quality is actually quite loose: if $\|v^{(T)} - V^\star\|_{\infty} \le \epsilon$, then the greedy policy $\hat \pi$ satisfies $\|V^{\hat \pi} - V^\star\|_{\infty} \le \frac{2\gamma}{1-\gamma} \epsilon$, which might potentially be very large.

**Theorem 1.5** (Greedy policy value worsening).

$$
\|V^{\hat \pi} - V^\star \|_{\infty} \le \frac{2 \gamma}{1-\gamma} \|v - V^\star\|_{\infty}
$$ (1.56)

where $\hat \pi(s) = \arg\max_a q(s, a)$ is the greedy policy with respect to

$$
q(s, a) = r(s, a) + \gamma \E_{s' \sim P(s, a)} v(s').
$$ (1.57)

**Proof.** We first have

$$
\begin{aligned}
    V^{\star}(s) - V^{\hat \pi}(s) &= Q^{\star}(s,\pi^\star(s)) - Q^{\hat \pi}(s, \hat \pi(s))\\
    &= [Q^{\star}(s,\pi^\star(s)) - Q^{\star}(s, \hat \pi(s))] + [Q^{\star}(s, \hat \pi(s)) - Q^{\hat \pi}(s, \hat \pi(s))].
\end{aligned}
$$ (1.58)

Let's bound these two quantities separately.

For the first quantity, note that by the definition of $\hat \pi$, we have

$$
q(s, \hat \pi(s)) \ge q(s,\pi^\star(s)).
$$ (1.59)

Let's add $q(s, \hat \pi(s)) - q(s,\pi^\star(s)) \ge 0$ to the first term to get

$$
\begin{aligned}
    Q^{\star}(s,\pi^\star(s)) - Q^{\star}(s, \hat \pi(s)) &\le [Q^{\star}(s,\pi^\star(s))- q(s,\pi^\star(s))] + [q(s, \hat \pi(s)) - Q^{\star}(s, \hat \pi(s))] \\
    &= \gamma \E_{s' \sim P(s, \pi^{\star}(s))} [ V^{\star}(s') - v(s') ] + \gamma \E_{s' \sim P(s, \hat \pi(s))} [ v(s') - V^{\star}(s') ] \\
    &\le 2 \gamma \|v - V^{\star}\|_{\infty}.
\end{aligned}
$$ (1.60)

The second quantity is bounded by

$$
\begin{aligned}
    Q^{\star}(s, \hat \pi(s)) - Q^{\hat \pi}(s, \hat \pi(s))
    &= \gamma \E_{s'\sim P(s, \hat \pi(s))}\left[ V^\star(s') - V^{\hat \pi}(s') \right] \\
    &\leq \gamma \|V^{\star} - V^{\hat \pi}\|_\infty
\end{aligned}
$$ (1.61)

and thus

$$
\begin{aligned}
    \|V^\star - V^{\hat \pi}\|_\infty &\le 2 \gamma \|v - V^{\star}\|_{\infty} + \gamma \|V^{\star} - V^{\hat \pi}\|_\infty \\
    \|V^\star - V^{\hat \pi}\|_\infty &\le \frac{2 \gamma \|v - V^{\star}\|_{\infty}}{1-\gamma}.
\end{aligned}
$$ (1.62)

So in order to compensate and achieve $\|V^{\hat \pi} - V^{\star}\| \le \epsilon$, we must have

$$
\|v^{(T)} - V^\star\|_{\infty} \le \frac{1-\gamma}{2 \gamma} \epsilon.
$$ (1.63)

This means, using Remark 1.2, we need to run value iteration for

$$
T = O\left( \frac{1}{1-\gamma} \log\left(\frac{\gamma}{\epsilon (1-\gamma)^2}\right) \right)
$$ (1.64)

iterations to achieve an ε-accurate estimate of the optimal value function.

#### Policy iteration

Can we mitigate this "greedy worsening"? What if instead of approximating the optimal value function and then acting greedily by it at the very end, we iteratively improve the policy and value function *together*? This is the idea behind **policy iteration**. In each step, we simply set the policy to act greedily with respect to its own value function.

```python
def policy_iteration(mdp: MDP, ε=1e-6) -> Float[Array, "S A"]:
    """Iteratively improve the policy and value function."""
    def op(pi):
        return v_to_greedy(mdp, eval_deterministic_infinite(mdp, pi))
    π_init = jnp.ones((mdp.S, mdp.A)) / mdp.A  # uniform random policy
    return loop_until_convergence(op, π_init, ε)
```

```python
policy_iteration(tidy_mdp_inf)
```

```
Array([[1., 0.],
       [0., 1.]], dtype=float32)
```
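As a quick consistency check (a sketch, not from the original notebook; it assumes the `eval_deterministic_infinite` helper defined earlier in the chapter), we would expect the value of the policy returned by policy iteration to agree with the output of value iteration on the same MDP, up to the solver tolerance:

```python
π_pi = policy_iteration(tidy_mdp_inf)
v_pi = eval_deterministic_infinite(tidy_mdp_inf, π_pi)
jnp.allclose(v_pi, value_iteration(tidy_mdp_inf), atol=1e-3)  # expected to be True
```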
This will give us the same runtime bound as value iteration and iterative policy evaluation for an ","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"wFIMoYgtK7"},{"type":"text","value":"ε","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"Jh0MVz0PpB"},{"type":"text","value":"-optimal value function ","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"IJAJhHsCVm"},{"type":"crossReference","kind":"proof:remark","identifier":"iterations_vi","label":"iterations_vi","children":[{"type":"text","value":"Remark ","key":"JbOxAqOryA"},{"type":"text","value":"1.2","key":"Wlfk3Eqep1"}],"template":"Remark %s","enumerator":"1.2","resolved":true,"html_id":"iterations-vi","key":"E3wLw0J8cz"},{"type":"text","value":", although in practice, PI often converges much faster.","position":{"start":{"line":1448,"column":1},"end":{"line":1448,"column":1}},"key":"tO3PIS0NPq"}],"key":"sIuCzu9qvv"},{"type":"proof","kind":"theorem","label":"pi_iter_analysis","identifier":"pi_iter_analysis","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policy Iteration runtime and convergence","position":{"start":{"line":1450,"column":1},"end":{"line":1450,"column":1}},"key":"LaQM5SHabT"}],"key":"S9x37Sq33Q"},{"type":"paragraph","position":{"start":{"line":1453,"column":1},"end":{"line":1454,"column":1}},"children":[{"type":"text","value":"We aim to show that the number of iterations required for an\n","position":{"start":{"line":1453,"column":1},"end":{"line":1453,"column":1}},"key":"Cughv7EYbr"},{"type":"text","value":"ε","position":{"start":{"line":1453,"column":1},"end":{"line":1453,"column":1}},"key":"zJX9xBcvs0"},{"type":"text","value":"-accurate estimate of the optimal value function is","position":{"start":{"line":1453,"column":1},"end":{"line":1453,"column":1}},"key":"B1Xys9Qxp4"}],"key":"sawXfSll3X"},{"type":"math","value":"T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).","position":{"start":{"line":1456,"column":1},"end":{"line":1456,"column":1}},"html":"T=O(11γlog(1ϵ(1γ))).T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).T=O(1γ1log(ϵ(1γ)1)).","enumerator":"1.65","key":"eKNZcwaqbc"},{"type":"paragraph","position":{"start":{"line":1458,"column":1},"end":{"line":1458,"column":1}},"children":[{"type":"text","value":"This bound follows from the contraction property ","position":{"start":{"line":1458,"column":1},"end":{"line":1458,"column":1}},"key":"IfpIJvOgP4"},{"type":"crossReference","kind":"equation","identifier":"bellman_convergence","label":"bellman_convergence","children":[{"type":"text","value":"(","key":"XRBWSkEhMm"},{"type":"text","value":"1.38","key":"LZYAZfElTp"},{"type":"text","value":")","key":"piAQflLbQW"}],"template":"(%s)","enumerator":"1.38","resolved":true,"html_id":"bellman-convergence","key":"ZYcmR8uvtr"},{"type":"text","value":":","position":{"start":{"line":1458,"column":1},"end":{"line":1458,"column":1}},"key":"yH51fsPfi3"}],"key":"FWABO1E4t3"},{"type":"math","value":"\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star \\|_{\\infty}.","position":{"start":{"line":1460,"column":1},"end":{"line":1460,"column":1}},"html":"Vπt+1VγVπtV.\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star 
\\|_{\\infty}.Vπt+1VγVπtV.","enumerator":"1.66","key":"CEUKvfxrE9"},{"type":"paragraph","position":{"start":{"line":1462,"column":1},"end":{"line":1463,"column":1}},"children":[{"type":"text","value":"We’ll prove that the iterates of PI respect the contraction property by\nshowing that the policies improve monotonically:","position":{"start":{"line":1462,"column":1},"end":{"line":1462,"column":1}},"key":"TrmLg5jqbg"}],"key":"lZIQhyOj0r"},{"type":"math","value":"V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s).","position":{"start":{"line":1465,"column":1},"end":{"line":1465,"column":1}},"html":"Vπt+1(s)Vπt(s).V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s).Vπt+1(s)Vπt(s).","enumerator":"1.67","key":"bn1MWev8xP"},{"type":"paragraph","position":{"start":{"line":1467,"column":1},"end":{"line":1468,"column":1}},"children":[{"type":"text","value":"Then we’ll use this to show\n","position":{"start":{"line":1467,"column":1},"end":{"line":1467,"column":1}},"key":"BZnClAPhMY"},{"type":"inlineMath","value":"V^{\\pi^{t+1}}(s) \\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)","position":{"start":{"line":1467,"column":1},"end":{"line":1467,"column":1}},"html":"Vπt+1(s)[J(Vπt)](s)V^{\\pi^{t+1}}(s) \\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)Vπt+1(s)[J(Vπt)](s)","key":"ak7z9AaZVX"},{"type":"text","value":". Note that","position":{"start":{"line":1467,"column":1},"end":{"line":1467,"column":1}},"key":"jBcCo1MEg6"}],"key":"me8FJeDANj"},{"type":"math","value":"\\begin{aligned}\n(s) &= \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^{\\pi^{t}}(s') \\right] \\\\\n &= r(s, \\pi^{t+1}(s)) + \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} V^{\\pi^{t}}(s')\n\\end{aligned}","position":{"start":{"line":1470,"column":1},"end":{"line":1475,"column":1}},"html":"(s)=maxa[r(s,a)+γEsP(s,a)Vπt(s)]=r(s,πt+1(s))+γEsP(s,πt+1(s))Vπt(s)\\begin{aligned}\n(s) &= \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^{\\pi^{t}}(s') \\right] \\\\\n &= r(s, \\pi^{t+1}(s)) + \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} V^{\\pi^{t}}(s')\n\\end{aligned}(s)=amax[r(s,a)+γEsP(s,a)Vπt(s)]=r(s,πt+1(s))+γEsP(s,πt+1(s))Vπt(s)","enumerator":"1.68","key":"sK6m26dt6u"},{"type":"paragraph","position":{"start":{"line":1477,"column":1},"end":{"line":1478,"column":1}},"children":[{"type":"text","value":"Since\n","position":{"start":{"line":1477,"column":1},"end":{"line":1477,"column":1}},"key":"F289oALmn2"},{"type":"inlineMath","value":"[\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s) \\ge V^{\\pi^{t}}(s)","position":{"start":{"line":1477,"column":1},"end":{"line":1477,"column":1}},"html":"[J(Vπt)](s)Vπt(s)[\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s) \\ge V^{\\pi^{t}}(s)[J(Vπt)](s)Vπt(s)","key":"rbeXx3vrpW"},{"type":"text","value":", we then have","position":{"start":{"line":1477,"column":1},"end":{"line":1477,"column":1}},"key":"TkCuSEAFpu"}],"key":"B0CpGP7HL7"},{"type":"math","value":"\\begin{aligned}\n V^{\\pi^{t+1}}(s) - V^{\\pi^{t}}(s) &\\ge V^{\\pi^{t+1}}(s) - \\mathcal{J}^{\\star} (V^{\\pi^{t}})(s) \\\\\n &= \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right].\n\\end{aligned}","label":"pi_iter_proof","identifier":"pi_iter_proof","html":"Vπt+1(s)Vπt(s)Vπt+1(s)J(Vπt)(s)=γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)].\\begin{aligned}\n V^{\\pi^{t+1}}(s) - V^{\\pi^{t}}(s) &\\ge V^{\\pi^{t+1}}(s) - \\mathcal{J}^{\\star} (V^{\\pi^{t}})(s) \\\\\n &= \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') 
\\right].\n\\end{aligned}Vπt+1(s)Vπt(s)Vπt+1(s)J(Vπt)(s)=γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)].","enumerator":"1.69","html_id":"pi-iter-proof","key":"fERdIxiUdN"},{"type":"paragraph","position":{"start":{"line":1489,"column":1},"end":{"line":1492,"column":1}},"children":[{"type":"text","value":"But note that the\nexpression being averaged is the same as the expression on the l.h.s.\nwith ","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"key":"J2M2es2L6Y"},{"type":"inlineMath","value":"s","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"html":"sss","key":"tkszrBvlQK"},{"type":"text","value":" replaced by ","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"key":"hMrMlHXjEo"},{"type":"inlineMath","value":"s'","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"html":"ss's","key":"YoGIVtkHNT"},{"type":"text","value":". So we can apply the same inequality\nrecursively to get","position":{"start":{"line":1489,"column":1},"end":{"line":1489,"column":1}},"key":"flC4JQIktV"}],"key":"sVHa1NYscY"},{"type":"math","value":"\\begin{aligned}\n V^{\\pi^{t+1}}(s) - V^{\\pi^{t}}(s) &\\ge \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge \\gamma^2 \\E_{\\substack{s' \\sim P(s, \\pi^{t+1}(s)) \\\\ s'' \\sim P(s', \\pi^{t+1}(s'))}} \\left[V^{\\pi^{t+1}}(s'') - V^{\\pi^{t}}(s'') \\right]\\\\\n &\\ge \\cdots\n\\end{aligned}","position":{"start":{"line":1494,"column":1},"end":{"line":1500,"column":1}},"html":"Vπt+1(s)Vπt(s)γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)]γ2EsP(s,πt+1(s))sP(s,πt+1(s))[Vπt+1(s)Vπt(s)]\\begin{aligned}\n V^{\\pi^{t+1}}(s) - V^{\\pi^{t}}(s) &\\ge \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge \\gamma^2 \\E_{\\substack{s' \\sim P(s, \\pi^{t+1}(s)) \\\\ s'' \\sim P(s', \\pi^{t+1}(s'))}} \\left[V^{\\pi^{t+1}}(s'') - V^{\\pi^{t}}(s'') \\right]\\\\\n &\\ge \\cdots\n\\end{aligned}Vπt+1(s)Vπt(s)γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)]γ2EsP(s,πt+1(s))s′′P(s,πt+1(s))[Vπt+1(s′′)Vπt(s′′)]","enumerator":"1.70","key":"K6QVSK8zo2"},{"type":"paragraph","position":{"start":{"line":1502,"column":1},"end":{"line":1506,"column":1}},"children":[{"type":"text","value":"which implies that ","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"key":"puU6Do8ql2"},{"type":"inlineMath","value":"V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s)","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"html":"Vπt+1(s)Vπt(s)V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s)Vπt+1(s)Vπt(s)","key":"SOAonUhf4d"},{"type":"text","value":"\nfor all ","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"key":"MDjZmBMwvq"},{"type":"inlineMath","value":"s","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"html":"sss","key":"qHS1wmomcC"},{"type":"text","value":" (since the r.h.s. converges to zero). 
We can then plug this\nback into\n","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"key":"nsFK4uFE9m"},{"type":"crossReference","kind":"equation","identifier":"pi_iter_proof","label":"pi_iter_proof","children":[{"type":"text","value":"(","key":"DhTBChoudr"},{"type":"text","value":"1.69","key":"oirvDhVDfx"},{"type":"text","value":")","key":"ng1aL1iSM7"}],"template":"(%s)","enumerator":"1.69","resolved":true,"html_id":"pi-iter-proof","key":"Ur1h7eAow2"},{"type":"text","value":"\nto get the desired result:","position":{"start":{"line":1502,"column":1},"end":{"line":1502,"column":1}},"key":"syzvlYyaQt"}],"key":"eOp0j3Xq4Y"},{"type":"math","value":"\\begin{aligned}\n V^{\\pi^{t+1}}(s) - \\mathcal{J}^{\\star} (V^{\\pi^{t}})(s) &= \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge 0 \\\\\n V^{\\pi^{t+1}}(s) &\\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)\n\\end{aligned}","position":{"start":{"line":1508,"column":1},"end":{"line":1514,"column":1}},"html":"Vπt+1(s)J(Vπt)(s)=γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)]0Vπt+1(s)[J(Vπt)](s)\\begin{aligned}\n V^{\\pi^{t+1}}(s) - \\mathcal{J}^{\\star} (V^{\\pi^{t}})(s) &= \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge 0 \\\\\n V^{\\pi^{t+1}}(s) &\\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)\n\\end{aligned}Vπt+1(s)J(Vπt)(s)Vπt+1(s)=γEsP(s,πt+1(s))[Vπt+1(s)Vπt(s)]0[J(Vπt)](s)","enumerator":"1.71","key":"e98qyJowdR"},{"type":"paragraph","position":{"start":{"line":1516,"column":1},"end":{"line":1516,"column":1}},"children":[{"type":"text","value":"This means we can now apply the Bellman convergence result ","position":{"start":{"line":1516,"column":1},"end":{"line":1516,"column":1}},"key":"anZZJu8j2s"},{"type":"crossReference","kind":"equation","identifier":"bellman_convergence","label":"bellman_convergence","children":[{"type":"text","value":"(","key":"CWntIUgTDP"},{"type":"text","value":"1.38","key":"PjYH5Av1Qp"},{"type":"text","value":")","key":"YA0qwsM6fz"}],"template":"(%s)","enumerator":"1.38","resolved":true,"html_id":"bellman-convergence","key":"zPGYQ4qfo8"},{"type":"text","value":" to get","position":{"start":{"line":1516,"column":1},"end":{"line":1516,"column":1}},"key":"RExDt3YPFj"}],"key":"OMGWCAwCTf"},{"type":"math","value":"\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\|\\mathcal{J}^{\\star} (V^{\\pi^{t}}) - V^{\\star}\\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star \\|_{\\infty}.","position":{"start":{"line":1518,"column":1},"end":{"line":1518,"column":1}},"html":"Vπt+1VJ(Vπt)VγVπtV.\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\|\\mathcal{J}^{\\star} (V^{\\pi^{t}}) - V^{\\star}\\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star 
\\|_{\\infty}.Vπt+1VJ(Vπt)VγVπtV.","enumerator":"1.72","key":"fjCWosrDG7"}],"enumerator":"1.6","html_id":"pi-iter-analysis","key":"acupTtfGOa"},{"type":"heading","depth":2,"position":{"start":{"line":1521,"column":1},"end":{"line":1521,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":1521,"column":1},"end":{"line":1521,"column":1}},"key":"HiTgvMqiAW"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"1.6","key":"iIhPyeDf8a"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":1523,"column":1},"end":{"line":1555,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":1523,"column":1},"end":{"line":1530,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1523,"column":1},"end":{"line":1529,"column":1}},"children":[{"type":"text","value":"Markov decision processes (MDPs) are a framework for sequential\ndecision making under uncertainty. They consist of a state space\n","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"oMHQURtos4"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"S\\mathcal{S}S","key":"K6D2D2yWr8"},{"type":"text","value":", an action space ","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"Ur4uyeXJ6A"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"A\\mathcal{A}A","key":"RqDww6vffx"},{"type":"text","value":", an initial state distribution\n","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"dR7hrukRJl"},{"type":"inlineMath","value":"\\mu \\in \\Delta(\\mathcal{S})","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"μΔ(S)\\mu \\in \\Delta(\\mathcal{S})μΔ(S)","key":"YfcwVWN0uk"},{"type":"text","value":", a transition function ","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"MpvO0pQLsN"},{"type":"inlineMath","value":"P(s' \\mid s, a)","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"P(ss,a)P(s' \\mid s, a)P(ss,a)","key":"TWsDcF5Tlf"},{"type":"text","value":", and a\nreward function ","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"yQipNMaRNL"},{"type":"inlineMath","value":"r(s, a)","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"r(s,a)r(s, a)r(s,a)","key":"mlmxRLTPny"},{"type":"text","value":". 
They can be finite-horizon (ends after\n","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"AQgeIqICxL"},{"type":"inlineMath","value":"H","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"HHH","key":"AJWfOY71DZ"},{"type":"text","value":" timesteps) or infinite-horizon (where rewards scale by\n","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"q4cTlgdYl7"},{"type":"inlineMath","value":"\\gamma \\in (0, 1)","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"html":"γ(0,1)\\gamma \\in (0, 1)γ(0,1)","key":"NuOyFGETIg"},{"type":"text","value":" at each timestep).","position":{"start":{"line":1523,"column":1},"end":{"line":1523,"column":1}},"key":"RYvU7Rw6YO"}],"key":"YmvrMmNBHx"}],"key":"k1lyU0beFv"},{"type":"listItem","spread":true,"position":{"start":{"line":1531,"column":1},"end":{"line":1535,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1531,"column":1},"end":{"line":1534,"column":1}},"children":[{"type":"text","value":"Our goal is to find a policy ","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"Ls6kcJ2L5V"},{"type":"text","value":"π","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"bRbMVPdY69"},{"type":"text","value":" that maximizes expected total\nreward. Policies can be ","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"c09rKbnihM"},{"type":"strong","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"children":[{"type":"text","value":"deterministic","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"W5OMVi8sig"}],"key":"JkgoqZ3ulE"},{"type":"text","value":" or ","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"VS1bSwR9gy"},{"type":"strong","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"children":[{"type":"text","value":"stochastic","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"baMBhIJAql"}],"key":"v6Q9mUwb3y"},{"type":"text","value":",\n","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"msuxV2RcZE"},{"type":"strong","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"children":[{"type":"text","value":"state-dependent","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"SdN6afErfm"}],"key":"IU5zc5YNbt"},{"type":"text","value":" or ","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"rEbqkEtl3P"},{"type":"strong","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"children":[{"type":"text","value":"history-dependent","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"iysAeJAK4J"}],"key":"tZr8T20NlD"},{"type":"text","value":", ","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"HXJv7Tx9R1"},{"type":"strong","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"children":[{"type":"text","value":"stationary","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"vNA5sJck3k"}],"key":"eXawNTE1E3"},{"type":"text","value":" 
or\n","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"v6TTipqWsq"},{"type":"strong","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"children":[{"type":"text","value":"time-dependent","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"aoBiMc7bkc"}],"key":"Cz7JGeVD1P"},{"type":"text","value":".","position":{"start":{"line":1531,"column":1},"end":{"line":1531,"column":1}},"key":"AF4AyUpkwW"}],"key":"fEph84nQ2e"}],"key":"HHOrQ0yl1c"},{"type":"listItem","spread":true,"position":{"start":{"line":1536,"column":1},"end":{"line":1537,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1536,"column":1},"end":{"line":1536,"column":1}},"children":[{"type":"text","value":"A policy induces a distribution over ","position":{"start":{"line":1536,"column":1},"end":{"line":1536,"column":1}},"key":"JpaaEq3OID"},{"type":"strong","position":{"start":{"line":1536,"column":1},"end":{"line":1536,"column":1}},"children":[{"type":"text","value":"trajectories","position":{"start":{"line":1536,"column":1},"end":{"line":1536,"column":1}},"key":"gR08ouaboC"}],"key":"zPcUn6Z58o"},{"type":"text","value":".","position":{"start":{"line":1536,"column":1},"end":{"line":1536,"column":1}},"key":"XsVtcFdKpo"}],"key":"nj9SudS8pR"}],"key":"JoztHIwldh"},{"type":"listItem","spread":true,"position":{"start":{"line":1538,"column":1},"end":{"line":1545,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1538,"column":1},"end":{"line":1544,"column":1}},"children":[{"type":"text","value":"We can evaluate a policy by computing its ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"xMG1XM5Ono"},{"type":"strong","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"children":[{"type":"text","value":"value function","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"J9g0nnxyVI"}],"key":"MPs9ODGCwz"},{"type":"text","value":"\n","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"sZarxyDxFH"},{"type":"inlineMath","value":"V^\\pi(s)","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"html":"Vπ(s)V^\\pi(s)Vπ(s)","key":"JOwfQ5qAE9"},{"type":"text","value":", which is the expected total reward starting from state\n","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"QYdRvAaUaF"},{"type":"inlineMath","value":"s","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"html":"sss","key":"A5BFl72mRE"},{"type":"text","value":" and following policy ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"c4RjMVz6Dl"},{"type":"text","value":"π","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"VdO62r1ebP"},{"type":"text","value":". 
We can also compute the\n","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"xwQtKE89B2"},{"type":"strong","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"children":[{"type":"text","value":"state-action value function","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"I49DeFE84p"}],"key":"tDrPyap2Lg"},{"type":"text","value":" ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"JkIX0K4GUu"},{"type":"inlineMath","value":"Q^\\pi(s, a)","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"html":"Qπ(s,a)Q^\\pi(s, a)Qπ(s,a)","key":"YwU3g8waBn"},{"type":"text","value":", which is the expected\ntotal reward starting from state ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"hgXe91Cooo"},{"type":"inlineMath","value":"s","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"html":"sss","key":"GRdZsASl40"},{"type":"text","value":", taking action ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"kILCbxBiim"},{"type":"inlineMath","value":"a","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"html":"aaa","key":"LB5SJH02ot"},{"type":"text","value":", and then\nfollowing policy ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"R8UPl5laT2"},{"type":"text","value":"π","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"ILFxuKCjJY"},{"type":"text","value":". In the finite-horizon setting, these also\ndepend on the timestep ","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"AHOQIYYLFk"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"html":"h\\hih","key":"ffyyJOt3mT"},{"type":"text","value":".","position":{"start":{"line":1538,"column":1},"end":{"line":1538,"column":1}},"key":"zZOD7ctGL3"}],"key":"SGHRyAbLyd"}],"key":"HpBjcOhkJ4"},{"type":"listItem","spread":true,"position":{"start":{"line":1546,"column":1},"end":{"line":1550,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1546,"column":1},"end":{"line":1549,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"key":"YzrNnGWJyJ"},{"type":"strong","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"children":[{"type":"text","value":"Bellman consistency equation","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"key":"EO2xqAbAWk"}],"key":"xQWbgJPq7p"},{"type":"text","value":" is an equation that the value\nfunction must satisfy. It can be used to solve for the value\nfunctions exactly. Thinking of the r.h.s. 
of this equation as an\noperator on value functions gives the ","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"key":"qSmwAohlXA"},{"type":"strong","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"children":[{"type":"text","value":"Bellman operator","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"key":"meAJPM3LHP"}],"key":"CZh5Xvb1HM"},{"type":"text","value":".","position":{"start":{"line":1546,"column":1},"end":{"line":1546,"column":1}},"key":"geknOuuN4e"}],"key":"YpozwVdoR8"}],"key":"kyHMUyV98Y"},{"type":"listItem","spread":true,"position":{"start":{"line":1551,"column":1},"end":{"line":1553,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1551,"column":1},"end":{"line":1552,"column":1}},"children":[{"type":"text","value":"In the finite-horizon setting, we can compute the optimal policy\nusing ","position":{"start":{"line":1551,"column":1},"end":{"line":1551,"column":1}},"key":"MRcV7UTBfR"},{"type":"strong","position":{"start":{"line":1551,"column":1},"end":{"line":1551,"column":1}},"children":[{"type":"text","value":"dynamic programming","position":{"start":{"line":1551,"column":1},"end":{"line":1551,"column":1}},"key":"eSt4yxA7wW"}],"key":"n5tagC3fXA"},{"type":"text","value":".","position":{"start":{"line":1551,"column":1},"end":{"line":1551,"column":1}},"key":"d9pNTpagwD"}],"key":"DigLmPe7US"}],"key":"zkTXywXLzP"},{"type":"listItem","spread":true,"position":{"start":{"line":1554,"column":1},"end":{"line":1555,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":1554,"column":1},"end":{"line":1555,"column":1}},"children":[{"type":"text","value":"In the infinite-horizon setting, we can compute the optimal policy\nusing ","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"key":"fF7GBGcqOE"},{"type":"strong","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"children":[{"type":"text","value":"value iteration","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"key":"SoEwz7EniC"}],"key":"YlGoh3ArFZ"},{"type":"text","value":" or ","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"key":"fz1ycg2Xgk"},{"type":"strong","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"children":[{"type":"text","value":"policy iteration","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"key":"ofABOzPcnZ"}],"key":"dJUsVn1q4q"},{"type":"text","value":".","position":{"start":{"line":1554,"column":1},"end":{"line":1554,"column":1}},"key":"Up67IfFoYS"}],"key":"TgEjI88dWn"}],"key":"hPPDvIzksL"}],"key":"vAsPbcukBl"}],"key":"yx4u6IzIhO"}],"key":"CaDQXSmzwH"},"references":{"cite":{"order":[],"data":{}}},"footer":{"navigation":{"prev":{"title":"CS/STAT 184: Introduction to Reinforcement Learning","url":"/","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"2 Linear Quadratic Regulators","url":"/control","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"32c2f6fe9e96648ecf8985a4e80db115d0d6950b01e46976348cc5f4529cd76f","slug":"mdps","location":"/mdps.md","dependencies":[],"frontmatter":{"title":"1 Markov Decision Processes","numbering":{"all":{"enabled":true},"enumerator":{"template":"1.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 
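The summary above mentions solving for optimal policies with dynamic programming (finite horizon) or value iteration (infinite horizon). As a minimal sketch only — not the notebook's own implementation — here is tabular value iteration, assuming arrays shaped like the `MDP` class used in this chapter (`P` of shape `(S, A, S)`, `r` of shape `(S, A)`); the function name and the stopping tolerance `tol` are illustrative choices, not from the source.

```python
import jax.numpy as jnp

def value_iteration(P, r, γ, tol=1e-6):
    """Iterate the Bellman optimality operator on a tabular MDP until convergence.

    P: (S, A, S) transition probabilities, r: (S, A) rewards, γ: discount in (0, 1).
    """
    S, A, _ = P.shape
    v = jnp.zeros(S)                        # initial guess for the optimal value function
    while True:
        q = r + γ * (P @ v)                 # Q(s, a) = r(s, a) + γ Σ_{s'} P(s' | s, a) v(s')
        v_new = q.max(axis=1)               # Bellman optimality operator: maximize over actions
        if jnp.max(jnp.abs(v_new - v)) < tol:
            return v_new, q.argmax(axis=1)  # optimal values and a greedy deterministic policy
        v = v_new
```

This converges because the Bellman optimality operator is a contraction when γ < 1, which is why the infinite-horizon case is handled by iteration rather than by the finite-horizon backward recursion.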
(ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/deterministic_policy-9d0b50d69541007293ead345d987b682.png","thumbnailOptimized":"/build/deterministic_policy-9d0b50d69541007293ead345d987b682.webp","exports":[{"format":"md","filename":"mdps.md","url":"/build/mdps-eb86bf115f025d31fd89a81ae9f29e0d.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"HMMZAPnc9i"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"1.1","key":"I87CP5ko2x"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"The field of RL studies how an agent can learn to make sequential decisions in an interactive environment.\nThis is a very general problem!\nHow can we ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"Ml4hM31hbw"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"formalize","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"QYRBuSP6uk"}],"key":"f0a65sUXxB"},{"type":"text","value":" this task in a way that is both ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"x57qTBu88W"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"sufficiently general","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"lVURPjfQO1"}],"key":"DN6DPYprxG"},{"type":"text","value":" yet also tractable enough for ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"TTruSG65w9"},{"type":"emphasis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"fruitful analysis","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"HYjnRnZNet"}],"key":"AoUT1IxVRN"},{"type":"text","value":"?","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"ydlN9cFLuJ"}],"key":"OR5AHdSDCc"},{"type":"paragraph","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"Let’s consider some examples of sequential decision problems to identify the key common properties we’d like to 
capture:","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"TPY5xL6cm5"}],"key":"VKcYjNJwNb"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":26,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"strong","position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"text","value":"Board games and video games,","position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"key":"qXwweAgzLH"}],"key":"nj7A7p8NYC"},{"type":"text","value":" where a player takes actions in a virtual environment.","position":{"start":{"line":26,"column":1},"end":{"line":26,"column":1}},"key":"uCGw40SXxY"}],"key":"B4wQ8OkCjF"},{"type":"listItem","spread":true,"position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"strong","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"Inventory management,","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"NJ3K51cfHd"}],"key":"GveLvLHW4D"},{"type":"text","value":" where a company must efficiently move resources from producers to consumers.","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"UkLO1ifRp7"}],"key":"SyBVHB31C8"},{"type":"listItem","spread":true,"position":{"start":{"line":28,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"strong","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"children":[{"type":"text","value":"Robotic control","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"das4iieIZv"}],"key":"rfCAoouxsl"},{"type":"text","value":", where a robot can move and interact with the real world to complete some task.","position":{"start":{"line":28,"column":1},"end":{"line":28,"column":1}},"key":"fg9YB8koyE"}],"key":"yyjCXVjuXK"}],"key":"HL8hW1WgZJ"},{"type":"paragraph","position":{"start":{"line":30,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"text","value":"In these environments and many others, the ","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"zsHVoYvjkz"},{"type":"strong","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"E0n8ZvAJGu"}],"key":"reaf7XFSC7"},{"type":"text","value":",\nthe “rules” of the environment,\nonly depend on the ","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"AzpzHbtQer"},{"type":"emphasis","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"text","value":"most recent","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"RNKPtHjk5T"}],"key":"qrQbRoO9bQ"},{"type":"text","value":" state and action (generally speaking).\nFor example, if you want to take a break while playing a game of chess,\nyou could take a picture of the board,\nand later on reset the board to that state and continue playing;\nthe past history of moves doesn’t matter (generally speaking).\nThis is called the 
","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"NQdCc5JPoN"},{"type":"strong","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"text","value":"Markov property.","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"VfeDqBh0I0"}],"key":"X3a9UBndBh"}],"key":"N4LlLgtxPV"},{"type":"proof","kind":"definition","label":"markov","identifier":"markov","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Markov property","position":{"start":{"line":39,"column":1},"end":{"line":39,"column":1}},"key":"WegkxBiGwS"}],"key":"CURoqelgUg"},{"type":"paragraph","position":{"start":{"line":42,"column":1},"end":{"line":44,"column":1}},"children":[{"type":"text","value":"An interactive environment satisfies the ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"mCVs4FS42o"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"Markov property","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"OEnWQPSwWr"}],"key":"a2OIW543PG"},{"type":"text","value":" if the\nprobability of transitioning to a new state only depends on the current\nstate and action:","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"dVUuBl5RCI"}],"key":"wos86ikFdx"},{"type":"math","value":"\\pr(s_{\\hi+1} \\mid s_0, a_0, \\dots, s_\\hi, a_\\hi) = P(s_{\\hi+1} \\mid s_\\hi, a_\\hi)","position":{"start":{"line":46,"column":1},"end":{"line":46,"column":1}},"html":"P(sh+1s0,a0,,sh,ah)=P(sh+1sh,ah)\\pr(s_{\\hi+1} \\mid s_0, a_0, \\dots, s_\\hi, a_\\hi) = P(s_{\\hi+1} \\mid s_\\hi, a_\\hi)P(sh+1s0,a0,,sh,ah)=P(sh+1sh,ah)","enumerator":"1.1","key":"xtfNMhLmDc"},{"type":"paragraph","position":{"start":{"line":48,"column":1},"end":{"line":49,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"dNSavNyJkA"},{"type":"inlineMath","value":"P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S})","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"html":"P:S×A(S)P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S})P:S×A(S)","key":"r82G2sH8aI"},{"type":"text","value":" describes the state transitions.\n(We’ll elaborate on this notation later in the chapter.)","position":{"start":{"line":48,"column":1},"end":{"line":48,"column":1}},"key":"IjUTKOaYzy"}],"key":"zFCW0a8v3p"}],"enumerator":"1.1","html_id":"markov","key":"joPTBELRUn"},{"type":"paragraph","position":{"start":{"line":52,"column":1},"end":{"line":53,"column":1}},"children":[{"type":"text","value":"Environments that satisfy the Markov property are called ","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"HGi6hCrwl3"},{"type":"strong","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"children":[{"type":"text","value":"Markov decision processes","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"oMXiZRd2f8"}],"key":"b3lhNdQjip"},{"type":"text","value":" (MDPs).\nThis chapter will focus on introducing core vocabulary for MDPs that will be useful throughout the 
book.","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"XHmPGmcS5D"}],"key":"yQ4ZvrlERm"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"GWgpAgT86I"}],"key":"MHMYujrfyx"},{"type":"paragraph","position":{"start":{"line":56,"column":1},"end":{"line":58,"column":1}},"children":[{"type":"text","value":"What information might be encoded in the ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"td3mIwKQkf"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"state","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"gaIkTICMGK"}],"key":"x5tzOWlWil"},{"type":"text","value":" for each of the above examples?\nWhat might the valid set of ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"Rn8HxwlFNo"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"actions","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"PglLnvRjnf"}],"key":"GIZG7xhNzZ"},{"type":"text","value":" be?\nDescribe the ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"mEN4wTSrfb"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"bj3Vsny4cr"}],"key":"rfeo5nQYI8"},{"type":"text","value":" heuristically and verify that they satisfy the Markov property.","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"fPvRRaWlpg"}],"key":"VtC3TVj5Mh"}],"key":"bRAVvkMU6s"},{"type":"paragraph","position":{"start":{"line":61,"column":1},"end":{"line":63,"column":1}},"children":[{"type":"text","value":"MDPs are usually classified as ","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"e4sUGQmDCA"},{"type":"strong","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"children":[{"type":"text","value":"finite-horizon","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"IOM82vwguo"}],"key":"iX2aZ6ghqS"},{"type":"text","value":", where the interactions end after some finite number of time steps,\nor ","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"Q0xE3tDIGy"},{"type":"strong","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"children":[{"type":"text","value":"infinite-horizon","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"ySjONCKteX"}],"key":"vQzkv2SBiW"},{"type":"text","value":", where the interactions can continue indefinitely.\nWe’ll begin with the finite-horizon case and discuss the infinite-horizon case in the second half of the chapter.","position":{"start":{"line":61,"column":1},"end":{"line":61,"column":1}},"key":"xC1MjXhoWZ"}],"key":"nozbXPJQai"},{"type":"paragraph","position":{"start":{"line":65,"column":1},"end":{"line":67,"column":1}},"children":[{"type":"text","value":"We’ll describe how to 
","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"wnM2zlHy9T"},{"type":"emphasis","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"evaluate","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"sEj0RJIEEx"}],"key":"elZk35aZx4"},{"type":"text","value":" different strategies, called ","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"JaHXSn4g3N"},{"type":"strong","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"policies,","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"XoeTOU3LyW"}],"key":"lc6NFOX9qR"},{"type":"text","value":" and how to compute (or approximate)\nthe ","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"ha2d1cpwK5"},{"type":"strong","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"P5VGDHEMvy"}],"key":"Fpsl4KJLWH"},{"type":"text","value":" for a given MDP.\nWe’ll introduce the ","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"kV3ebFGTQj"},{"type":"strong","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"children":[{"type":"text","value":"Bellman consistency condition","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"uo2ApP76eW"}],"key":"x95OhEIgcj"},{"type":"text","value":", which allows us to analyze the whole sequence of interactions in terms of individual timesteps.","position":{"start":{"line":65,"column":1},"end":{"line":65,"column":1}},"key":"DmOdpEorVp"}],"key":"XzC4JcoP7z"}],"key":"ar7mEdrLrV"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from utils import NamedTuple, Float, Array, partial, jax, jnp, latexify","key":"Z3f799AorF"},{"type":"output","id":"ofQlDAe87-M3TMAtquc9t","data":[],"key":"IVrYTBFrhq"}],"data":{},"key":"OT2bC174Ch"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"children":[{"type":"text","value":"Finite-horizon MDPs","position":{"start":{"line":73,"column":1},"end":{"line":73,"column":1}},"key":"LNxCyXwjnd"}],"identifier":"finite-horizon-mdps","label":"Finite-horizon MDPs","html_id":"finite-horizon-mdps","implicit":true,"enumerator":"1.2","key":"dfDL2FCVzw"},{"type":"heading","depth":3,"position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"children":[{"type":"text","value":"Definition","position":{"start":{"line":75,"column":1},"end":{"line":75,"column":1}},"key":"paso34s3xj"}],"identifier":"definition","label":"Definition","html_id":"definition","implicit":true,"enumerator":"1.2.1","key":"tkQuDQMUWM"},{"type":"proof","kind":"definition","label":"finite_horizon_mdp","identifier":"finite_horizon_mdp","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Finite-horizon Markov decision process","position":{"start":{"line":77,"column":1},"end":{"line":77,"column":1}},"key":"XoLOw88IF5"}],"key":"ChpVRFJu74"},{"type":"paragraph","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"children":[{"type":"text","value":"The components of a finite-horizon Markov decision process 
are:","position":{"start":{"line":80,"column":1},"end":{"line":80,"column":1}},"key":"EEAYKJQZvf"}],"key":"pc44432Gsy"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":82,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":82,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":82,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"F2hPSivzdo"},{"type":"strong","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"text","value":"state","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"u4GpwuMM3T"}],"key":"Yvidf2TvE9"},{"type":"text","value":" that the agent interacts with. We use ","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"hP0aQWMbTH"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"html":"S\\mathcal{S}S","key":"HqjWrgFLQV"},{"type":"text","value":" to denote\nthe set of possible states, called the ","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"asCZX2aF6L"},{"type":"strong","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"children":[{"type":"text","value":"state space","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"WZsZzjVQVV"}],"key":"AQHv1f4YJ4"},{"type":"text","value":".","position":{"start":{"line":82,"column":1},"end":{"line":82,"column":1}},"key":"aif01OCYJm"}],"key":"siMgKhZnF2"}],"key":"CqWPzj6UF3"},{"type":"listItem","spread":true,"position":{"start":{"line":85,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":85,"column":1},"end":{"line":86,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"SrhXX2bfza"},{"type":"strong","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"children":[{"type":"text","value":"actions","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"kLnutE7UKb"}],"key":"M87Ws7e44I"},{"type":"text","value":" that the agent can take. 
We use ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"WhCGkFfE4m"},{"type":"inlineMath","value":"\\mathcal{A}","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"A\\mathcal{A}A","key":"d4pPnWYmQ1"},{"type":"text","value":" to denote the\nset of possible actions, called the ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"BHaw9N6wCA"},{"type":"strong","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"children":[{"type":"text","value":"action space","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"oG77UAf8WN"}],"key":"NcQ3nXxAYh"},{"type":"text","value":".","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"QIhILkU1KE"}],"key":"tUVX9hpRmM"}],"key":"vfAiCdREs1"},{"type":"listItem","spread":true,"position":{"start":{"line":88,"column":1},"end":{"line":89,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"text","value":"Some ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"lA23BkYE0G"},{"type":"strong","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"children":[{"type":"text","value":"initial state distribution","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"sEfVA61Pw3"}],"key":"cgC07uLZ3a"},{"type":"text","value":" ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"eeq7uaa7ii"},{"type":"inlineMath","value":"\\mu \\in \\triangle(\\mathcal{S})","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"μ(S)\\mu \\in \\triangle(\\mathcal{S})μ(S)","key":"cc3YEEbFgU"},{"type":"text","value":".","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"ahzNoLgvol"}],"key":"VhvUqCKiMM"}],"key":"kPLHTUACpE"},{"type":"listItem","spread":true,"position":{"start":{"line":90,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":90,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"Jp0FWdSweo"},{"type":"strong","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"O17r9AIg7K"}],"key":"rbVLBDUymP"},{"type":"text","value":" (a.k.a. 
","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"pFzbQndFcd"},{"type":"strong","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"dynamics","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"cd60xgOnGp"}],"key":"VzAldWLud4"},{"type":"text","value":")\n","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"hVjf5Btail"},{"type":"inlineMath","value":"P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S})","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"html":"P:S×A(S)P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S})P:S×A(S)","key":"WOywrmaWQv"},{"type":"text","value":" that describe what state the agent\ntransitions to after taking an action.","position":{"start":{"line":90,"column":1},"end":{"line":90,"column":1}},"key":"V7n7u1tGsi"}],"key":"NoPFpZxd1C"}],"key":"NF6I7gACPJ"},{"type":"listItem","spread":true,"position":{"start":{"line":94,"column":1},"end":{"line":98,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":94,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"X18AKXJtpH"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"reward","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"NKejNixCCp"}],"key":"FKWuqY3wr7"},{"type":"text","value":" signal. In this course we’ll take it to be a\ndeterministic function on state-action pairs,\n","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"xrTyv00U5y"},{"type":"inlineMath","value":"r : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"r:S×ARr : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}r:S×AR","key":"egXr2IZZkd"},{"type":"text","value":", but in general many results will\nextend to a ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"XlKu648cbN"},{"type":"emphasis","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"stochastic","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"AqjGxarYK9"}],"key":"m9alEpq3LB"},{"type":"text","value":" reward signal.","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"EA927Hy1Ar"}],"key":"J094RwLSYy"}],"key":"PeE3paxTe3"},{"type":"listItem","spread":true,"position":{"start":{"line":99,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":99,"column":1},"end":{"line":100,"column":1}},"children":[{"type":"text","value":"A time horizon ","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"key":"q75Kso0si3"},{"type":"inlineMath","value":"\\hor \\in \\mathbb{N}","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"html":"HN\\hor \\in \\mathbb{N}HN","key":"kU47K4l9a8"},{"type":"text","value":" that specifies the number of\ninteractions in an 
","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"key":"o0LrF0oWki"},{"type":"strong","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"children":[{"type":"text","value":"episode","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"key":"LG5lR0vhRV"}],"key":"gXqpWFStXP"},{"type":"text","value":".","position":{"start":{"line":99,"column":1},"end":{"line":99,"column":1}},"key":"a4IcuxGcrb"}],"key":"wG1t5BOY1d"}],"key":"lcSBmSWkYe"}],"key":"aFcgRaqItj"},{"type":"paragraph","position":{"start":{"line":102,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"Combined together, these objects specify a finite-horizon Markov\ndecision process:","position":{"start":{"line":102,"column":1},"end":{"line":102,"column":1}},"key":"VivL8zPurT"}],"key":"WDOPuv0XXE"},{"type":"math","value":"M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\hor).","position":{"start":{"line":105,"column":1},"end":{"line":105,"column":1}},"html":"M=(S,A,μ,P,r,H).M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\hor).M=(S,A,μ,P,r,H).","enumerator":"1.2","key":"mHce4r4hKn"},{"type":"paragraph","position":{"start":{"line":107,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"text","value":"When there are ","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"FxUhgFEWIj"},{"type":"strong","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"text","value":"finitely","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"DbtenUueG6"}],"key":"LAMiYK8WV2"},{"type":"text","value":" many states and actions, i.e.\n","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"FvsuKANCju"},{"type":"inlineMath","value":"|\\mathcal{S}|, |\\mathcal{A}| < \\infty","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"html":"S,A<|\\mathcal{S}|, |\\mathcal{A}| < \\inftyS,A<","key":"mgbkfxKYqm"},{"type":"text","value":", we can express\nthe relevant quantities as vectors and matrices (i.e. 
","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"ZNyJHnJE14"},{"type":"emphasis","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"children":[{"type":"text","value":"tables","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"Is1kkazLBK"}],"key":"YQONV6NqEj"},{"type":"text","value":" of\nvalues):","position":{"start":{"line":107,"column":1},"end":{"line":107,"column":1}},"key":"ZywHgorpvX"}],"key":"fpTJcH1C0w"},{"type":"math","value":"\\begin{aligned}\n \\mu &\\in [0, 1]^{|\\mathcal{S}|} &\n P &\\in [0, 1]^{(|\\mathcal{S} \\times \\mathcal{A}|) \\times |\\mathcal{S}|} &\n r &\\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}\n\\end{aligned}","position":{"start":{"line":112,"column":1},"end":{"line":118,"column":1}},"html":"μ[0,1]SP[0,1](S×A)×SrRS×A\\begin{aligned}\n \\mu &\\in [0, 1]^{|\\mathcal{S}|} &\n P &\\in [0, 1]^{(|\\mathcal{S} \\times \\mathcal{A}|) \\times |\\mathcal{S}|} &\n r &\\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}\n\\end{aligned}μ[0,1]SP[0,1](S×A)×SrRS×A","enumerator":"1.3","key":"PmCHFPzuR1"}],"enumerator":"1.2","html_id":"finite-horizon-mdp","key":"MbFms6PQNb"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"EgC4LJRbvn"}],"key":"otGcWsmiG0"},{"type":"paragraph","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"children":[{"type":"text","value":"Verify that the types and shapes provided above make sense!","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"DIcWcB7I1H"}],"key":"w1VO4cHErh"}],"key":"zVkDZkkGGz"}],"key":"LGaFFdoJJK"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class MDP(NamedTuple):\n \"\"\"A description of a Markov decision process with finitely many states and actions.\"\"\"\n S: int # number of states\n A: int # number of actions\n μ: Float[Array, \" S\"]\n P: Float[Array, \"S A S\"] # \"current\" state, \"current\" action, \"next\" state\n r: Float[Array, \"S A\"]\n H: int\n γ: float = 1.0 # discount factor (used later)","key":"JSa1NLh914"},{"type":"output","id":"nuqOgxFba4ze2brHYVaKI","data":[],"key":"crJBYVip60"}],"data":{},"key":"g3cMcTNU2M"},{"type":"block","children":[{"type":"proof","kind":"example","label":"tidy_mdp","identifier":"tidy_mdp","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying MDP","position":{"start":{"line":137,"column":1},"end":{"line":137,"column":1}},"key":"gZ6mqN1nS6"}],"key":"dTwPNBd94v"},{"type":"paragraph","position":{"start":{"line":140,"column":1},"end":{"line":141,"column":1}},"children":[{"type":"text","value":"Let’s consider a simple decision problem throughout this chapter:\nthe task of keeping your room tidy!","position":{"start":{"line":140,"column":1},"end":{"line":140,"column":1}},"key":"beNxWpO5pR"}],"key":"Ruei4HZWDN"},{"type":"paragraph","position":{"start":{"line":143,"column":1},"end":{"line":146,"column":1}},"children":[{"type":"text","value":"Your room has the possible states\n","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"key":"K9Zg1xyLWp"},{"type":"inlineMath","value":"\\mathcal{S} = \\{ \\text{orderly}, \\text{messy} \\}.","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"html":"S={orderly,messy}.\\mathcal{S} = \\{ \\text{orderly}, \\text{messy} 
\\}.S={orderly,messy}.","key":"GuAFnlsaUP"},{"type":"text","value":"\nYou can take either of the actions ","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"key":"mAFX7Ss8sy"},{"type":"inlineMath","value":"\\mathcal{A} = \\{ \\text{ignore}, \\text{tidy} \\}.","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"html":"A={ignore,tidy}.\\mathcal{A} = \\{ \\text{ignore}, \\text{tidy} \\}.A={ignore,tidy}.","key":"jVc1oVtFY8"},{"type":"text","value":"\nThe room starts off orderly.","position":{"start":{"line":143,"column":1},"end":{"line":143,"column":1}},"key":"nwxkJ8xPTw"}],"key":"sTl5sTe8EY"},{"type":"paragraph","position":{"start":{"line":148,"column":1},"end":{"line":150,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"frdxYv2hA0"},{"type":"strong","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"EVn5A602Df"}],"key":"OF0AOoioT3"},{"type":"text","value":" are as follows:\nif you tidy the room, it becomes (or remains) orderly;\nif you ignore the room, it ","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"ETjEr7bi2W"},{"type":"emphasis","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"text","value":"might","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"WmhlqZ2M0r"}],"key":"Dm9qLUcIA3"},{"type":"text","value":" become messy (see table below).","position":{"start":{"line":148,"column":1},"end":{"line":148,"column":1}},"key":"dvG3VO0pvM"}],"key":"c1sdf4I0lY"},{"type":"paragraph","position":{"start":{"line":152,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"text","value":"The ","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"nfzs0DrAfs"},{"type":"strong","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"text","value":"rewards","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"bqoinB0F9R"}],"key":"BN7VrvBxV5"},{"type":"text","value":" are as follows: You get penalized for tidying an orderly room (a waste of time) or ignoring a messy room,\nbut you get rewarded for ignoring an orderly room (since you can enjoy your additional time).\nTidying a messy room is a chore that gives no reward.","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"f13PUAe4CS"}],"key":"X8F6shDuLJ"},{"type":"paragraph","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"children":[{"type":"text","value":"These are summarized in the following table:","position":{"start":{"line":156,"column":1},"end":{"line":156,"column":1}},"key":"oC8WbXma3o"}],"key":"xxYPwoBLgi"},{"type":"math","value":"\\begin{array}{ccccc}\n s & a & P(\\text{orderly} \\mid s, a) & P(\\text{messy} \\mid s, a) & r(s, a) \\\\\n \\text{orderly} & \\text{ignore} & 0.7 & 0.3 & 1 \\\\\n \\text{orderly} & \\text{tidy} & 1 & 0 & -1 \\\\\n \\text{messy} & \\text{ignore} & 0 & 1 & -1 \\\\\n \\text{messy} & \\text{tidy} & 1 & 0 & 0 \\\\\n\\end{array}","position":{"start":{"line":158,"column":1},"end":{"line":164,"column":1}},"html":"saP(orderlys,a)P(messys,a)r(s,a)orderlyignore0.70.31orderlytidy101messyignore011messytidy100\\begin{array}{ccccc}\n s & a & P(\\text{orderly} \\mid s, 
a) & P(\\text{messy} \\mid s, a) & r(s, a) \\\\\n \\text{orderly} & \\text{ignore} & 0.7 & 0.3 & 1 \\\\\n \\text{orderly} & \\text{tidy} & 1 & 0 & -1 \\\\\n \\text{messy} & \\text{ignore} & 0 & 1 & -1 \\\\\n \\text{messy} & \\text{tidy} & 1 & 0 & 0 \\\\\n\\end{array}sorderlyorderlymessymessyaignoretidyignoretidyP(orderlys,a)0.7101P(messys,a)0.3010r(s,a)1110","enumerator":"1.4","key":"tX53g45bDQ"},{"type":"paragraph","position":{"start":{"line":166,"column":1},"end":{"line":167,"column":1}},"children":[{"type":"text","value":"Consider a time horizon of ","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"key":"dqVudILf4L"},{"type":"inlineMath","value":"\\hor = 7","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"html":"H=7\\hor = 7H=7","key":"Wt0E4FQufx"},{"type":"text","value":" days (one interaction per day). Let\n","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"key":"Nv67Ra4Nio"},{"type":"inlineMath","value":"t = 0","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"html":"t=0t = 0t=0","key":"XV28Pfwvfa"},{"type":"text","value":" correspond to Monday and ","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"key":"rW2vbvucgG"},{"type":"inlineMath","value":"t = 6","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"html":"t=6t = 6t=6","key":"uvxt5uHKvy"},{"type":"text","value":" correspond to Sunday.","position":{"start":{"line":166,"column":1},"end":{"line":166,"column":1}},"key":"npPRNw2E3f"}],"key":"dt0bNtyZsh"}],"enumerator":"1.1","html_id":"tidy-mdp","key":"g7N2Ywtu2O"}],"key":"HVrIsmEmBl"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"tidy_mdp = MDP(\n S=2, # 0 = orderly, 1 = messy\n A=2, # 0 = ignore, 1 = tidy\n μ=jnp.array([1.0, 0.0]), # start in orderly state\n P=jnp.array([\n [\n [0.7, 0.3], # orderly, ignore\n [1.0, 0.0], # orderly, tidy\n ],\n [\n [0.0, 1.0], # messy, ignore\n [1.0, 0.0], # messy, tidy\n ],\n ]),\n r=jnp.array([\n [\n 1.0, # orderly, ignore\n -1.0, # orderly, tidy\n ],\n [\n -1.0, # messy, ignore\n 0.0, # messy, tidy\n ]\n ]),\n H=7,\n)","key":"stXV2OG5vz"},{"type":"output","id":"tBa-aRZ9HH7JMBpekXRPi","data":[],"key":"hg24gzwoEw"}],"data":{},"key":"ooHgNKKREj"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":199,"column":1},"end":{"line":199,"column":1}},"children":[{"type":"text","value":"Policies","position":{"start":{"line":199,"column":1},"end":{"line":199,"column":1}},"key":"Aix0MgpcP3"}],"identifier":"policies","label":"Policies","html_id":"policies","implicit":true,"enumerator":"1.2.2","key":"dc0mwll5k3"},{"type":"proof","kind":"definition","label":"policy","identifier":"policy","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policies","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"z9lwoBStgw"}],"key":"LuclEwqftj"},{"type":"paragraph","position":{"start":{"line":204,"column":1},"end":{"line":206,"column":1}},"children":[{"type":"text","value":"A ","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"VLsnwZQiG0"},{"type":"strong","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"children":[{"type":"text","value":"policy","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"naiy1odGtf"}],"key":"jHaLHJjAcF"},{"type":"text","value":" 
","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"pBuxJ3kFmd"},{"type":"text","value":"π","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"u3tgWW8ZID"},{"type":"text","value":" describes the agent’s strategy:\nwhich actions it takes in a given situation.\nA key goal of RL is to find the ","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"WXjBS5vVKt"},{"type":"strong","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"BBy0mUu9I6"}],"key":"wQG0admtNv"},{"type":"text","value":" that maximizes the total reward on average.","position":{"start":{"line":204,"column":1},"end":{"line":204,"column":1}},"key":"grYuqjJz4D"}],"key":"fpW1mcITmX"},{"type":"paragraph","position":{"start":{"line":208,"column":1},"end":{"line":209,"column":1}},"children":[{"type":"text","value":"There are three axes along which policies can vary: their outputs,\ninputs, and time-dependence.","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"St0uKltcep"}],"key":"qJVlxo32a2"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":211,"column":1},"end":{"line":214,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":211,"column":1},"end":{"line":214,"column":1}},"children":[{"type":"strong","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"children":[{"type":"text","value":"Deterministic or stochastic.","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"VuYWudFwa8"}],"key":"tPtp9XGNF7"},{"type":"text","value":" A deterministic policy outputs\nactions while a stochastic policy outputs ","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"aMX5woQNvE"},{"type":"emphasis","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"children":[{"type":"text","value":"distributions","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"z1WSIO9Iyh"}],"key":"gQfn7geJAx"},{"type":"text","value":" over\nactions.","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"dRoxixetcw"}],"key":"R28hFPNnbr"}],"key":"WpIugEvAJt"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/deterministic_policy-9d0b50d69541007293ead345d987b682.png","alt":"A deterministic policy.","align":"center","data":{"altTextIsAutoGenerated":true},"key":"YEJW93MPlR","urlSource":"./shared/deterministic_policy.png","urlOptimized":"/build/deterministic_policy-9d0b50d69541007293ead345d987b682.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":218,"column":1},"end":{"line":218,"column":1}},"children":[{"type":"text","value":"A deterministic policy.","position":{"start":{"line":218,"column":1},"end":{"line":218,"column":1}},"key":"Fk1qFRPjYE"}],"key":"qhQgyCAJOU"}],"key":"upZiRIxdLi"}],"enumerator":"1.1","key":"UwhwftIir8"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/stochastic_policy-bc720a6ff54c4a27f3c7ec4de93b5c0d.png","alt":"A stochastic 
policy.","align":"center","data":{"altTextIsAutoGenerated":true},"key":"HiQK3xVz78","urlSource":"./shared/stochastic_policy.png","urlOptimized":"/build/stochastic_policy-bc720a6ff54c4a27f3c7ec4de93b5c0d.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"children":[{"type":"text","value":"A stochastic policy.","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"key":"t8cESMIVoi"}],"key":"KDifkWX28q"}],"key":"eWqLzpc0LR"}],"enumerator":"1.2","key":"thGFv9ITQ6"},{"type":"list","ordered":true,"start":2,"spread":false,"position":{"start":{"line":227,"column":1},"end":{"line":236,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":227,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":227,"column":1},"end":{"line":231,"column":1}},"children":[{"type":"strong","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"children":[{"type":"text","value":"State-dependent or history-dependent.","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"c9j9vryxLQ"}],"key":"mt8QK2swD3"},{"type":"text","value":" A state-dependent (a.k.a.\n“Markovian”) policy only depends on the current state, while a\nhistory-dependent policy depends on the sequence of past states,\nactions, and rewards. We’ll only consider state-dependent policies\nin this course.","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"dmVSIVgVHn"}],"key":"U7KieomhZW"}],"key":"iEtQGfIOEL"},{"type":"listItem","spread":true,"position":{"start":{"line":233,"column":1},"end":{"line":236,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":233,"column":1},"end":{"line":236,"column":1}},"children":[{"type":"strong","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"children":[{"type":"text","value":"Stationary or time-dependent.","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"dyRXfThWSV"}],"key":"xRlcgVdcyy"},{"type":"text","value":" A stationary (a.k.a. time-homogeneous) policy\nremains the same function at all time steps, while a time-dependent policy can depend on the current timestep.\nFor consistency with states and actions, we will denote the timestep as a subscript,\ni.e. 
","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"qkVA6DhtRZ"},{"type":"inlineMath","value":"\\pi = \\{ \\pi_0, \\dots, \\pi_{\\hor-1} \\}.","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"html":"π={π0,,πH1}.\\pi = \\{ \\pi_0, \\dots, \\pi_{\\hor-1} \\}.π={π0,,πH1}.","key":"U9toHP5rw3"}],"key":"z2iWTAPOtJ"}],"key":"tYGWDWxEpO"}],"key":"Cg9xpTO1gm"}],"enumerator":"1.3","html_id":"policy","key":"BWOEOWOSoR"}],"key":"DXjtGqphhw"},{"type":"block","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":241,"column":1},"end":{"line":244,"column":1}},"children":[{"type":"text","value":"Note that for finite state and action spaces,\nwe can represent a randomized mapping ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"i187HSVlqm"},{"type":"inlineMath","value":"\\mathcal{S} \\to \\Delta(\\mathcal{A})","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"SΔ(A)\\mathcal{S} \\to \\Delta(\\mathcal{A})SΔ(A)","key":"Mlv7q3y1IE"},{"type":"text","value":"\nas a matrix ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"WaLx3wbLtZ"},{"type":"inlineMath","value":"\\pi \\in [0, 1]^{\\mathcal{S} \\times \\mathcal{A}}","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"π[0,1]S×A\\pi \\in [0, 1]^{\\mathcal{S} \\times \\mathcal{A}}π[0,1]S×A","key":"vlC18PuhQT"},{"type":"text","value":" where each row describes\nthe policy’s distribution over actions for the corresponding state.","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"cNAyu8T4kG"}],"key":"kTwMsKvRHF"},{"type":"paragraph","position":{"start":{"line":246,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"A fascinating result is that every finite-horizon MDP has an optimal deterministic time-dependent policy!\nIntuitively, the Markov property implies that the current state contains all the information we need to make the optimal decision.\nWe’ll prove this result constructively later in the chapter.","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"A8ba1nVtFt"}],"key":"CMLXbgU3UM"},{"type":"proof","kind":"example","label":"tidy_policy","identifier":"tidy_policy","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policies for the tidying MDP","position":{"start":{"line":250,"column":1},"end":{"line":250,"column":1}},"key":"zIMCDFmMC6"}],"key":"DBVl98jiCa"},{"type":"paragraph","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"children":[{"type":"text","value":"Here are some possible policies for the tidying MDP ","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"key":"tiBftiar4c"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_mdp","label":"tidy_mdp","children":[{"type":"text","value":"Example ","key":"TwjYr97fLD"},{"type":"text","value":"1.1","key":"jpElPfA9ek"}],"template":"Example 
%s","enumerator":"1.1","resolved":true,"html_id":"tidy-mdp","key":"KKTV0sqr8m"},{"type":"text","value":":","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"key":"xBEMgQurL5"}],"key":"XvJINuKLf1"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":255,"column":1},"end":{"line":261,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":255,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"children":[{"type":"text","value":"Always tidy: ","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"key":"hy2ubpKxON"},{"type":"inlineMath","value":"\\pi(s) = \\text{tidy}","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"html":"π(s)=tidy\\pi(s) = \\text{tidy}π(s)=tidy","key":"f7DBB4vQYe"},{"type":"text","value":".","position":{"start":{"line":255,"column":1},"end":{"line":255,"column":1}},"key":"lunTXHMoQQ"}],"key":"yr3R64LvvU"}],"key":"QqK5K0mDwD"},{"type":"listItem","spread":true,"position":{"start":{"line":257,"column":1},"end":{"line":259,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":257,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"Only tidy on weekends: ","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"iJitA8W5u8"},{"type":"inlineMath","value":"\\pi_\\hi(s) = \\text{tidy}","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"πh(s)=tidy\\pi_\\hi(s) = \\text{tidy}πh(s)=tidy","key":"WXkBfgvOD8"},{"type":"text","value":" if\n","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"X33caMcUTn"},{"type":"inlineMath","value":"\\hi \\in \\{ 5, 6 \\}","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"h{5,6}\\hi \\in \\{ 5, 6 \\}h{5,6}","key":"M8mKBzdWsl"},{"type":"text","value":" and ","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"KgA1YogRRw"},{"type":"inlineMath","value":"\\pi_\\hi(s) = \\text{ignore}","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"πh(s)=ignore\\pi_\\hi(s) = \\text{ignore}πh(s)=ignore","key":"gImFGAkyKB"},{"type":"text","value":" otherwise.","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"wMmApxvOcN"}],"key":"oMd0AqF1cM"}],"key":"ffqEGg5Ydd"},{"type":"listItem","spread":true,"position":{"start":{"line":260,"column":1},"end":{"line":261,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":260,"column":1},"end":{"line":261,"column":1}},"children":[{"type":"text","value":"Only tidy if the room is messy: ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"G3kwUnqwhP"},{"type":"inlineMath","value":"\\pi_\\hi(\\text{messy}) = \\text{tidy}","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"πh(messy)=tidy\\pi_\\hi(\\text{messy}) = \\text{tidy}πh(messy)=tidy","key":"AVd4fv8a64"},{"type":"text","value":"\nand ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"xFtBNcM4YG"},{"type":"inlineMath","value":"\\pi_\\hi(\\text{orderly}) = \\text{ignore}","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"πh(orderly)=ignore\\pi_\\hi(\\text{orderly}) = \\text{ignore}πh(orderly)=ignore","key":"vP2TCOELJx"},{"type":"text","value":" for all 
","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"Z07tIAQto6"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"h\\hih","key":"zXQYd2vXFH"},{"type":"text","value":".","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"BDVsqlN1D8"}],"key":"OfePzGU7kP"}],"key":"IgdAJCP7wn"}],"key":"yroK2cbYYs"}],"enumerator":"1.2","html_id":"tidy-policy","key":"ZDecAykPE1"}],"key":"PupzH4EfhA"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"# arrays of shape (H, S, A) represent time-dependent policies\ntidy_policy_always_tidy = (\n jnp.zeros((7, 2, 2))\n .at[:, :, 1].set(1.0)\n)\ntidy_policy_weekends = (\n jnp.zeros((7, 2, 2))\n .at[5:7, :, 1].set(1.0)\n .at[0:5, :, 0].set(1.0)\n)\ntidy_policy_messy_only = (\n jnp.zeros((7, 2, 2))\n .at[:, 1, 1].set(1.0)\n .at[:, 0, 0].set(1.0)\n)","key":"J1l1yFhyJ4"},{"type":"output","id":"tI9P03rks766a1WToYTMJ","data":[],"key":"blYiX5LMwz"}],"data":{},"key":"KvMMqX6E7Y"},{"type":"block","children":[{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"Ye2RAbJDJu"}],"key":"LaV4V7dm4a"},{"type":"paragraph","position":{"start":{"line":283,"column":1},"end":{"line":285,"column":1}},"children":[{"type":"text","value":"Array objects in Jax are ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"M1f6bG5jYq"},{"type":"strong","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"text","value":"immutable,","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"do2P0uQfas"}],"key":"PXSne477z2"},{"type":"text","value":" that is, they cannot be ","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"NBTzB9eHdD"},{"type":"emphasis","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"text","value":"changed.","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"KlAslD02dC"}],"key":"gWlifvIc5A"},{"type":"text","value":"\nThis might seem inconvenient, but in larger projects,\nimmutability makes code much easier to reason about.","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"key":"WeeFqIo4yf"}],"key":"kNgE9JMdIu"}],"key":"T2BO2iJwkW"}],"key":"o6vlqTaV5i"},{"type":"block","position":{"start":{"line":288,"column":1},"end":{"line":288,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"children":[{"type":"text","value":"Trajectories","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"SeQaw3Zefh"}],"label":"trajectories","identifier":"trajectories","html_id":"trajectories","enumerator":"1.2.3","key":"SYK7MJIP4i"},{"type":"proof","kind":"definition","label":"trajectory","identifier":"trajectory","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Trajectories","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"key":"T5OfcuWhc4"}],"key":"EW0BUYgKKK"},{"type":"paragraph","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"children":[{"type":"text","value":"A sequence of states, actions, and rewards is called a 
","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"Du2szoQ59N"},{"type":"strong","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"children":[{"type":"text","value":"trajectory","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"VIJ2GOq2Q9"}],"key":"zQPQ0CJ65S"},{"type":"text","value":":","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"et0fxZEjHN"}],"key":"sZgT2xqUGq"},{"type":"math","value":"\\tau = (s_0, a_0, r_0, \\dots, s_{H-1}, a_{H-1}, r_{H-1})","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"html":"τ=(s0,a0,r0,,sH1,aH1,rH1)\\tau = (s_0, a_0, r_0, \\dots, s_{H-1}, a_{H-1}, r_{H-1})τ=(s0,a0,r0,,sH1,aH1,rH1)","enumerator":"1.5","key":"wfVOkIFv9E"},{"type":"paragraph","position":{"start":{"line":300,"column":1},"end":{"line":301,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"Uqhl7Cyw5A"},{"type":"inlineMath","value":"r_\\hi = r(s_\\hi, a_\\hi)","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"html":"rh=r(sh,ah)r_\\hi = r(s_\\hi, a_\\hi)rh=r(sh,ah)","key":"B8Tx8OQHuO"},{"type":"text","value":".\n(Note that some sources omit the reward at the final time step. This is a minor detail.)","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"J52HgRJk3d"}],"key":"ND0Dozs9gI"}],"enumerator":"1.4","html_id":"trajectory","key":"Os9bBHJcUs"}],"key":"qUill7lZyi"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"class Transition(NamedTuple):\n \"\"\"A single state-action-reward interaction with the environment.\n\n A trajectory comprises a sequence of transitions.\n \"\"\"\n s: int\n a: int\n r: float","key":"AbDtG3GhTf"},{"type":"output","id":"WnxM-DV1kQdMMk38eLpPB","data":[],"key":"i1ZDN7tzAG"}],"data":{},"key":"YVgntA9w2f"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":315,"column":1},"end":{"line":317,"column":1}},"children":[{"type":"text","value":"Once we’ve chosen a policy,\nwe can sample trajectories by repeatedly choosing actions according to the policy,\ntransitioning according to the state transitions, and observing the rewards.","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"key":"Quk4xLHg5i"}],"key":"XhMB7hdROe"},{"type":"image","url":"/build/trajectory-ea534afbae8ad1151663ff974e306d5e.png","width":"240px","align":"center","key":"vq8kllZmHs","urlSource":"shared/trajectory.png","urlOptimized":"/build/trajectory-ea534afbae8ad1151663ff974e306d5e.webp"},{"type":"paragraph","position":{"start":{"line":324,"column":1},"end":{"line":325,"column":1}},"children":[{"type":"text","value":"That is, a policy induces a distribution ","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"tQMO6wbvF4"},{"type":"inlineMath","value":"\\rho^{\\pi}","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"html":"ρπ\\rho^{\\pi}ρπ","key":"R3VvWdOP3u"},{"type":"text","value":" over trajectories.\n(We assume that ","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"CxULa3h2Rt"},{"type":"text","value":"μ","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"zyVUyfZynR"},{"type":"text","value":" and 
","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"WXYAhqDW7k"},{"type":"inlineMath","value":"P","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"html":"PPP","key":"ZOfDv94R3F"},{"type":"text","value":" are clear from context.)","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"CX7rWpWbsP"}],"key":"YI2aSJG2oq"},{"type":"proof","kind":"example","label":"tidy_traj","identifier":"tidy_traj","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Trajectories in the tidying environment","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"mlUwCk6yAT"}],"key":"tu9JKWGAQg"},{"type":"paragraph","position":{"start":{"line":330,"column":1},"end":{"line":330,"column":1}},"children":[{"type":"text","value":"Here is a possible trajectory for the tidying example:","position":{"start":{"line":330,"column":1},"end":{"line":330,"column":1}},"key":"ajTjapb9vg"}],"key":"lDQbRTqKZZ"},{"type":"container","kind":"table","children":[{"type":"table","position":{"start":{"line":333,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"tableRow","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"inlineMath","value":"\\hi","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"html":"h\\hih","key":"xIFY3OWVal"}],"key":"Znb9HX4Uq1"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"0","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"pSBy8qXAOy"}],"key":"fE2pvBmNsM"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"FuRHpm4cRa"}],"key":"ld77DjeHcr"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"2","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"VIYdrOt2Ef"}],"key":"DxJSuO6Omm"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"3","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"xp2tuYnRMc"}],"key":"IYj1GG82ee"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"4","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"nYWK55WbbI"}],"key":"Q2KedJdk4y"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"5","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"mplRhncoJP"}],"key":"gXlS0oh8eR"},{"type":"tableCell","header":true,"align":"center","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"6","position":{"start":{"line":333,"column":1},"end":{"line":333,"column":1}},"key":"g3xyUZKcua"}],"key":"Ka1O
JBoHAU"}],"key":"d7MuU2y8CW"},{"type":"tableRow","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"inlineMath","value":"s","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"sss","key":"jw36lK5u8W"}],"key":"r1M1UKNMJS"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"CQhitnZbTv"}],"key":"hACVfgFhgA"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"AMRpylFRrx"}],"key":"Y7DDHoESkr"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"nIZ8TvO8wm"}],"key":"viGV1JK7Bz"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"messy","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"aRwgwfLIfQ"}],"key":"BJkWwEEXJM"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"messy","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"hCm3dqB72Q"}],"key":"o3x1MFhKsL"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"ma4gHzNyxK"}],"key":"bn3EdA9jmb"},{"type":"tableCell","align":"center","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"children":[{"type":"text","value":"orderly","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"beOlq0i8jS"}],"key":"s9izaut72n"}],"key":"ICYVNeTx50"},{"type":"tableRow","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"inlineMath","value":"a","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"html":"aaa","key":"XgQvn7extg"}],"key":"xDbV65L4CX"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"tidy","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"zQcXTV530K"}],"key":"qNchgBlmmR"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"T13B6u9aVw"}],"key":"qua0XDw6m6"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"mj2o9T7JPm"}],"key":"IgasX0JreB"}
,{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"bEHgm0QZj6"}],"key":"rKLz7w0FMr"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"tidy","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"drXgwa5aZK"}],"key":"zDLeCE7rqn"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"WrmrO59aPv"}],"key":"hMN2Jf0gTR"},{"type":"tableCell","align":"center","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"ignore","position":{"start":{"line":336,"column":1},"end":{"line":336,"column":1}},"key":"AJCwPFzFRH"}],"key":"f9m6ClWQWm"}],"key":"XeLz1r9Ylc"},{"type":"tableRow","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"inlineMath","value":"r","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"html":"rrr","key":"kWxdLG0Ehq"}],"key":"xaaxtBTrEv"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"-1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"mmFjSWogRx"}],"key":"yVL65ZM6IV"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"NUzmTSerGn"}],"key":"MoVVE5NMtf"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"AFXPIJlJBm"}],"key":"PlM1iGgWpd"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"-1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"hkkT7vsoYM"}],"key":"fiSY4rMmon"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"0","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"nNmhNMnB69"}],"key":"VhiTFTyAsZ"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"r4s1xyFe5K"}],"key":"vYje3kJE0M"},{"type":"tableCell","align":"center","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":337,"column":1},"end":{"line":337,"column":1}},"key":"qYNUmTbbp6"}],"key":"ptHoRVY41R"}],"key":"VB0CztPljX"}],"key":"Q0Zy2yHlin"}],"enumerator":"1.1","key":"YJPXPR0KEp"},{"type":"paragraph","position":{"start":{"line":340,"column":1},"end":{"line":340,"column":1}},"childr
en":[{"type":"text","value":"Could any of the policies in ","position":{"start":{"line":340,"column":1},"end":{"line":340,"column":1}},"key":"QWbYHeEKwF"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_policy","label":"tidy_policy","children":[{"type":"text","value":"Example ","key":"P6cQLqBx8D"},{"type":"text","value":"1.2","key":"TwTcrruyUR"}],"template":"Example %s","enumerator":"1.2","resolved":true,"html_id":"tidy-policy","key":"dFVh7PlAzX"},{"type":"text","value":" have generated this trajectory?","position":{"start":{"line":340,"column":1},"end":{"line":340,"column":1}},"key":"DlTHgl0rMz"}],"key":"bTKKvSXnOf"}],"enumerator":"1.3","html_id":"tidy-traj","key":"Qf8X4iHEUG"},{"type":"paragraph","position":{"start":{"line":343,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"Note that for a state-dependent policy, using the Markov property ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"eO9rZgeZ9f"},{"type":"crossReference","kind":"proof:definition","identifier":"markov","label":"markov","children":[{"type":"text","value":"Definition ","key":"preT6Ym8nr"},{"type":"text","value":"1.1","key":"kRldFd1y8o"}],"template":"Definition %s","enumerator":"1.1","resolved":true,"html_id":"markov","key":"pZINbDl3QO"},{"type":"text","value":",\nwe can write down the likelihood function of this probability distribution in an ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"jhkMDD7RdA"},{"type":"strong","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"text","value":"autoregressive","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"jplWn8WpKf"}],"key":"HVG0ov88MS"},{"type":"text","value":" way (i.e. 
one timestep at a time):","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"aSHkY8WjCZ"}],"key":"H4vzIeRe9C"},{"type":"proof","kind":"definition","label":"autoregressive_trajectories","identifier":"autoregressive_trajectories","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Autoregressive trajectory distribution","position":{"start":{"line":346,"column":1},"end":{"line":346,"column":1}},"key":"fIYSiUtPBC"}],"key":"vxen55Q9pf"},{"type":"math","value":"\\rho^{\\pi}(\\tau) := \\mu(s_0) \\pi_0(a_0 \\mid s_0) P(s_1 \\mid s_0, a_0) \\cdots P(s_{\\hor-1} \\mid s_{\\hor-2}, a_{\\hor-2}) \\pi_{\\hor-1}(a_{\\hor-1} \\mid s_{\\hor-1})","position":{"start":{"line":349,"column":1},"end":{"line":349,"column":1}},"html":"ρπ(τ):=μ(s0)π0(a0s0)P(s1s0,a0)P(sH1sH2,aH2)πH1(aH1sH1)\\rho^{\\pi}(\\tau) := \\mu(s_0) \\pi_0(a_0 \\mid s_0) P(s_1 \\mid s_0, a_0) \\cdots P(s_{\\hor-1} \\mid s_{\\hor-2}, a_{\\hor-2}) \\pi_{\\hor-1}(a_{\\hor-1} \\mid s_{\\hor-1})ρπ(τ):=μ(s0)π0(a0s0)P(s1s0,a0)P(sH1sH2,aH2)πH1(aH1sH1)","enumerator":"1.6","key":"rXo3HZMKxG"}],"enumerator":"1.5","html_id":"autoregressive-trajectories","key":"RpEdBKj89r"}],"key":"dzGt8vvhqD"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def trajectory_log_likelihood(\n mdp: MDP,\n τ: list[Transition],\n π: Float[Array, \"S A\"],\n) -> float:\n \"\"\"Compute the log-likelihood of a trajectory under a given MDP and policy.\"\"\"\n\n # initial distribution and action\n total = jnp.log(mdp.μ[τ[0].s])\n total += jnp.log(π[τ[0].s, τ[0].a])\n\n # remaining state transitions and actions\n for i in range(1, mdp.H):\n total += jnp.log(mdp.P[τ[i - 1].s, τ[i - 1].a, τ[i].s])\n total += jnp.log(π[τ[i].s, τ[i].a])\n\n return total","key":"Fq0hPCJ5lS"},{"type":"output","id":"z5TuTlJajptGpwyuB6qm6","data":[],"key":"qCJzSomnXq"}],"data":{},"key":"qgbv1Wzrqh"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"afUvwE4DR9"}],"key":"yqTnMzDi9l"},{"type":"paragraph","position":{"start":{"line":373,"column":1},"end":{"line":373,"column":1}},"children":[{"type":"text","value":"How would you modify this to include stochastic rewards?","position":{"start":{"line":373,"column":1},"end":{"line":373,"column":1}},"key":"JlOMGgJnVQ"}],"key":"QJWJ2y3EJm"}],"key":"uYSaz8d5jm"},{"type":"paragraph","position":{"start":{"line":376,"column":1},"end":{"line":378,"column":1}},"children":[{"type":"text","value":"For a deterministic policy ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"kSMcfwNXB7"},{"type":"text","value":"π","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"Do7Bz8H8o1"},{"type":"text","value":", we have that ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"AQ2WJPIaLy"},{"type":"inlineMath","value":"\\pi_\\hi(a \\mid s) = \\mathbb{I}[a = \\pi_\\hi(s)]","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"html":"πh(as)=I[a=πh(s)]\\pi_\\hi(a \\mid s) = \\mathbb{I}[a = \\pi_\\hi(s)]πh(as)=I[a=πh(s)]","key":"FHeFsdElvG"},{"type":"text","value":";\nthat is, the probability of taking an action is 
","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"ZGLKw8e42K"},{"type":"text","value":"1","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"vErdQmQZeR"},{"type":"text","value":" if it’s the unique action prescribed by the policy for that state and ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"kSKXkm5mJD"},{"type":"text","value":"0","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"FMmU5lHsw6"},{"type":"text","value":" otherwise.\nIn this case, the only randomness in sampling trajectories comes from the initial state distribution ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"MuMR3yO3Gc"},{"type":"text","value":"μ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"o1zLwd6bhR"},{"type":"text","value":" and the state transitions ","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"uZTXZbmusL"},{"type":"inlineMath","value":"P","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"html":"PPP","key":"wBXAuB4YGs"},{"type":"text","value":".","position":{"start":{"line":376,"column":1},"end":{"line":376,"column":1}},"key":"qP72EpklEl"}],"key":"UUjEAsonfV"}],"key":"Lwc4l45Pfj"},{"type":"block","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":382,"column":1},"end":{"line":382,"column":1}},"children":[{"type":"text","value":"Value functions","position":{"start":{"line":382,"column":1},"end":{"line":382,"column":1}},"key":"z5XfwnRxke"}],"identifier":"value-functions","label":"Value functions","html_id":"value-functions","implicit":true,"enumerator":"1.2.4","key":"PUajcGYuQj"},{"type":"paragraph","position":{"start":{"line":384,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"The main goal of RL is to find a policy that maximizes the expected total\nreward ","position":{"start":{"line":384,"column":1},"end":{"line":384,"column":1}},"key":"omIOgM8pK9"},{"type":"inlineMath","value":"\\E [r_0 + \\cdots + r_{\\hor-1}]","position":{"start":{"line":384,"column":1},"end":{"line":384,"column":1}},"html":"E[r0++rH1]\\E [r_0 + \\cdots + r_{\\hor-1}]E[r0++rH1]","key":"DAy8lULFla"},{"type":"text","value":".","position":{"start":{"line":384,"column":1},"end":{"line":384,"column":1}},"key":"FgEIsuk74Q"}],"key":"XP0JlhlF4A"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"yL194pmh0e"}],"key":"yw5K7klfzv"},{"type":"paragraph","position":{"start":{"line":388,"column":1},"end":{"line":390,"column":1}},"children":[{"type":"text","value":"Note that ","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"CAQYvGAdPl"},{"type":"inlineMath","value":"r_0 + \\cdots + r_{\\hor-1}","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"html":"r0++rH1r_0 + \\cdots + r_{\\hor-1}r0++rH1","key":"HVZuLNCYoW"},{"type":"text","value":" is a random variable.\nWhat sources of randomness does it depend on?\nDescribe the generating process.","position":{"start":{"line":388,"column":1},"end":{"line":388,"column":1}},"key":"von1TElISX"}],"key":"DMeBYBFT8j"}],"key":"wMDoqM0RdK"},{"type":"paragraph","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"children":[{"type":"text","value":"Let’s introduce some notation 
for analyzing this quantity.","position":{"start":{"line":393,"column":1},"end":{"line":393,"column":1}},"key":"LjfDy4DB4C"}],"key":"ZynqFucqIz"},{"type":"paragraph","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"A policy’s ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"EdMbU1D7MU"},{"type":"strong","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"value function","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"HSxuCnYSbj"}],"key":"uTvgCyxh1F"},{"type":"text","value":" at time ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"CQtTyldFAe"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"html":"h\\hih","key":"TCSW7eADzn"},{"type":"text","value":" is its expected remaining reward ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"cLgF3Isdim"},{"type":"emphasis","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"from a given state","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"OypcDGnvik"}],"key":"KkIQ0jiMme"},{"type":"text","value":":","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"yQsn19Xgdo"}],"key":"U54I77AK5R"},{"type":"proof","kind":"definition","label":"value","identifier":"value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Value function","position":{"start":{"line":397,"column":1},"end":{"line":397,"column":1}},"key":"hEH0JsNHi0"}],"key":"h52VVRU2W9"},{"type":"math","value":"V_\\hi^\\pi(s) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s]","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"html":"Vhπ(s):=Eτρπ[rh++rH1sh=s]V_\\hi^\\pi(s) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s]Vhπ(s):=Eτρπ[rh++rH1sh=s]","enumerator":"1.7","key":"zskp4Qx7Om"}],"enumerator":"1.6","html_id":"value","key":"f36vQW6rnm"},{"type":"paragraph","position":{"start":{"line":403,"column":1},"end":{"line":404,"column":1}},"children":[{"type":"text","value":"Similarly, we can define the ","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"GacTcFA3G1"},{"type":"strong","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"action-value function","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"Ls2Ssa7Xe2"}],"key":"a9Q8aXPp90"},{"type":"text","value":" (aka the\n","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"FaGBDWd9vG"},{"type":"strong","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"Q-function","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"ZoF09gHORE"}],"key":"F22PT6863z"},{"type":"text","value":") at time ","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"tao61zx3J6"},{"type":"inlineMath","value":"h","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"html":"hhh","key":"rwf63CPuec"},{"type":"text","value":" as the expected remaining reward 
","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"EQW6DHhNwd"},{"type":"emphasis","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"children":[{"type":"text","value":"from a given state and taking a given action","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"bavoPDdMzY"}],"key":"zboXn80CRi"},{"type":"text","value":":","position":{"start":{"line":403,"column":1},"end":{"line":403,"column":1}},"key":"Pww6FmPZ2A"}],"key":"ckw4gGcjsw"},{"type":"proof","kind":"definition","label":"action_value","identifier":"action_value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Action-value function","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"G8Hamxkv01"}],"key":"BDRAhmGimX"},{"type":"math","value":"Q_\\hi^\\pi(s, a) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s, a_\\hi = a]","position":{"start":{"line":409,"column":1},"end":{"line":409,"column":1}},"html":"Qhπ(s,a):=Eτρπ[rh++rH1sh=s,ah=a]Q_\\hi^\\pi(s, a) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s, a_\\hi = a]Qhπ(s,a):=Eτρπ[rh++rH1sh=s,ah=a]","enumerator":"1.8","key":"cL5UYmQd7I"}],"enumerator":"1.7","html_id":"action-value","key":"dhBAEBr5We"}],"key":"NTjPlLt7F1"},{"type":"block","position":{"start":{"line":412,"column":1},"end":{"line":412,"column":1}},"children":[{"type":"heading","depth":4,"position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"children":[{"type":"text","value":"Relating the value function and action-value function","position":{"start":{"line":414,"column":1},"end":{"line":414,"column":1}},"key":"x5kymkwVcK"}],"identifier":"relating-the-value-function-and-action-value-function","label":"Relating the value function and action-value function","html_id":"relating-the-value-function-and-action-value-function","implicit":true,"enumerator":"1.2.4.1","key":"yXZyE48o6T"},{"type":"paragraph","position":{"start":{"line":416,"column":1},"end":{"line":417,"column":1}},"children":[{"type":"text","value":"Note that the value function is just the expected action-value over\nactions drawn from the policy:","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"rWLk7XdbzK"}],"key":"gFN07JBYHt"},{"type":"math","value":"V_\\hi^\\pi(s) = \\E_{a \\sim \\pi_\\hi(s)} [Q_\\hi^\\pi(s, a)]","position":{"start":{"line":419,"column":1},"end":{"line":419,"column":1}},"html":"Vhπ(s)=Eaπh(s)[Qhπ(s,a)]V_\\hi^\\pi(s) = \\E_{a \\sim \\pi_\\hi(s)} [Q_\\hi^\\pi(s, a)]Vhπ(s)=Eaπh(s)[Qhπ(s,a)]","enumerator":"1.9","key":"p9XqywfagM"}],"key":"TjgnNlcZ0t"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def q_to_v(\n policy: Float[Array, \"S A\"],\n q: Float[Array, \"S A\"],\n) -> Float[Array, \" S\"]:\n \"\"\"\n Compute the value function for a given policy in a known finite MDP\n at a single timestep from its action-value function.\n \"\"\"\n return jnp.average(q, weights=policy, axis=1)","key":"IQN7NpTyuL"},{"type":"output","id":"6kjK0hETYpA1rt-O9VOaY","data":[],"key":"QGFnnTNncK"}],"data":{},"key":"lWSyXfLJ58"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":433,"column":1},"end":{"line":434,"column":1}},"children":[{"type":"text","value":"and the action-value is the sum of the immediate reward and the expected value of the 
following\nstate:","position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"key":"ZcdWU4gl82"}],"key":"UCVDFS3i5S"},{"type":"math","value":"Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\pi(s')]","position":{"start":{"line":436,"column":1},"end":{"line":436,"column":1}},"html":"Qhπ(s,a)=r(s,a)+EsP(s,a)[Vh+1π(s)]Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\pi(s')]Qhπ(s,a)=r(s,a)+EsP(s,a)[Vh+1π(s)]","enumerator":"1.10","key":"BN3RX8grss"}],"key":"GZ9l6AqeIZ"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def v_to_q(\n mdp: MDP,\n v_next: Float[Array, \" S\"],\n) -> Float[Array, \"S A\"]:\n \"\"\"\n Compute the action-value function in a known finite MDP\n at a single timestep from the corresponding value function.\n \"\"\"\n # the discount factor is relevant later\n return mdp.r + mdp.γ * mdp.P @ v_next\n\n\n# convert a list of v functions to a list of q functions\nv_ary_to_q_ary = jax.vmap(v_to_q, in_axes=(None, 0))","key":"KICTntrlRo"},{"type":"output","id":"BXZtM5QEaVHtnoHU_4Sm5","data":[],"key":"DjtfWLzONs"}],"data":{},"key":"RQLOyfCPFU"},{"type":"block","children":[{"type":"heading","depth":4,"position":{"start":{"line":455,"column":1},"end":{"line":455,"column":1}},"children":[{"type":"text","value":"Greedy policies","position":{"start":{"line":455,"column":1},"end":{"line":455,"column":1}},"key":"pCExXiiIM9"}],"identifier":"greedy-policies","label":"Greedy policies","html_id":"greedy-policies","implicit":true,"enumerator":"1.2.4.2","key":"q93IUa56Bj"},{"type":"paragraph","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"For any given ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"E21GOoXT1a"},{"type":"inlineMath","value":"Q \\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"QRS×AQ \\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}QRS×A","key":"cItnWevisC"},{"type":"text","value":", we can define the ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"oHSgXTuUaP"},{"type":"strong","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"greedy policy","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"wPZNLCQnVt"}],"key":"yqetskKfwa"},{"type":"text","value":" ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"GLCw1hQF06"},{"type":"inlineMath","value":"\\hat \\pi_Q","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"π^Q\\hat \\pi_Qπ^Q","key":"qYvytEPR94"},{"type":"text","value":" as the deterministic policy that selects the action with the highest ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"aKh7riyH28"},{"type":"inlineMath","value":"Q","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"html":"QQQ","key":"UerppCMbOv"},{"type":"text","value":"-value at each state:","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"EymJVfFkzh"}],"key":"vPuGVQWRsy"},{"type":"math","value":"\\hat \\pi_Q(s) = \\arg\\max_{a} Q_{sa}","position":{"start":{"line":459,"column":1},"end":{"line":461,"column":1}},"html":"π^Q(s)=argmaxaQsa\\hat \\pi_Q(s) = \\arg\\max_{a} 
Q_{sa}π^Q(s)=argamaxQsa","enumerator":"1.11","key":"FY4I4zlp8L"}],"key":"PyXZUH0aQa"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def q_to_greedy(q: Float[Array, \"S A\"]) -> Float[Array, \"S A\"]:\n \"\"\"\n Get the (deterministic) greedy policy with respect to an action-value function.\n Return the policy as a matrix of shape (S, A) where each row is a one-hot vector.\n \"\"\"\n A = q.shape[1]\n a_ary = jnp.argmax(q, axis=1)\n return jnp.eye(A)[a_ary]\n\n\ndef v_to_greedy(mdp: MDP, v: Float[Array, \" S\"]) -> Float[Array, \"S A\"]:\n \"\"\"Get the (deterministic) greedy policy with respect to a value function.\"\"\"\n return q_to_greedy(v_to_q(mdp, v))","key":"gpyO9kji6n"},{"type":"output","id":"W6_KVgE6oXbtPDkAvtD7Z","data":[],"key":"CyJWxb9ZBI"}],"data":{},"key":"oNJ8eXY2yz"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"children":[{"type":"text","value":"The one-step (Bellman) consistency equation","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"OvTYU7Oepq"}],"identifier":"the-one-step-bellman-consistency-equation","label":"The one-step (Bellman) consistency equation","html_id":"the-one-step-bellman-consistency-equation","implicit":true,"enumerator":"1.2.5","key":"Xy1mcLPuB6"},{"type":"paragraph","position":{"start":{"line":481,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"text","value":"Note that by simply considering the cumulative reward as the sum of the\n","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"l59isUhzFD"},{"type":"emphasis","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"current","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"CcrDk4ZR3H"}],"key":"O75zzNErfQ"},{"type":"text","value":" reward and the ","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"vBd85I6Xbo"},{"type":"emphasis","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"future","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"IwFhkvlDVb"}],"key":"ZZaGqtFqqO"},{"type":"text","value":" cumulative reward, we can describe the\nvalue function recursively (in terms of itself). 
This is named the\n","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"XLMWxs8vWV"},{"type":"strong","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"Bellman consistency equation","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"BHe11QudhX"}],"key":"FNneSls79i"},{"type":"text","value":" after ","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"pVi2jvwXBw"},{"type":"strong","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"Richard Bellman","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"ONFoArBbwS"}],"key":"yVdjFSJlSD"},{"type":"text","value":" (1920--1984),\nwho is credited with introducing dynamic programming in 1953.","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"kNLvXbjdfB"}],"key":"M2lTLufOhX"},{"type":"proof","kind":"theorem","label":"bellman_consistency","identifier":"bellman_consistency","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman consistency equation for the value function","position":{"start":{"line":487,"column":1},"end":{"line":487,"column":1}},"key":"C1WwqU50tl"}],"key":"hQQ1Off8vl"},{"type":"math","value":"V_\\hi^\\pi(s) = \\E_{\\substack{a \\sim \\pi_\\hi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + V_{\\hi+1}^\\pi(s')]","position":{"start":{"line":490,"column":1},"end":{"line":492,"column":1}},"html":"Vhπ(s)=Eaπh(s)sP(s,a)[r(s,a)+Vh+1π(s)]V_\\hi^\\pi(s) = \\E_{\\substack{a \\sim \\pi_\\hi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + V_{\\hi+1}^\\pi(s')]Vhπ(s)=Eaπh(s)sP(s,a)[r(s,a)+Vh+1π(s)]","enumerator":"1.12","key":"BEiHJZiHeD"}],"enumerator":"1.1","html_id":"bellman-consistency","key":"GQI8zCQJ79"}],"key":"mLAAbxssKE"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def check_bellman_consistency_v(\n mdp: MDP,\n policy: Float[Array, \"H S A\"],\n v_ary: Float[Array, \"H S\"],\n) -> bool:\n \"\"\"\n Check that the given (time-dependent) \"value function\"\n satisfies the Bellman consistency equation.\n \"\"\"\n return all(\n jnp.allclose(\n # lhs\n v_ary[h],\n # rhs\n jnp.sum(policy[h] * (mdp.r + mdp.γ * mdp.P @ v_ary[h + 1]), axis=1),\n )\n for h in range(mdp.H - 1)\n )","key":"BS3WtFHdN5"},{"type":"output","id":"Mf1WBnT9q41PaNUD4mWmh","data":[],"key":"SVXXI7e4Ka"}],"data":{},"key":"VmTz16zG8F"},{"type":"block","children":[{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"NSY33hHyhw"}],"key":"CdVssxvYuv"},{"type":"paragraph","position":{"start":{"line":517,"column":1},"end":{"line":518,"column":1}},"children":[{"type":"text","value":"Verify that this equation holds by expanding ","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"xvXIInarFQ"},{"type":"inlineMath","value":"V_\\hi^\\pi(s)","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"html":"Vhπ(s)V_\\hi^\\pi(s)Vhπ(s)","key":"GNPrI8L9Nu"},{"type":"text","value":"\nand 
","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"PeTUqturTg"},{"type":"inlineMath","value":"V_{\\hi+1}^\\pi(s')","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"html":"Vh+1π(s)V_{\\hi+1}^\\pi(s')Vh+1π(s)","key":"nVz3YPdDnU"},{"type":"text","value":".","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"CPIka3XCRV"}],"key":"eVYfjv5nOF"}],"key":"nxCLp4weHW"},{"type":"paragraph","position":{"start":{"line":521,"column":1},"end":{"line":522,"column":1}},"children":[{"type":"text","value":"One can analogously derive the Bellman consistency equation for the\naction-value function:","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"knO1tTTwgp"}],"key":"uRDZGQ8t5y"},{"type":"proof","kind":"theorem","label":"bellman_consistency_action","identifier":"bellman_consistency_action","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman consistency equation for action-values","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"U7LO8BPwDW"}],"key":"GvmlQDDzQp"},{"type":"math","value":"Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi_{\\hi+1}(s')}} [Q_{\\hi+1}^\\pi(s', a')]","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"html":"Qhπ(s,a)=r(s,a)+EsP(s,a)aπh+1(s)[Qh+1π(s,a)]Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi_{\\hi+1}(s')}} [Q_{\\hi+1}^\\pi(s', a')]Qhπ(s,a)=r(s,a)+EsP(s,a)aπh+1(s)[Qh+1π(s,a)]","enumerator":"1.13","key":"BD46I0FXWe"}],"enumerator":"1.2","html_id":"bellman-consistency-action","key":"MSEb3RoGhH"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"Dtegu5EYsF"}],"key":"rDCXut3PVV"},{"type":"paragraph","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"children":[{"type":"text","value":"Write a ","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"key":"OgsjCOlDMc"},{"type":"inlineCode","value":"check_bellman_consistency_q","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"key":"ncrA7VQlUN"},{"type":"text","value":" function for the action-value function.","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"key":"fCBi1jWAPm"}],"key":"zyKH0On1eV"}],"key":"BVVrFlQGGh"},{"type":"proof","kind":"remark","label":"bellman_det","identifier":"bellman_det","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The Bellman consistency equation for deterministic policies","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"key":"r6l86hyf2N"}],"key":"gIwrt7l8Ga"},{"type":"paragraph","position":{"start":{"line":537,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"Note that for deterministic policies, the Bellman consistency equation\nsimplifies to","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"uEyxNLv7I7"}],"key":"SsFL1148Hv"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^\\pi(s) &= r(s, \\pi_\\hi(s)) + \\E_{s' \\sim P(s, \\pi_\\hi(s))} [V_{\\hi+1}^\\pi(s')] \\\\\n Q_\\hi^\\pi(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [Q_{\\hi+1}^\\pi(s', 
\\pi_{\\hi+1}(s'))]\n\\end{aligned}","position":{"start":{"line":540,"column":1},"end":{"line":545,"column":1}},"html":"Vhπ(s)=r(s,πh(s))+EsP(s,πh(s))[Vh+1π(s)]Qhπ(s,a)=r(s,a)+EsP(s,a)[Qh+1π(s,πh+1(s))]\\begin{aligned}\n V_\\hi^\\pi(s) &= r(s, \\pi_\\hi(s)) + \\E_{s' \\sim P(s, \\pi_\\hi(s))} [V_{\\hi+1}^\\pi(s')] \\\\\n Q_\\hi^\\pi(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [Q_{\\hi+1}^\\pi(s', \\pi_{\\hi+1}(s'))]\n\\end{aligned}Vhπ(s)Qhπ(s,a)=r(s,πh(s))+EsP(s,πh(s))[Vh+1π(s)]=r(s,a)+EsP(s,a)[Qh+1π(s,πh+1(s))]","enumerator":"1.14","key":"i5HqFR5nsB"}],"enumerator":"1.1","html_id":"bellman-det","key":"ffMzD3KiTM"}],"key":"WU5ggEUx21"},{"type":"block","position":{"start":{"line":548,"column":1},"end":{"line":548,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"The one-step Bellman operator","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"LcTstW0epl"}],"identifier":"the-one-step-bellman-operator","label":"The one-step Bellman operator","html_id":"the-one-step-bellman-operator","implicit":true,"enumerator":"1.2.6","key":"zM5vf6zw9d"},{"type":"paragraph","position":{"start":{"line":552,"column":1},"end":{"line":554,"column":1}},"children":[{"type":"text","value":"Fix a policy ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"yqtM5xRPFi"},{"type":"text","value":"π","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"GEH4A35894"},{"type":"text","value":". Consider the higher-order operator that takes in a\n“value function” ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"soqfxmVTsD"},{"type":"inlineMath","value":"v : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"html":"v:SRv : \\mathcal{S} \\to \\mathbb{R}v:SR","key":"F7IaMLaC6z"},{"type":"text","value":" and returns the r.h.s. 
of the Bellman\nequation for that “value function”:","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"A4vPGs5JSd"}],"key":"EHmaQRG3Q9"},{"type":"proof","kind":"definition","label":"bellman_operator","identifier":"bellman_operator","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman operator","position":{"start":{"line":556,"column":1},"end":{"line":556,"column":1}},"key":"dLOYtSYXW6"}],"key":"qOIvfKAvW3"},{"type":"math","value":"[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + v(s')].","position":{"start":{"line":559,"column":1},"end":{"line":559,"column":1}},"html":"[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+v(s)].[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + v(s')].[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+v(s)].","enumerator":"1.15","key":"fNpzZDHiQg"},{"type":"paragraph","position":{"start":{"line":561,"column":1},"end":{"line":564,"column":1}},"children":[{"type":"text","value":"This is a crucial tool for reasoning about MDPs.\nIntuitively, it answers the following question:\nif we evaluate the ","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"ybfcqhcDbS"},{"type":"emphasis","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"children":[{"type":"text","value":"next","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"n0FuKf7fTH"}],"key":"pCO4X141Gz"},{"type":"text","value":" state using ","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"wY7W9MFTXO"},{"type":"inlineMath","value":"v","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"html":"vvv","key":"gZO4xW5Gbl"},{"type":"text","value":",\nhow good is the ","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"smeTznRgnJ"},{"type":"emphasis","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"children":[{"type":"text","value":"current","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"BYfaR065lC"}],"key":"AAHRNqlQfm"},{"type":"text","value":" state, according to the given policy?","position":{"start":{"line":561,"column":1},"end":{"line":561,"column":1}},"key":"a06H6wc716"}],"key":"TooS0PPF3d"}],"enumerator":"1.8","html_id":"bellman-operator","key":"pssCJ9AC6h"}],"key":"DUW4dgzAFe"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def bellman_operator_looping(\n mdp: MDP,\n policy: Float[Array, \"S A\"],\n v: Float[Array, \" S\"],\n) -> Float[Array, \" S\"]:\n \"\"\"\n Looping definition of the Bellman operator.\n Concise version is below\n \"\"\"\n v_new = jnp.zeros(mdp.S)\n for s in range(mdp.S):\n for a in range(mdp.A):\n for s_next in range(mdp.S):\n v_new[s] += (\n policy[s, a]\n * mdp.P[s, a, s_next]\n * (mdp.r[s, a] + mdp.γ * v[s_next])\n )\n return v_new","visibility":"hide","key":"MqO0vUQfM6"},{"type":"output","id":"JIf8rNwiENXb2USjhsKkB","data":[],"visibility":"show","key":"PKcfPZaznN"}],"data":{"tags":[]},"visibility":"show","key":"IKMQWrULjk"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"Note that we can concisely implement this using the 
","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"yijMzb3cNe"},{"type":"inlineCode","value":"q_to_v","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"JQFMBGxpfl"},{"type":"text","value":" and ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"fcWji5Bvew"},{"type":"inlineCode","value":"v_to_q","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"ZS0mGNtf5M"},{"type":"text","value":" utilities from above:","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"KxcT0YQrUk"}],"key":"aejidlGp2M"}],"key":"y2RYq7KRwO"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def bellman_operator(\n mdp: MDP,\n policy: Float[Array, \"S A\"],\n v: Float[Array, \" S\"],\n) -> Float[Array, \" S\"]:\n \"\"\"For a known finite MDP, the Bellman operator can be exactly evaluated.\"\"\"\n return q_to_v(policy, v_to_q(mdp, v)) # equivalent\n return jnp.sum(policy * (mdp.r + mdp.γ * mdp.P @ v), axis=1)","key":"YKi8ggHOR3"},{"type":"output","id":"43I8ZpNlogBMxEqPCZOP-","data":[],"key":"CAAPP4vJZb"}],"data":{},"key":"bVtiLmtxM1"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":604,"column":1},"end":{"line":608,"column":1}},"children":[{"type":"text","value":"We’ll call ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"IsTm6NgVqV"},{"type":"inlineMath","value":"\\mathcal{J}^\\pi : \\mathbb{R}^\\mathcal{S} \\to \\mathbb{R}^\\mathcal{S}","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"html":"Jπ:RSRS\\mathcal{J}^\\pi : \\mathbb{R}^\\mathcal{S} \\to \\mathbb{R}^\\mathcal{S}Jπ:RSRS","key":"Yx6N5AcjCA"},{"type":"text","value":" the ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"BDadD1HV9q"},{"type":"strong","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"children":[{"type":"text","value":"Bellman\noperator","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"zjcd7EQ4W7"}],"key":"K421y3sgyQ"},{"type":"text","value":" of ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"KUlEdBz4U5"},{"type":"text","value":"π","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"Nq3vHHueKa"},{"type":"text","value":".\nNote that it’s defined on any “value function” mapping states to real numbers;\n","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"rejOdCgBbq"},{"type":"inlineMath","value":"v","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"html":"vvv","key":"hKZ2TgHx9E"},{"type":"text","value":" doesn’t have to be a well-defined value function for some policy (hence the lowercase notation).\nThe Bellman operator also gives us a concise way to express ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"jhNVw0vOGb"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Theorem ","key":"NyXA81YSyl"},{"type":"text","value":"1.1","key":"iGp3CspnnZ"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"q4TliIYV0s"},{"type":"text","value":" for the value 
function:","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"LiXHXuPgWQ"}],"key":"GHYAJ40QN9"},{"type":"math","value":"V_\\hi^\\pi = \\mathcal{J}^{\\pi}(V_{\\hi+1}^\\pi)","position":{"start":{"line":610,"column":1},"end":{"line":610,"column":1}},"html":"Vhπ=Jπ(Vh+1π)V_\\hi^\\pi = \\mathcal{J}^{\\pi}(V_{\\hi+1}^\\pi)Vhπ=Jπ(Vh+1π)","enumerator":"1.16","key":"stnVyDikT9"},{"type":"paragraph","position":{"start":{"line":612,"column":1},"end":{"line":615,"column":1}},"children":[{"type":"text","value":"Intuitively, the output of the Bellman operator, a new “value function”,\nevaluates states as follows: from a given state, take one action\naccording to ","position":{"start":{"line":612,"column":1},"end":{"line":612,"column":1}},"key":"U3nJjHb2kO"},{"type":"text","value":"π","position":{"start":{"line":612,"column":1},"end":{"line":612,"column":1}},"key":"dkn8N4ll51"},{"type":"text","value":", observe the reward, and then evaluate the next state\nusing the input “value function”.","position":{"start":{"line":612,"column":1},"end":{"line":612,"column":1}},"key":"gtI2xeNiby"}],"key":"EL8iZWNpF0"},{"type":"paragraph","position":{"start":{"line":617,"column":1},"end":{"line":619,"column":1}},"children":[{"type":"text","value":"When we discuss infinite-horizon MDPs, the Bellman operator will turn\nout to be more than just a notational convenience: We’ll use it to\nconstruct algorithms for computing the optimal policy.","position":{"start":{"line":617,"column":1},"end":{"line":617,"column":1}},"key":"W69LKTr1od"}],"key":"oQOkLmmDL3"},{"type":"heading","depth":2,"position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"Solving finite-horizon MDPs","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"zsvETTiP41"}],"label":"finite_horizon_mdps","identifier":"finite_horizon_mdps","html_id":"finite-horizon-mdps-1","enumerator":"1.3","key":"ahWDBG8Mkd"},{"type":"heading","depth":3,"position":{"start":{"line":626,"column":1},"end":{"line":626,"column":1}},"children":[{"type":"text","value":"Policy evaluation in finite-horizon MDPs","position":{"start":{"line":626,"column":1},"end":{"line":626,"column":1}},"key":"T5dHTIJaxQ"}],"label":"eval_dp","identifier":"eval_dp","html_id":"eval-dp","enumerator":"1.3.1","key":"eIO5XWOz95"},{"type":"paragraph","position":{"start":{"line":628,"column":1},"end":{"line":629,"column":1}},"children":[{"type":"text","value":"How can we actually compute the value function of a given policy? 
This\nis the task of ","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"E8lBX497gJ"},{"type":"strong","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"children":[{"type":"text","value":"policy evaluation","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"m1pmaAk1F9"}],"key":"vyKoA1VCew"},{"type":"text","value":".","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"VEOPMAeIR5"}],"key":"FYk13M1caW"},{"type":"proof","kind":"definition","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"DP algorithm to evaluate a policy in a finite-horizon MDP","position":{"start":{"line":631,"column":1},"end":{"line":631,"column":1}},"key":"YfSJqq8ajG"}],"key":"qgYKnEIZpf"},{"type":"paragraph","position":{"start":{"line":633,"column":1},"end":{"line":640,"column":1}},"children":[{"type":"text","value":"The Bellman consistency equation\n","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"ocAJ7vPT9M"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Theorem ","key":"qgw5LnE26O"},{"type":"text","value":"1.1","key":"yTL6Kr80vq"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"BiccqaHGdn"},{"type":"text","value":"\ngives us a convenient algorithm for\nevaluating stationary policies: it expresses the value function at\ntimestep ","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"b9b348zlyt"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"html":"h\\hih","key":"mfGXKCAFOp"},{"type":"text","value":" as a function of the value function at timestep ","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"FysdvJmoDw"},{"type":"inlineMath","value":"\\hi+1","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"html":"h+1\\hi+1h+1","key":"ram2BxzEfj"},{"type":"text","value":". 
This\nmeans we can start at the end of the time horizon, where the value is\nknown, and work backwards in time, using the Bellman consistency\nequation to compute the value function at each time step.","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"LZZlJ5G84p"}],"key":"YcMtmcCLGX"}],"enumerator":"1.9","key":"KL9BKGfYej"}],"key":"dTLZr5ojXe"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def dp_eval_finite(mdp: MDP, policy: Float[Array, \"S A\"]) -> Float[Array, \"H S\"]:\n \"\"\"Evaluate a policy using dynamic programming.\"\"\"\n V_ary = [None] * mdp.H + [jnp.zeros(mdp.S)] # initialize to 0 at end of time horizon\n for h in range(mdp.H - 1, -1, -1):\n V_ary[h] = bellman_operator(mdp, policy[h], V_ary[h + 1])\n return jnp.stack(V_ary[:-1])","key":"wY2po93Lk6"},{"type":"output","id":"1KwAnYAWktjaD3Q093DRI","data":[],"key":"gZ6YMHR9Ze"}],"data":{},"key":"Vp2D5KedDp"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":652,"column":1},"end":{"line":653,"column":1}},"children":[{"type":"text","value":"This runs in time ","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"pNgXmEIqsA"},{"type":"inlineMath","value":"O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"html":"O(HS2A)O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)O(HS2A)","key":"yMJkzCuDjy"},{"type":"text","value":" by counting the\nloops.","position":{"start":{"line":652,"column":1},"end":{"line":652,"column":1}},"key":"QPLHYRJIKL"}],"key":"UdoaCwTdsa"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"lOfaErqh7q"}],"key":"o6KGye7fqY"},{"type":"paragraph","position":{"start":{"line":656,"column":1},"end":{"line":657,"column":1}},"children":[{"type":"text","value":"Do you see where we compute ","position":{"start":{"line":656,"column":1},"end":{"line":656,"column":1}},"key":"GxO7xXEsoS"},{"type":"inlineMath","value":"Q^\\pi_\\hi","position":{"start":{"line":656,"column":1},"end":{"line":656,"column":1}},"html":"QhπQ^\\pi_\\hiQhπ","key":"bH6AD21dpk"},{"type":"text","value":" along the way? Make\nthis step explicit.","position":{"start":{"line":656,"column":1},"end":{"line":656,"column":1}},"key":"QabLMLUV69"}],"key":"W721r952KZ"}],"key":"Lq9SK1lpAR"},{"type":"proof","kind":"example","label":"tidy_eval_finite","identifier":"tidy_eval_finite","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying policy evaluation","position":{"start":{"line":660,"column":1},"end":{"line":660,"column":1}},"key":"VoXTwZhpPG"}],"key":"qgrBVyMjK9"},{"type":"paragraph","position":{"start":{"line":663,"column":1},"end":{"line":667,"column":1}},"children":[{"type":"text","value":"Let’s evaluate the policy from\n","position":{"start":{"line":663,"column":1},"end":{"line":663,"column":1}},"key":"wWlWzg8TZj"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_policy","label":"tidy_policy","children":[{"type":"text","value":"Example ","key":"wDbJFz7MSH"},{"type":"text","value":"1.2","key":"VC6I3vnwml"}],"template":"Example %s","enumerator":"1.2","resolved":true,"html_id":"tidy-policy","key":"hmv7ULEZaA"},{"type":"text","value":" in the tidying MDP\nthat tidies if and only if the room is\nmessy. 
We’ll use the Bellman consistency equation to compute the value\nfunction at each time step.","position":{"start":{"line":663,"column":1},"end":{"line":663,"column":1}},"key":"faOLxEN7KV"}],"key":"j0cy4JZJfD"},{"type":"math","value":"\\begin{aligned}\nV_{H-1}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) \\\\\n&= 1 \\\\\nV_{H-1}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) \\\\\n&= 0 \\\\\nV_{H-2}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-1}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1 + 0.3 \\cdot 0 \\\\\n&= 1.7 \\\\\nV_{H-2}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-1}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 \\\\\nV_{H-3}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-2}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1.7 + 0.3 \\cdot 1 \\\\\n&= 2.49 \\\\\nV_{H-3}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-2}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 1.7\n\\end{aligned}","position":{"start":{"line":669,"column":1},"end":{"line":690,"column":1}},"html":"VH1π(orderly)=r(orderly,ignore)=1VH1π(messy)=r(messy,tidy)=0VH2π(orderly)=r(orderly,ignore)+EsP(orderly,ignore)[VH1π(s)]=1+0.7VH1π(orderly)+0.3VH1π(messy)=1+0.71+0.30=1.7VH2π(messy)=r(messy,tidy)+EsP(messy,tidy)[VH1π(s)]=0+1VH1π(orderly)+0VH1π(messy)=1VH3π(orderly)=r(orderly,ignore)+EsP(orderly,ignore)[VH2π(s)]=1+0.7VH2π(orderly)+0.3VH2π(messy)=1+0.71.7+0.31=2.49VH3π(messy)=r(messy,tidy)+EsP(messy,tidy)[VH2π(s)]=0+1VH2π(orderly)+0VH2π(messy)=1.7\\begin{aligned}\nV_{H-1}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) \\\\\n&= 1 \\\\\nV_{H-1}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) \\\\\n&= 0 \\\\\nV_{H-2}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-1}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1 + 0.3 \\cdot 0 \\\\\n&= 1.7 \\\\\nV_{H-2}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-1}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 \\\\\nV_{H-3}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-2}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1.7 + 0.3 \\cdot 1 \\\\\n&= 2.49 \\\\\nV_{H-3}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-2}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 
1.7\n\\end{aligned}VH1π(orderly)VH1π(messy)VH2π(orderly)VH2π(messy)VH3π(orderly)VH3π(messy)=r(orderly,ignore)=1=r(messy,tidy)=0=r(orderly,ignore)+EsP(orderly,ignore)[VH1π(s)]=1+0.7VH1π(orderly)+0.3VH1π(messy)=1+0.71+0.30=1.7=r(messy,tidy)+EsP(messy,tidy)[VH1π(s)]=0+1VH1π(orderly)+0VH1π(messy)=1=r(orderly,ignore)+EsP(orderly,ignore)[VH2π(s)]=1+0.7VH2π(orderly)+0.3VH2π(messy)=1+0.71.7+0.31=2.49=r(messy,tidy)+EsP(messy,tidy)[VH2π(s)]=0+1VH2π(orderly)+0VH2π(messy)=1.7","enumerator":"1.17","key":"anmWW5kCaC"},{"type":"paragraph","position":{"start":{"line":692,"column":1},"end":{"line":693,"column":1}},"children":[{"type":"text","value":"etc. You may wish to repeat this computation for the\nother policies to get a better sense of this algorithm.","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"key":"RMO3yCzDOr"}],"key":"BEsIdyQoNe"}],"enumerator":"1.4","html_id":"tidy-eval-finite","key":"Vx9PrisGVb"}],"key":"SM17Q73rZ2"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"V_messy = dp_eval_finite(tidy_mdp, tidy_policy_messy_only)\nV_messy","key":"ocH6nL5RxY"},{"type":"output","id":"2hC5rN7Hli4482TolLJ4W","data":[{"output_type":"execute_result","execution_count":14,"metadata":{},"data":{"text/plain":{"content":"Array([[5.5621696, 4.7927704],\n [4.7927704, 4.0241003],\n [4.0241003, 3.253 ],\n [3.253 , 2.49 ],\n [2.49 , 1.7 ],\n [1.7 , 1. ],\n [1. , 0. ]], dtype=float32)","content_type":"text/plain"}}}],"key":"E6mHOua0od"}],"data":{},"key":"ZxkKdY73JH"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"children":[{"type":"text","value":"Optimal policies in finite-horizon MDPs","position":{"start":{"line":702,"column":1},"end":{"line":702,"column":1}},"key":"xV6KLF2zEe"}],"label":"opt_dynamic_programming","identifier":"opt_dynamic_programming","html_id":"opt-dynamic-programming","enumerator":"1.3.2","key":"sFveIkA4Bm"},{"type":"paragraph","position":{"start":{"line":704,"column":1},"end":{"line":705,"column":1}},"children":[{"type":"text","value":"We’ve just seen how to ","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"VqmYUADubs"},{"type":"emphasis","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"children":[{"type":"text","value":"evaluate","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"BupBf41sQC"}],"key":"vv3TUO1esk"},{"type":"text","value":" a given policy. 
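As a quick sanity check on the hand calculation in the tidying example above, the same backward recursion can be reproduced with a few lines of plain Python. This is only a sketch: the rewards (1 for ignoring an orderly room, 0 for tidying a messy one) and the transition probabilities (0.7/0.3 from orderly under "ignore", 1.0/0.0 from messy under "tidy") are read off from the example, and `v_orderly`/`v_messy` are just local names introduced here.

```python
# Backward recursion for the "tidy if and only if messy" policy (sketch).
v_orderly, v_messy = 0.0, 0.0  # value is 0 after the final timestep

for step in range(3):  # compute V at timesteps H-1, H-2, H-3
    v_orderly, v_messy = (
        1 + 0.7 * v_orderly + 0.3 * v_messy,  # orderly: reward 1, then transition
        0 + 1.0 * v_orderly + 0.0 * v_messy,  # messy: reward 0, then transition
    )
    print(f"H-{step + 1}: orderly = {v_orderly:.2f}, messy = {v_messy:.2f}")

# Expected output, matching the example:
# H-1: orderly = 1.00, messy = 0.00
# H-2: orderly = 1.70, messy = 1.00
# H-3: orderly = 2.49, messy = 1.70
```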
But how can we find\nthe ","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"JEvvv4I0Im"},{"type":"strong","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"bBw670tZMD"}],"key":"mFcKLyUrrK"},{"type":"text","value":" for a given environment?","position":{"start":{"line":704,"column":1},"end":{"line":704,"column":1}},"key":"gnKC1QAg7J"}],"key":"DxwT0emTsu"},{"type":"proof","kind":"definition","label":"optimal_policy_finite","identifier":"optimal_policy_finite","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Optimal policies","position":{"start":{"line":707,"column":1},"end":{"line":707,"column":1}},"key":"uZ4ZQODoo0"}],"key":"PNlKRPaDIV"},{"type":"paragraph","position":{"start":{"line":710,"column":1},"end":{"line":712,"column":1}},"children":[{"type":"text","value":"We call a policy optimal, and denote it by ","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"Ai9h2JGSfx"},{"type":"inlineMath","value":"\\pi^\\star","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"html":"π\\pi^\\starπ","key":"y8RqjywED4"},{"type":"text","value":", if it does at\nleast as well as ","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"s8Q7xNOnGY"},{"type":"emphasis","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"yh2sQAyG3l"}],"key":"OlUi79YxkD"},{"type":"text","value":" other policy ","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"MHMFDuIO0B"},{"type":"text","value":"π","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"QLI8YWJXYp"},{"type":"text","value":" (including stochastic and\nhistory-dependent ones) in all situations:","position":{"start":{"line":710,"column":1},"end":{"line":710,"column":1}},"key":"AKUD8mNtpv"}],"key":"As58pmRemH"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^{\\pi^\\star}(s) &= \\E_{\\tau \\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\cdots + r_{H-1} \\mid \\tau_\\hi] \\quad \\forall \\pi, \\tau_\\hi, \\hi \\in [H]\n\\end{aligned}","position":{"start":{"line":714,"column":1},"end":{"line":719,"column":1}},"html":"Vhπ(s)=Eτρπ[rh++rH1sh=s]Eτρπ[rh++rH1τh]π,τh,h[H]\\begin{aligned}\n V_\\hi^{\\pi^\\star}(s) &= \\E_{\\tau \\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\cdots + r_{H-1} \\mid \\tau_\\hi] \\quad \\forall \\pi, \\tau_\\hi, \\hi \\in [H]\n\\end{aligned}Vhπ(s)=Eτρπ[rh++rH1sh=s]Eτρπ[rh++rH1τh]π,τh,h[H]","enumerator":"1.18","key":"VNqCCgU55U"},{"type":"paragraph","position":{"start":{"line":721,"column":1},"end":{"line":723,"column":1}},"children":[{"type":"text","value":"where we condition on the\ntrajectory up to time ","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"key":"m0WgDvBErT"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"html":"h\\hih","key":"Zb0eDFEykh"},{"type":"text","value":", 
denoted\n","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"key":"k01nBza0wm"},{"type":"inlineMath","value":"\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi)","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"html":"τh=(s0,a0,r0,,sh)\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi)τh=(s0,a0,r0,,sh)","key":"bX19QyqT01"},{"type":"text","value":", where ","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"key":"feLNRjNQie"},{"type":"inlineMath","value":"s_\\hi = s","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"html":"sh=ss_\\hi = ssh=s","key":"tPNjBiwu7b"},{"type":"text","value":".","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"key":"QoIJtFhSN5"}],"key":"QVS3gQdQh2"}],"enumerator":"1.10","html_id":"optimal-policy-finite","key":"uSb6n5g67R"},{"type":"paragraph","position":{"start":{"line":726,"column":1},"end":{"line":729,"column":1}},"children":[{"type":"text","value":"Convince yourself that all optimal policies must have the same value\nfunction. We call this the ","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"PK9ttRxLrZ"},{"type":"strong","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"children":[{"type":"text","value":"optimal value function","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"gEvKYLC1Mi"}],"key":"jBtTVx8Bll"},{"type":"text","value":" and denote it by\n","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"ogBY6MBwXF"},{"type":"inlineMath","value":"V_\\hi^\\star(s)","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"html":"Vh(s)V_\\hi^\\star(s)Vh(s)","key":"IEb6O38iaY"},{"type":"text","value":". 
The same goes for the action-value function\n","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"w1YefO2YFG"},{"type":"inlineMath","value":"Q_\\hi^\\star(s, a)","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"html":"Qh(s,a)Q_\\hi^\\star(s, a)Qh(s,a)","key":"XN2Jl5XOif"},{"type":"text","value":".","position":{"start":{"line":726,"column":1},"end":{"line":726,"column":1}},"key":"PHfHn3Z1et"}],"key":"GJbltH3fNI"},{"type":"paragraph","position":{"start":{"line":731,"column":1},"end":{"line":734,"column":1}},"children":[{"type":"text","value":"It is a stunning fact that ","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"tbfY0otqLW"},{"type":"strong","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"children":[{"type":"text","value":"every finite-horizon MDP has an optimal\npolicy that is time-dependent and deterministic.","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"vCEkbAtT0b"}],"key":"X7ixZwDUql"},{"type":"text","value":" In particular, we can\nconstruct such a policy by acting ","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"z1xy4xvoJa"},{"type":"emphasis","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"children":[{"type":"text","value":"greedily","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"ClsSMyUrAl"}],"key":"FqSoXlXTbA"},{"type":"text","value":" with respect to the optimal\naction-value function:","position":{"start":{"line":731,"column":1},"end":{"line":731,"column":1}},"key":"c5o7O8Sj6O"}],"key":"HChvtSrRmA"},{"type":"proof","kind":"theorem","label":"optimal_greedy","identifier":"optimal_greedy","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"It is optimal to be greedy with respect to the optimal value function","position":{"start":{"line":737,"column":1},"end":{"line":737,"column":1}},"key":"JExfdqP1YD"}],"key":"RLB8PNV8A2"},{"type":"math","value":"\\pi_\\hi^\\star(s) = \\arg\\max_a Q_\\hi^\\star(s, a).","position":{"start":{"line":740,"column":1},"end":{"line":740,"column":1}},"html":"πh(s)=argmaxaQh(s,a).\\pi_\\hi^\\star(s) = \\arg\\max_a Q_\\hi^\\star(s, a).πh(s)=argamaxQh(s,a).","enumerator":"1.19","key":"kr42KkdPJi"}],"enumerator":"1.3","html_id":"optimal-greedy","key":"dB4xJldTxb"},{"type":"proof","kind":"proof","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Proof","position":{"start":{"line":743,"column":1},"end":{"line":743,"column":1}},"key":"u1maz8MpJd"}],"key":"HXwb1IQ5zD"},{"type":"paragraph","position":{"start":{"line":744,"column":1},"end":{"line":745,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"key":"BsyJk7ijbE"},{"type":"inlineMath","value":"V^{\\star}","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"html":"VV^{\\star}V","key":"NCc1I6BaBX"},{"type":"text","value":" and ","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"key":"szaEgOKvEH"},{"type":"inlineMath","value":"Q^{\\star}","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"html":"QQ^{\\star}Q","key":"yPXtaohFma"},{"type":"text","value":" denote the optimal value and\naction-value functions. 
Consider the greedy policy","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"key":"JQHVzfj5sI"}],"key":"NnNSYiRk3s"},{"type":"math","value":"\\hat \\pi_\\hi(s) := \\arg\\max_a Q_\\hi^{\\star}(s, a).","position":{"start":{"line":747,"column":1},"end":{"line":747,"column":1}},"html":"π^h(s):=argmaxaQh(s,a).\\hat \\pi_\\hi(s) := \\arg\\max_a Q_\\hi^{\\star}(s, a).π^h(s):=argamaxQh(s,a).","enumerator":"1.20","key":"eHNtT2jXmj"},{"type":"paragraph","position":{"start":{"line":749,"column":1},"end":{"line":750,"column":1}},"children":[{"type":"text","value":"We aim to show that\n","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"key":"H42DseD7E9"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"html":"π^\\hat \\piπ^","key":"cfo2HAbKFP"},{"type":"text","value":" is optimal; that is, ","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"key":"eZwYZRwFfM"},{"type":"inlineMath","value":"V^{\\hat \\pi} = V^{\\star}","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"html":"Vπ^=VV^{\\hat \\pi} = V^{\\star}Vπ^=V","key":"xivUmeAVVs"},{"type":"text","value":".","position":{"start":{"line":749,"column":1},"end":{"line":749,"column":1}},"key":"MVFEKtInoj"}],"key":"rE0uJIJFZt"},{"type":"paragraph","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"children":[{"type":"text","value":"Fix an arbitrary state ","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"vMOddQLBRR"},{"type":"inlineMath","value":"s \\in \\mathcal{S}","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"sSs \\in \\mathcal{S}sS","key":"UE6Rsq0c5W"},{"type":"text","value":" and time ","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"mFXspCrvMd"},{"type":"inlineMath","value":"\\hi \\in [H]","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"html":"h[H]\\hi \\in [H]h[H]","key":"b93zo7JKxA"},{"type":"text","value":".","position":{"start":{"line":752,"column":1},"end":{"line":752,"column":1}},"key":"szrF15OzTW"}],"key":"k8jws7wyDx"},{"type":"paragraph","position":{"start":{"line":754,"column":1},"end":{"line":759,"column":1}},"children":[{"type":"text","value":"Firstly, by the definition of ","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"EeOsSrP9pO"},{"type":"inlineMath","value":"V^{\\star}","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"VV^{\\star}V","key":"wBbJYYuPvg"},{"type":"text","value":", we already know\n","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"lrqickswxM"},{"type":"inlineMath","value":"V_\\hi^{\\star}(s) \\ge V_\\hi^{\\hat \\pi}(s)","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"Vh(s)Vhπ^(s)V_\\hi^{\\star}(s) \\ge V_\\hi^{\\hat \\pi}(s)Vh(s)Vhπ^(s)","key":"ewr2JPeckV"},{"type":"text","value":". So for equality to hold we just\nneed to show that ","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"dHDoZE9YtB"},{"type":"inlineMath","value":"V_\\hi^{\\star}(s) \\le V_\\hi^{\\hat \\pi}(s)","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"Vh(s)Vhπ^(s)V_\\hi^{\\star}(s) \\le V_\\hi^{\\hat \\pi}(s)Vh(s)Vhπ^(s)","key":"XqmbaLnaVJ"},{"type":"text","value":". 
We’ll first\nshow that the Bellman operator ","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"X8Wf7Ladr7"},{"type":"inlineMath","value":"\\mathcal{J}^{\\hat \\pi}","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"Jπ^\\mathcal{J}^{\\hat \\pi}Jπ^","key":"RtVIwxOLbG"},{"type":"text","value":" never decreases\n","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"w8NK3c7qSv"},{"type":"inlineMath","value":"V_\\hi^{\\star}","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"VhV_\\hi^{\\star}Vh","key":"zsNPj1R1f4"},{"type":"text","value":". Then we’ll apply this result recursively to show that\n","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"k43L3ONPp4"},{"type":"inlineMath","value":"V^{\\star} = V^{\\hat \\pi}","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"html":"V=Vπ^V^{\\star} = V^{\\hat \\pi}V=Vπ^","key":"vTOsagy16q"},{"type":"text","value":".","position":{"start":{"line":754,"column":1},"end":{"line":754,"column":1}},"key":"pYxiGtHq60"}],"key":"JB9rtcujCd"},{"type":"proof","kind":"lemma","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The Bellman operator never decreases the optimal value function","position":{"start":{"line":761,"column":1},"end":{"line":761,"column":1}},"key":"S7uv8mHNkH"}],"key":"FDgz2XTZKS"},{"type":"paragraph","position":{"start":{"line":762,"column":1},"end":{"line":763,"column":1}},"children":[{"type":"inlineMath","value":"\\mathcal{J}^{\\hat \\pi}","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"html":"Jπ^\\mathcal{J}^{\\hat \\pi}Jπ^","key":"yiasmxH1ou"},{"type":"text","value":" never decreases ","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"key":"cQZ0H7hXxx"},{"type":"inlineMath","value":"V_\\hi^{\\star}","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"html":"VhV_\\hi^{\\star}Vh","key":"yceUstem5M"},{"type":"text","value":"\n(elementwise):","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"key":"YjoUFXJq2E"}],"key":"XLMWguWak4"},{"type":"math","value":"[\\mathcal{J}^{\\hat \\pi} (V_{\\hi+1}^{\\star})](s) \\ge V_\\hi^{\\star}(s).","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"html":"[Jπ^(Vh+1)](s)Vh(s).[\\mathcal{J}^{\\hat \\pi} (V_{\\hi+1}^{\\star})](s) \\ge V_\\hi^{\\star}(s).[Jπ^(Vh+1)](s)Vh(s).","enumerator":"1.21","key":"tx5VgT78Cs"},{"type":"paragraph","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"children":[{"type":"strong","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"children":[{"type":"text","value":"Proof:","position":{"start":{"line":767,"column":1},"end":{"line":767,"column":1}},"key":"CpJo1bQhBt"}],"key":"SxS7PrPmfd"}],"key":"CJ7cSNHGJd"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^{\\star}(s) &= \\max_{\\pi \\in \\Pi} V_\\hi^{\\pi}(s) \\\\\n &= \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^\\pi(s') \\right] && \\text{Bellman consistency} \\\\\n &\\le \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^{\\star}(s') \\right] && \\text{definition of } V^\\star \\\\\n &= \\max_{a} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} 
V_{\\hi+1}^{\\star}(s') \\right] && \\text{only depends on } \\pi \\text{ via } a \\\\\n &= [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s). \n\\end{aligned}","position":{"start":{"line":769,"column":1},"end":{"line":777,"column":1}},"html":"Vh(s)=maxπΠVhπ(s)=maxπΠEaπ()[r(s,a)+EsP(s,a)Vh+1π(s)]Bellman consistencymaxπΠEaπ()[r(s,a)+EsP(s,a)Vh+1(s)]definition of V=maxa[r(s,a)+EsP(s,a)Vh+1(s)]only depends on π via a=[Jπ^(Vh+1)](s).\\begin{aligned}\n V_\\hi^{\\star}(s) &= \\max_{\\pi \\in \\Pi} V_\\hi^{\\pi}(s) \\\\\n &= \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^\\pi(s') \\right] && \\text{Bellman consistency} \\\\\n &\\le \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^{\\star}(s') \\right] && \\text{definition of } V^\\star \\\\\n &= \\max_{a} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^{\\star}(s') \\right] && \\text{only depends on } \\pi \\text{ via } a \\\\\n &= [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s). \n\\end{aligned}Vh(s)=πΠmaxVhπ(s)=πΠmaxEaπ()[r(s,a)+EsP(s,a)Vh+1π(s)]πΠmaxEaπ()[r(s,a)+EsP(s,a)Vh+1(s)]=amax[r(s,a)+EsP(s,a)Vh+1(s)]=[Jπ^(Vh+1)](s).Bellman consistencydefinition of Vonly depends on π via a","enumerator":"1.22","key":"aazkxBtXAw"},{"type":"paragraph","position":{"start":{"line":779,"column":1},"end":{"line":781,"column":1}},"children":[{"type":"text","value":"Note that the chosen action ","position":{"start":{"line":779,"column":1},"end":{"line":779,"column":1}},"key":"P1L9q90Q8T"},{"type":"inlineMath","value":"a \\sim \\pi(\\dots)","position":{"start":{"line":779,"column":1},"end":{"line":779,"column":1}},"html":"aπ()a \\sim \\pi(\\dots)aπ()","key":"WwX6CRlHdj"},{"type":"text","value":" above\nmight depend on the past history; this isn’t shown in the notation and\ndoesn’t affect our result (make sure you see why).","position":{"start":{"line":779,"column":1},"end":{"line":779,"column":1}},"key":"xqc1yhG6IU"}],"key":"EWW4yuJ9Yz"}],"enumerator":"1.1","key":"eFvD1BCyKy"},{"type":"paragraph","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"children":[{"type":"text","value":"We can now apply this result recursively to get","position":{"start":{"line":784,"column":1},"end":{"line":784,"column":1}},"key":"kIF5IKfbU7"}],"key":"PUdaAdM3zu"},{"type":"math","value":"V^{\\star}_t(s) \\le V^{\\hat \\pi}_t(s)","position":{"start":{"line":786,"column":1},"end":{"line":786,"column":1}},"html":"Vt(s)Vtπ^(s)V^{\\star}_t(s) \\le V^{\\hat \\pi}_t(s)Vt(s)Vtπ^(s)","enumerator":"1.23","key":"yVD1XD222D"},{"type":"paragraph","position":{"start":{"line":788,"column":1},"end":{"line":790,"column":1}},"children":[{"type":"text","value":"as follows. 
(Note that even\nthough ","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"key":"IV3STzls7S"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"html":"π^\\hat \\piπ^","key":"glqp49xDtB"},{"type":"text","value":" is deterministic, we’ll use the ","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"key":"UNmHgVcGFu"},{"type":"inlineMath","value":"a \\sim \\hat \\pi(s)","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"html":"aπ^(s)a \\sim \\hat \\pi(s)aπ^(s)","key":"Wb9yD6gAlJ"},{"type":"text","value":"\nnotation to make it explicit that we’re sampling a trajectory from it.)","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"key":"voQwD7mLem"}],"key":"y1mc3iIRzS"},{"type":"math","value":"\\begin{aligned}\n V_{t}^{\\star}(s) &\\le [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s) \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue} V_{\\hi+1}^{\\star}(s')} \\right] \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue}[ \\mathcal{J}^{\\hat \\pi} (V_{t+2}^{\\star})] (s')} \\right] \\right] && \\text{above lemma} \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}{\\color{blue} \\left[ \\mathop{\\mathbb{E}}_{a' \\sim \\hat \\pi} r(s', a') + \\mathop{\\mathbb{E}}_{s''} V_{t+2}^{\\star}(s'') \\right]} \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\cdots && \\text{apply at all timesteps} \\\\\n &= \\mathop{\\mathbb{E}}_{\\tau \\sim \\rho^{\\hat \\pi}} [G_{t} \\mid s_\\hi = s] && \\text{rewrite expectation} \\\\\n &= V_{t}^{\\hat \\pi}(s) && \\text{definition}\n\\end{aligned}","position":{"start":{"line":792,"column":1},"end":{"line":802,"column":1}},"html":"Vt(s)[Jπ^(Vh+1)](s)=Eaπ^(s)[r(s,a)+EsP(s,a)[Vh+1(s)]]definition of Jπ^Eaπ^(s)[r(s,a)+EsP(s,a)[[Jπ^(Vt+2)](s)]]above lemma=Eaπ^(s)[r(s,a)+EsP(s,a)[Eaπ^r(s,a)+EsVt+2(s)]]definition of Jπ^apply at all timesteps=Eτρπ^[Gtsh=s]rewrite expectation=Vtπ^(s)definition\\begin{aligned}\n V_{t}^{\\star}(s) &\\le [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s) \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue} V_{\\hi+1}^{\\star}(s')} \\right] \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue}[ \\mathcal{J}^{\\hat \\pi} (V_{t+2}^{\\star})] (s')} \\right] \\right] && \\text{above lemma} \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}{\\color{blue} \\left[ \\mathop{\\mathbb{E}}_{a' \\sim \\hat \\pi} r(s', a') + \\mathop{\\mathbb{E}}_{s''} V_{t+2}^{\\star}(s'') \\right]} \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\cdots && \\text{apply at all timesteps} \\\\\n &= \\mathop{\\mathbb{E}}_{\\tau \\sim \\rho^{\\hat \\pi}} [G_{t} \\mid s_\\hi = s] && \\text{rewrite expectation} \\\\\n &= V_{t}^{\\hat \\pi}(s) && 
\\text{definition}\n\\end{aligned}Vt(s)[Jπ^(Vh+1)](s)=Eaπ^(s)[r(s,a)+EsP(s,a)[Vh+1(s)]]Eaπ^(s)[r(s,a)+EsP(s,a)[[Jπ^(Vt+2)](s)]]=Eaπ^(s)[r(s,a)+EsP(s,a)[Eaπ^r(s,a)+Es′′Vt+2(s′′)]]=Eτρπ^[Gtsh=s]=Vtπ^(s)definition of Jπ^above lemmadefinition of Jπ^apply at all timestepsrewrite expectationdefinition","enumerator":"1.24","key":"RJZDopUIiY"},{"type":"paragraph","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"children":[{"type":"text","value":"And so we have ","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"key":"itvjl1Vj5G"},{"type":"inlineMath","value":"V^{\\star} = V^{\\hat \\pi}","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"html":"V=Vπ^V^{\\star} = V^{\\hat \\pi}V=Vπ^","key":"TqLXxAKCUX"},{"type":"text","value":", making ","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"key":"ze9WYc5m0U"},{"type":"inlineMath","value":"\\hat \\pi","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"html":"π^\\hat \\piπ^","key":"oYGOql9Kyu"},{"type":"text","value":" optimal.","position":{"start":{"line":804,"column":1},"end":{"line":804,"column":1}},"key":"yw1fU9msmh"}],"key":"gtAqoFdugi"}],"enumerator":"1.1","key":"cX0tvJ9m2D"},{"type":"paragraph","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"children":[{"type":"text","value":"Note that this also gives simplified forms of the ","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"key":"sIJf1uSY4S"},{"type":"crossReference","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"children":[{"type":"text","value":"Bellman consistency","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"key":"uo3Yyl87Co"}],"identifier":"bellman_consistency","label":"bellman_consistency","kind":"proof:theorem","template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"eQ9hRJMMSu"},{"type":"text","value":" equations for the optimal policy:","position":{"start":{"line":807,"column":1},"end":{"line":807,"column":1}},"key":"YYSkf6RRAp"}],"key":"hsY3zfdSVv"},{"type":"proof","kind":"corollary","label":"bellman_consistency_optimal","identifier":"bellman_consistency_optimal","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Bellman consistency equations for the optimal policy","position":{"start":{"line":809,"column":1},"end":{"line":809,"column":1}},"key":"nrjQr53jJK"}],"key":"OWs6kVNiRx"},{"type":"math","value":"\\begin{aligned}\n V_\\hi^\\star(s) &= \\max_a Q_\\hi^\\star(s, a) \\\\\n Q_\\hi^\\star(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\star(s')]\n\\end{aligned}","position":{"start":{"line":812,"column":1},"end":{"line":817,"column":1}},"html":"Vh(s)=maxaQh(s,a)Qh(s,a)=r(s,a)+EsP(s,a)[Vh+1(s)]\\begin{aligned}\n V_\\hi^\\star(s) &= \\max_a Q_\\hi^\\star(s, a) \\\\\n Q_\\hi^\\star(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\star(s')]\n\\end{aligned}Vh(s)Qh(s,a)=amaxQh(s,a)=r(s,a)+EsP(s,a)[Vh+1(s)]","enumerator":"1.25","key":"L96ZjQfmTR"}],"enumerator":"1.1","html_id":"bellman-consistency-optimal","key":"G0mRfsAOos"},{"type":"paragraph","position":{"start":{"line":820,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"Now that we’ve shown this particular greedy policy is optimal, all we\nneed to do is compute the optimal value function and optimal policy. 
We\ncan do this by working backwards in time using ","position":{"start":{"line":820,"column":1},"end":{"line":820,"column":1}},"key":"KdDITe9fIY"},{"type":"strong","position":{"start":{"line":820,"column":1},"end":{"line":820,"column":1}},"children":[{"type":"text","value":"dynamic programming","position":{"start":{"line":820,"column":1},"end":{"line":820,"column":1}},"key":"fOz0NIcRWb"}],"key":"LRRT0lpouF"},{"type":"text","value":"\n(DP).","position":{"start":{"line":820,"column":1},"end":{"line":820,"column":1}},"key":"QLfIdWq47O"}],"key":"fEJMt9Fe5s"},{"type":"proof","kind":"definition","label":"pi_star_dp","identifier":"pi_star_dp","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"DP algorithm to compute an optimal policy in a finite-horizon MDP","position":{"start":{"line":825,"column":1},"end":{"line":825,"column":1}},"key":"jiiQPVbPvF"}],"key":"UYBhmF26As"},{"type":"paragraph","position":{"start":{"line":828,"column":1},"end":{"line":830,"column":1}},"children":[{"type":"strong","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"children":[{"type":"text","value":"Base case.","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"U2lJhrwWAa"}],"key":"huQm7c80Wh"},{"type":"text","value":" At the end of the episode (time step ","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"pDjSzbqyjm"},{"type":"inlineMath","value":"H-1","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"html":"H1H-1H1","key":"fF9RqF9fFq"},{"type":"text","value":"), we can’t\ntake any more actions, so the ","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"ISZ98XNsHG"},{"type":"inlineMath","value":"Q","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"html":"QQQ","key":"hYH3iuuym6"},{"type":"text","value":"-function is simply the reward that\nwe obtain:","position":{"start":{"line":828,"column":1},"end":{"line":828,"column":1}},"key":"YCzoxYWReg"}],"key":"voOYmpAT1s"},{"type":"math","value":"Q^\\star_{H-1}(s, a) = r(s, a)","position":{"start":{"line":832,"column":1},"end":{"line":832,"column":1}},"html":"QH1(s,a)=r(s,a)Q^\\star_{H-1}(s, a) = r(s, a)QH1(s,a)=r(s,a)","enumerator":"1.26","key":"kaaIjDOV7Y"},{"type":"paragraph","position":{"start":{"line":834,"column":1},"end":{"line":835,"column":1}},"children":[{"type":"text","value":"so the best thing to do\nis just act greedily and get as much reward as we can!","position":{"start":{"line":834,"column":1},"end":{"line":834,"column":1}},"key":"XK2jKJO39A"}],"key":"eEvxAce9bR"},{"type":"math","value":"\\pi^\\star_{H-1}(s) = \\arg\\max_a Q^\\star_{H-1}(s, a)","position":{"start":{"line":837,"column":1},"end":{"line":837,"column":1}},"html":"πH1(s)=argmaxaQH1(s,a)\\pi^\\star_{H-1}(s) = \\arg\\max_a Q^\\star_{H-1}(s, a)πH1(s)=argamaxQH1(s,a)","enumerator":"1.27","key":"JsuTKHhoJR"},{"type":"paragraph","position":{"start":{"line":839,"column":1},"end":{"line":841,"column":1}},"children":[{"type":"text","value":"Then\n","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"key":"D8igkEJ2qT"},{"type":"inlineMath","value":"V^\\star_{H-1}(s)","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"html":"VH1(s)V^\\star_{H-1}(s)VH1(s)","key":"CKmbBLj95y"},{"type":"text","value":", the optimal value of state 
","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"key":"QGwCoo6c7E"},{"type":"inlineMath","value":"s","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"html":"sss","key":"nxsNHwgNAW"},{"type":"text","value":" at the end of the\ntrajectory, is simply whatever action gives the most reward.","position":{"start":{"line":839,"column":1},"end":{"line":839,"column":1}},"key":"hUEiqcXDh9"}],"key":"g6kpHO1q47"},{"type":"math","value":"V^\\star_{H-1} = \\max_a Q^\\star_{H-1}(s, a)","position":{"start":{"line":843,"column":1},"end":{"line":843,"column":1}},"html":"VH1=maxaQH1(s,a)V^\\star_{H-1} = \\max_a Q^\\star_{H-1}(s, a)VH1=amaxQH1(s,a)","enumerator":"1.28","key":"Qsf7NcMtu1"},{"type":"paragraph","position":{"start":{"line":845,"column":1},"end":{"line":847,"column":1}},"children":[{"type":"strong","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"children":[{"type":"text","value":"Recursion.","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"key":"EqddAuie0E"}],"key":"Pamamr7Ois"},{"type":"text","value":" Then, we can work backwards in time, starting from the\nend, using our consistency equations! i.e. for each\n","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"key":"vF0licul5g"},{"type":"inlineMath","value":"t = H-2, \\dots, 0","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"html":"t=H2,,0t = H-2, \\dots, 0t=H2,,0","key":"MbdXARxCfT"},{"type":"text","value":", we set","position":{"start":{"line":845,"column":1},"end":{"line":845,"column":1}},"key":"m8BHNrzzEE"}],"key":"KemhTRqO2B"},{"type":"math","value":"\\begin{aligned}\n Q^\\star_{t}(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V^\\star_{\\hi+1}(s')] \\\\\n \\pi^\\star_{t}(s) &= \\arg\\max_a Q^\\star_{t}(s, a) \\\\\n V^\\star_{t}(s) &= \\max_a Q^\\star_{t}(s, a)\n\\end{aligned}","position":{"start":{"line":849,"column":1},"end":{"line":855,"column":1}},"html":"Qt(s,a)=r(s,a)+EsP(s,a)[Vh+1(s)]πt(s)=argmaxaQt(s,a)Vt(s)=maxaQt(s,a)\\begin{aligned}\n Q^\\star_{t}(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V^\\star_{\\hi+1}(s')] \\\\\n \\pi^\\star_{t}(s) &= \\arg\\max_a Q^\\star_{t}(s, a) \\\\\n V^\\star_{t}(s) &= \\max_a Q^\\star_{t}(s, a)\n\\end{aligned}Qt(s,a)πt(s)Vt(s)=r(s,a)+EsP(s,a)[Vh+1(s)]=argamaxQt(s,a)=amaxQt(s,a)","enumerator":"1.29","key":"LvMHJUSu3J"}],"enumerator":"1.11","html_id":"pi-star-dp","key":"mL3cSJTFJi"}],"key":"B4r7r14dz7"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def find_optimal_policy(mdp: MDP):\n Q = [None] * mdp.H\n pi = [None] * mdp.H\n V = [None] * mdp.H + [jnp.zeros(mdp.S)] # initialize to 0 at end of time horizon\n\n for h in range(mdp.H - 1, -1, -1):\n Q[h] = mdp.r + mdp.P @ V[h + 1]\n pi[h] = jnp.eye(mdp.S)[jnp.argmax(Q[h], axis=1)] # one-hot\n V[h] = jnp.max(Q[h], axis=1)\n\n Q = jnp.stack(Q)\n pi = jnp.stack(pi)\n V = jnp.stack(V[:-1])\n\n return pi, V, Q","key":"Q9l0jfBTZu"},{"type":"output","id":"mu9yYFAGH8McwoPVc8JGu","data":[],"key":"Ow2QFC3ZrG"}],"data":{},"key":"gz5xQpgXvK"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":876,"column":1},"end":{"line":879,"column":1}},"children":[{"type":"text","value":"At each of the 
","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"hMz3Z1Knpa"},{"type":"inlineMath","value":"H","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"HHH","key":"rYGLG1tFbR"},{"type":"text","value":" timesteps, we must compute ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"ryhAGakYXK"},{"type":"inlineMath","value":"Q^{\\star}","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"QQ^{\\star}Q","key":"ULLlUTy1dr"},{"type":"text","value":" for each of\nthe ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"aI5lkEZNx5"},{"type":"inlineMath","value":"|\\mathcal{S}| |\\mathcal{A}|","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"SA|\\mathcal{S}| |\\mathcal{A}|S∣∣A","key":"O1X1CVrowQ"},{"type":"text","value":" state-action pairs. Each computation takes ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"pDEOmdtM57"},{"type":"inlineMath","value":"|\\mathcal{S}|","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"S|\\mathcal{S}|S","key":"t7e7knWIEZ"},{"type":"text","value":"\noperations to evaluate the average value over ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"Bv2IZmvXyG"},{"type":"inlineMath","value":"s'","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"ss's","key":"G9Le4QOHbb"},{"type":"text","value":". This gives a total\ncomputation time of ","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"IyGroqnwVa"},{"type":"inlineMath","value":"O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"html":"O(HS2A)O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|)O(HS2A)","key":"OgglZfyRTI"},{"type":"text","value":".","position":{"start":{"line":876,"column":1},"end":{"line":876,"column":1}},"key":"XcnATgbOci"}],"key":"LwViIPDONe"},{"type":"paragraph","position":{"start":{"line":881,"column":1},"end":{"line":886,"column":1}},"children":[{"type":"text","value":"Note that this algorithm is identical to the policy evaluation algorithm\n","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"cLpNCpZJPz"},{"type":"crossReference","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"inlineCode","value":"dp_eval_finite","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"TpRV2MEoOS"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","key":"UlXt7T1JPr"},{"type":"text","value":", but instead of ","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"WeZ2RfQphN"},{"type":"emphasis","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"averaging","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"IXmVSFeaIY"}],"key":"YvpideTL54"},{"type":"text","value":" over the\nactions chosen by a policy, we instead simply take a 
","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"W6KdYmZOM0"},{"type":"emphasis","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"maximum","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"RRDXzqk6xV"}],"key":"Y5V5YsNTZI"},{"type":"text","value":" over the\naction-values. We’ll see this relationship between ","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"bWUCmNOatx"},{"type":"strong","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"policy evaluation","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"JZMkye9l1n"}],"key":"JNlZc1Isbh"},{"type":"text","value":"\nand ","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"IVqJ8CAb3e"},{"type":"strong","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"children":[{"type":"text","value":"optimal policy computation","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"oFKYu8amof"}],"key":"ZbTik1WwPJ"},{"type":"text","value":" show up again in the infinite-horizon\nsetting.","position":{"start":{"line":881,"column":1},"end":{"line":881,"column":1}},"key":"RL9peBKGOU"}],"key":"J3y7wBHpc2"}],"key":"pDLEFmpbIV"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"π_opt, V_opt, Q_opt = find_optimal_policy(tidy_mdp)\nassert jnp.allclose(π_opt, tidy_policy_messy_only)\nassert jnp.allclose(V_opt, V_messy)\nassert jnp.allclose(Q_opt[:-1], v_ary_to_q_ary(tidy_mdp, V_messy)[1:])\n\"Assertions passed (the 'tidy when messy' policy is optimal)\"","key":"HfwxWuqvP3"},{"type":"output","id":"uJUd2VyHrakQE2S0qgjCB","data":[{"output_type":"execute_result","execution_count":16,"metadata":{},"data":{"text/plain":{"content":"\"Assertions passed (the 'tidy when messy' policy is optimal)\"","content_type":"text/plain"}}}],"key":"mVgppIXYWR"}],"data":{},"key":"UsfKUwp6R6"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":897,"column":1},"end":{"line":897,"column":1}},"children":[{"type":"text","value":"Infinite-horizon MDPs","position":{"start":{"line":897,"column":1},"end":{"line":897,"column":1}},"key":"hayxLPmybH"}],"label":"infinite_horizon_mdps","identifier":"infinite_horizon_mdps","html_id":"infinite-horizon-mdps","enumerator":"1.4","key":"kyFcmPgg0w"},{"type":"paragraph","position":{"start":{"line":899,"column":1},"end":{"line":900,"column":1}},"children":[{"type":"text","value":"What happens if a trajectory is allowed to continue forever (i.e.\n","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"uGnVeVjvMp"},{"type":"inlineMath","value":"H = \\infty","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"html":"H=H = \\inftyH=","key":"frszbE8P8V"},{"type":"text","value":")? 
This is the setting of ","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"QPHZTQMVcv"},{"type":"strong","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"children":[{"type":"text","value":"infinite horizon","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"yVZ2VpSoOl"}],"key":"mdhLJ8uCj8"},{"type":"text","value":" MDPs.","position":{"start":{"line":899,"column":1},"end":{"line":899,"column":1}},"key":"saXUz2jnFH"}],"key":"grfsNeOFoQ"},{"type":"paragraph","position":{"start":{"line":902,"column":1},"end":{"line":910,"column":1}},"children":[{"type":"text","value":"In this chapter, we’ll describe the necessary adjustments from the\nfinite-horizon case to make the problem tractable. We’ll show that the\n","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"kH9e6XRo3m"},{"type":"crossReference","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"Bellman operator","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"K96Lq7WAQr"}],"identifier":"bellman_operator","label":"bellman_operator","kind":"proof:definition","template":"Definition %s","enumerator":"1.8","resolved":true,"html_id":"bellman-operator","key":"ZDLjOFR1KH"},{"type":"text","value":" in the discounted reward setting is a\n","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"S8QIlkEAqD"},{"type":"strong","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"contraction mapping","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"zbhFdFz2bS"}],"key":"BvE3pv8gd6"},{"type":"text","value":" for any policy.\nWe’ll discuss how to evaluate\npolicies (i.e. compute their corresponding value functions). 
Finally,\nwe’ll present and analyze two iterative algorithms, based on the Bellman\noperator, for computing the optimal policy: ","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"ayQeEO5JOP"},{"type":"strong","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"value iteration","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"qTDjummItF"}],"key":"A41Neptit7"},{"type":"text","value":" and\n","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"oTailhnrGY"},{"type":"strong","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"policy iteration","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"dfCwydYMMN"}],"key":"UhURmWFJIl"},{"type":"text","value":".","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"KlL86VdhjV"}],"key":"aZxFr2K4tA"},{"type":"heading","depth":3,"position":{"start":{"line":912,"column":1},"end":{"line":912,"column":1}},"children":[{"type":"text","value":"Discounted rewards","position":{"start":{"line":912,"column":1},"end":{"line":912,"column":1}},"key":"MOrTZyAHdb"}],"identifier":"discounted-rewards","label":"Discounted rewards","html_id":"discounted-rewards","implicit":true,"enumerator":"1.4.1","key":"Zc0YTi8rNZ"},{"type":"paragraph","position":{"start":{"line":914,"column":1},"end":{"line":918,"column":1}},"children":[{"type":"text","value":"First of all, note that maximizing the cumulative reward\n","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"D3y9LLCsvq"},{"type":"inlineMath","value":"r_\\hi + r_{\\hi+1} + r_{\\hi+2} + \\cdots","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"html":"rh+rh+1+rh+2+r_\\hi + r_{\\hi+1} + r_{\\hi+2} + \\cdotsrh+rh+1+rh+2+","key":"yLO7tQsrlc"},{"type":"text","value":" is no longer a good idea since it\nmight blow up to infinity. 
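Before introducing the fix, here is a small numerical sketch of the issue and of the discounted sum defined next. The constant per-step reward of 1 and the discount value 0.9 are illustrative choices, not taken from the text: the plain cumulative sum grows without bound as the horizon grows, while the discounted sum levels off at a finite value.

```python
# Sketch: a constant reward of 1 at every step, with an illustrative discount of 0.9.
gamma = 0.9

for horizon in (10, 100, 1_000, 10_000):
    undiscounted = sum(1.0 for _ in range(horizon))
    discounted = sum(gamma**k for k in range(horizon))
    print(f"{horizon:>6} steps: undiscounted = {undiscounted:>8.1f}, discounted = {discounted:.4f}")

# The undiscounted total keeps growing with the horizon (it diverges as H grows),
# while the discounted total converges to a finite limit.
```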
Instead of a time horizon ","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"PXZLbDGZng"},{"type":"inlineMath","value":"H","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"html":"HHH","key":"wytoQkyZLy"},{"type":"text","value":", we now need a\n","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"nNcIVOvMZp"},{"type":"strong","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"children":[{"type":"text","value":"discount factor","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"jPkL80ltAE"}],"key":"cSuTHTEIC0"},{"type":"text","value":" ","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"FMV2VaQUTS"},{"type":"inlineMath","value":"\\gamma \\in [0, 1)","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"html":"γ[0,1)\\gamma \\in [0, 1)γ[0,1)","key":"Nf3NoLjakj"},{"type":"text","value":" such that rewards become less\nvaluable the further into the future they are:","position":{"start":{"line":914,"column":1},"end":{"line":914,"column":1}},"key":"AEYwfI5JDa"}],"key":"zy7kP6gZ3F"},{"type":"math","value":"r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots = \\sum_{k=0}^\\infty \\gamma^k r_{\\hi+k}.","position":{"start":{"line":920,"column":1},"end":{"line":920,"column":1}},"html":"rh+γrh+1+γ2rh+2+=k=0γkrh+k.r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots = \\sum_{k=0}^\\infty \\gamma^k r_{\\hi+k}.rh+γrh+1+γ2rh+2+=k=0γkrh+k.","enumerator":"1.30","key":"UFV0dFBUnD"},{"type":"paragraph","position":{"start":{"line":922,"column":1},"end":{"line":924,"column":1}},"children":[{"type":"text","value":"We can think of ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"WtmDdqBckf"},{"type":"text","value":"γ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"TZQfpT3inq"},{"type":"text","value":" as measuring how much we care about the future:\nif it’s close to ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"JwBjEVptmQ"},{"type":"text","value":"0","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"XpcHYgpDxV"},{"type":"text","value":", we only care about the near-term rewards; it’s\nclose to ","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"xGRDYDBM73"},{"type":"text","value":"1","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"cR5LRVA11O"},{"type":"text","value":", we put more weight into future rewards.","position":{"start":{"line":922,"column":1},"end":{"line":922,"column":1}},"key":"QHhu8vRC1u"}],"key":"cRO9kvmLhP"},{"type":"paragraph","position":{"start":{"line":926,"column":1},"end":{"line":932,"column":1}},"children":[{"type":"text","value":"You can also analyze ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"bumEhdE9hs"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"ihYpyAzT5c"},{"type":"text","value":" as the probability of 
","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"x3D2fXCTbz"},{"type":"emphasis","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"children":[{"type":"text","value":"continuing","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"iC6Hwz95rY"}],"key":"Ql9YgJKwO2"},{"type":"text","value":" the\ntrajectory at each time step. (This is equivalent to ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"bHd7vAmHdu"},{"type":"inlineMath","value":"H","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"html":"HHH","key":"oxiusGSbUO"},{"type":"text","value":" being\ndistributed by a First Success distribution with success probability\n","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"DJpcJW8985"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"anvTlmmoXK"},{"type":"text","value":".) This accords with the above interpretation: if ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"jSMUggsmBP"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"ntchbVZdWq"},{"type":"text","value":" is\nclose to ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"CaK473pYEU"},{"type":"text","value":"0","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"zSEWz8WfJN"},{"type":"text","value":", the trajectory will likely be very short, while if\n","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"MYePryyLMF"},{"type":"text","value":"γ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"svnehWyTo2"},{"type":"text","value":" is close to ","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"TogIdHqrme"},{"type":"text","value":"1","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"aBOC77kp8H"},{"type":"text","value":", the trajectory will likely continue for a long\ntime.","position":{"start":{"line":926,"column":1},"end":{"line":926,"column":1}},"key":"be2zoEk4wC"}],"key":"JKIddAPj0x"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"fnJG9dEDrO"}],"key":"Ls3QpHv7JO"},{"type":"paragraph","position":{"start":{"line":935,"column":1},"end":{"line":937,"column":1}},"children":[{"type":"text","value":"Assuming that ","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"IbztZ1zzO5"},{"type":"inlineMath","value":"r_\\hi \\in [0, 1]","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"html":"rh[0,1]r_\\hi \\in [0, 1]rh[0,1]","key":"MNfmckzaMP"},{"type":"text","value":" for all ","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"QsyxIcnRml"},{"type":"inlineMath","value":"\\hi \\in \\mathbb{N}","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"html":"hN\\hi \\in \\mathbb{N}hN","key":"oC6jBGpsg9"},{"type":"text","value":",\nwhat is the maximum 
","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"EDO0MzdWq6"},{"type":"strong","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"children":[{"type":"text","value":"discounted","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"c0ycxfK7Bz"}],"key":"li6F1HFl6S"},{"type":"text","value":" cumulative reward? You may find it\nuseful to review geometric series.","position":{"start":{"line":935,"column":1},"end":{"line":935,"column":1}},"key":"iANKYhBwpF"}],"key":"drMz8tizul"}],"key":"QOK2dMQk1u"},{"type":"paragraph","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"children":[{"type":"text","value":"The other components of the MDP remain the same:","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"key":"CaBltdmk1T"}],"key":"nHapWfMJAT"},{"type":"math","value":"M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\gamma).","position":{"start":{"line":942,"column":1},"end":{"line":942,"column":1}},"html":"M=(S,A,μ,P,r,γ).M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\gamma).M=(S,A,μ,P,r,γ).","enumerator":"1.31","key":"cnxUGx5wSm"},{"type":"paragraph","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"children":[{"type":"text","value":"Code-wise, we can reuse the ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"bA3tIXMwbN"},{"type":"inlineCode","value":"MDP","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"MJYDNodnWV"},{"type":"text","value":" class from before ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"Epa9Vs398K"},{"type":"crossReference","kind":"proof:definition","identifier":"finite_horizon_mdp","label":"finite_horizon_mdp","children":[{"type":"text","value":"Definition ","key":"XirYhr4PMV"},{"type":"text","value":"1.2","key":"YTxzEtcf1a"}],"template":"Definition %s","enumerator":"1.2","resolved":true,"html_id":"finite-horizon-mdp","key":"Wjko9t65im"},{"type":"text","value":" and set ","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"nfnUbSQXRg"},{"type":"inlineCode","value":"mdp.H = float('inf')","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"MwQbKxbdrG"},{"type":"text","value":".","position":{"start":{"line":944,"column":1},"end":{"line":944,"column":1}},"key":"L1ZypGvfOz"}],"key":"tJ94biBRdD"}],"key":"dG0owfvLtm"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"tidy_mdp_inf = tidy_mdp._replace(H=float(\"inf\"), γ=0.95)","key":"VrafhjeOLm"},{"type":"output","id":"6r-9b1xWuNH-E2mFupUbF","data":[],"key":"t5jyV4iSGl"}],"data":{},"key":"TnsstsFuH7"},{"type":"block","children":[{"type":"heading","depth":3,"position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"children":[{"type":"text","value":"Stationary policies","position":{"start":{"line":950,"column":1},"end":{"line":950,"column":1}},"key":"jgS6GSP0wA"}],"identifier":"stationary-policies","label":"Stationary policies","html_id":"stationary-policies","implicit":true,"enumerator":"1.4.2","key":"a6hMrWoX2v"},{"type":"paragraph","position":{"start":{"line":952,"column":1},"end":{"line":956,"column":1}},"children":[{"type":"text","value":"The time-dependent policies from the finite-horizon case become\ndifficult to handle in the infinite-horizon case. 
In particular, many of\nthe DP approaches we saw required us to start at the end of the\ntrajectory, which is no longer possible. We’ll shift to ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"fYQMfnFQde"},{"type":"strong","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"children":[{"type":"text","value":"stationary","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"VazN8lUMvu"}],"key":"MBBG23FgKZ"},{"type":"text","value":"\npolicies ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"U9OEeKorx8"},{"type":"inlineMath","value":"\\pi : \\mathcal{S} \\to \\mathcal{A}","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"html":"π:SA\\pi : \\mathcal{S} \\to \\mathcal{A}π:SA","key":"RmDDw937xh"},{"type":"text","value":" (deterministic) or ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"IHr68K76DE"},{"type":"inlineMath","value":"\\Delta(\\mathcal{A})","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"html":"Δ(A)\\Delta(\\mathcal{A})Δ(A)","key":"nQF4w1Jr0S"},{"type":"text","value":" (stochastic).","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"L7cjhOnUV8"}],"key":"MOvKgvRJe4"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"UU8FHFmWn7"}],"key":"KVyEhuVTTC"},{"type":"paragraph","position":{"start":{"line":959,"column":1},"end":{"line":959,"column":1}},"children":[{"type":"text","value":"Which of the policies in ","position":{"start":{"line":959,"column":1},"end":{"line":959,"column":1}},"key":"Ia15mktCy0"},{"type":"crossReference","kind":"proof:example","identifier":"tidy_policy","label":"tidy_policy","children":[{"type":"text","value":"Example ","key":"crTNtDNlz6"},{"type":"text","value":"1.2","key":"sHGFGia0nK"}],"template":"Example %s","enumerator":"1.2","resolved":true,"html_id":"tidy-policy","key":"fhFrUIAB8k"},{"type":"text","value":" are stationary?","position":{"start":{"line":959,"column":1},"end":{"line":959,"column":1}},"key":"QdLA7FDwhh"}],"key":"OSRHdlgRJw"}],"key":"lL4bRXirZC"},{"type":"heading","depth":3,"position":{"start":{"line":962,"column":1},"end":{"line":962,"column":1}},"children":[{"type":"text","value":"Value functions and Bellman consistency","position":{"start":{"line":962,"column":1},"end":{"line":962,"column":1}},"key":"ZkkSvS65Su"}],"identifier":"value-functions-and-bellman-consistency","label":"Value functions and Bellman consistency","html_id":"value-functions-and-bellman-consistency","implicit":true,"enumerator":"1.4.3","key":"OWMLAiK5F9"},{"type":"paragraph","position":{"start":{"line":964,"column":1},"end":{"line":966,"column":1}},"children":[{"type":"text","value":"We also consider stationary value functions ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"N2Ohb7mFok"},{"type":"inlineMath","value":"V^\\pi : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"html":"Vπ:SRV^\\pi : \\mathcal{S} \\to \\mathbb{R}Vπ:SR","key":"nq0YejyQka"},{"type":"text","value":" and\n","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"ZaoMsBIclW"},{"type":"inlineMath","value":"Q^\\pi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"html":"Qπ:S×ARQ^\\pi : 
\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}Qπ:S×AR","key":"EyqkEieLPz"},{"type":"text","value":". We need to insert a factor of ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"h76uqBHiCl"},{"type":"text","value":"γ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"GmAkiXSwmT"},{"type":"text","value":"\ninto the Bellman consistency equation ","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"jrMnGZ5T1v"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_consistency","label":"bellman_consistency","children":[{"type":"text","value":"Theorem ","key":"E56mDRFvKe"},{"type":"text","value":"1.1","key":"KjNKO1BTk1"}],"template":"Theorem %s","enumerator":"1.1","resolved":true,"html_id":"bellman-consistency","key":"XPglg10Y3w"},{"type":"text","value":" to account for the discounting:","position":{"start":{"line":964,"column":1},"end":{"line":964,"column":1}},"key":"TjB93KnqNh"}],"key":"BuS63Ve1eK"},{"type":"math","value":"\\begin{aligned}\n V^\\pi(s) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} \\cdots \\mid s_\\hi = s] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma V^\\pi(s')]\\\\\n Q^\\pi(s, a) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s, a_\\hi = a] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= r(s, a) + \\gamma \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi(s')}} [Q^\\pi(s', a')]\n\\end{aligned}","label":"bellman_consistency_infinite","identifier":"bellman_consistency_infinite","html":"Vπ(s)=Eτρπ[rh+γrh+1+γ2rh+2sh=s]for any hN=Eaπ(s)sP(s,a)[r(s,a)+γVπ(s)]Qπ(s,a)=Eτρπ[rh+γrh+1+γ2rh+2+sh=s,ah=a]for any hN=r(s,a)+γEsP(s,a)aπ(s)[Qπ(s,a)]\\begin{aligned}\n V^\\pi(s) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} \\cdots \\mid s_\\hi = s] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma V^\\pi(s')]\\\\\n Q^\\pi(s, a) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s, a_\\hi = a] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= r(s, a) + \\gamma \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi(s')}} [Q^\\pi(s', a')]\n\\end{aligned}Vπ(s)Qπ(s,a)=Eτρπ[rh+γrh+1+γ2rh+2sh=s]=Eaπ(s)sP(s,a)[r(s,a)+γVπ(s)]=Eτρπ[rh+γrh+1+γ2rh+2+sh=s,ah=a]=r(s,a)+γEsP(s,a)aπ(s)[Qπ(s,a)]for any hNfor any hN","enumerator":"1.32","html_id":"bellman-consistency-infinite","key":"cNrFIAQJ9c"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"pOue5XPWpt"}],"key":"vqXuNcKIH8"},{"type":"paragraph","position":{"start":{"line":980,"column":1},"end":{"line":981,"column":1}},"children":[{"type":"text","value":"Heuristically speaking, why does it no longer matter which\ntime step we condition on when defining the value function?","position":{"start":{"line":980,"column":1},"end":{"line":980,"column":1}},"key":"qu7T4aeo9E"}],"key":"TOG8RyFTKw"}],"key":"ZLAbzGBhE8"},{"type":"heading","depth":2,"position":{"start":{"line":984,"column":1},"end":{"line":984,"column":1}},"children":[{"type":"text","value":"Solving infinite-horizon 
MDPs","position":{"start":{"line":984,"column":1},"end":{"line":984,"column":1}},"key":"FpNzesfBRJ"}],"identifier":"solving-infinite-horizon-mdps","label":"Solving infinite-horizon MDPs","html_id":"solving-infinite-horizon-mdps","implicit":true,"enumerator":"1.5","key":"eRRV1kKgjm"},{"type":"heading","depth":3,"position":{"start":{"line":986,"column":1},"end":{"line":986,"column":1}},"children":[{"type":"text","value":"The Bellman operator is a contraction mapping","position":{"start":{"line":986,"column":1},"end":{"line":986,"column":1}},"key":"b9JMNJENis"}],"identifier":"the-bellman-operator-is-a-contraction-mapping","label":"The Bellman operator is a contraction mapping","html_id":"the-bellman-operator-is-a-contraction-mapping","implicit":true,"enumerator":"1.5.1","key":"ggoTCdcdVm"},{"type":"paragraph","position":{"start":{"line":988,"column":1},"end":{"line":991,"column":1}},"children":[{"type":"text","value":"Recall from ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"UEixeLwTNH"},{"type":"crossReference","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"children":[{"type":"text","value":"Definition ","key":"qNy456We8U"},{"type":"text","value":"1.8","key":"PWd7XxeD7b"}],"identifier":"bellman_operator","label":"bellman_operator","kind":"proof:definition","template":"Definition %s","enumerator":"1.8","resolved":true,"html_id":"bellman-operator","key":"uQ4m0jnMiz"},{"type":"text","value":" that the Bellman operator ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"xhnLC8PxL9"},{"type":"inlineMath","value":"\\mathcal{J}^{\\pi}","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"html":"Jπ\\mathcal{J}^{\\pi}Jπ","key":"QxBkkO8qzs"},{"type":"text","value":"\nfor a policy ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"A5FdpxYulo"},{"type":"text","value":"π","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"CIxB5aenQ2"},{"type":"text","value":" takes in a “value function” ","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"CzbfEsHQSL"},{"type":"inlineMath","value":"v : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"html":"v:SRv : \\mathcal{S} \\to \\mathbb{R}v:SR","key":"zNMpKtsU19"},{"type":"text","value":" and\nreturns the r.h.s. of the Bellman equation for that “value function”. 
In\nthe infinite-horizon setting, this is","position":{"start":{"line":988,"column":1},"end":{"line":988,"column":1}},"key":"DqyaDUB2bF"}],"key":"gAbOdaJPvC"},{"type":"math","value":"[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma v(s')].","position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"html":"[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+γv(s)].[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma v(s')].[Jπ(v)](s):=Eaπ(s)sP(s,a)[r(s,a)+γv(s)].","enumerator":"1.33","key":"X2D0UxDvjZ"},{"type":"paragraph","position":{"start":{"line":995,"column":1},"end":{"line":999,"column":1}},"children":[{"type":"text","value":"The crucial property of the Bellman operator is that it is a\n","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"ZhcOegW1LA"},{"type":"strong","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"contraction mapping","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"Y4lDsieO24"}],"key":"xULZ9j1oEA"},{"type":"text","value":" for any policy. Intuitively, if we start with\ntwo “value functions” ","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"S3T4KJluin"},{"type":"inlineMath","value":"v, u : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"html":"v,u:SRv, u : \\mathcal{S} \\to \\mathbb{R}v,u:SR","key":"hEPgzbOSsc"},{"type":"text","value":", if we repeatedly apply the\nBellman operator to each of them, they will get closer and closer\ntogether at an exponential rate.","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"JPM03QoQnS"}],"key":"qPLIV9kUBM"},{"type":"proof","kind":"definition","label":"contraction","identifier":"contraction","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Contraction mapping","position":{"start":{"line":1001,"column":1},"end":{"line":1001,"column":1}},"key":"jg8FdGpjVD"}],"key":"DXEzYBGcV3"},{"type":"paragraph","position":{"start":{"line":1004,"column":1},"end":{"line":1005,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"WpfX5XLjs8"},{"type":"inlineMath","value":"X","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"XXX","key":"ygErvFBnfh"},{"type":"text","value":" be some space with a norm ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"olmAj6OwvO"},{"type":"inlineMath","value":"\\|\\cdot\\|","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"\\|\\cdot\\|","key":"Hzfd9prO1u"},{"type":"text","value":". 
We call an operator\n","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"fCJu9zFFXK"},{"type":"inlineMath","value":"f: X \\to X","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"f:XXf: X \\to Xf:XX","key":"Lzpez1KXLn"},{"type":"text","value":" a ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"v3v8LFoPTJ"},{"type":"strong","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"children":[{"type":"text","value":"contraction mapping","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"v8wjPMJDAg"}],"key":"cS57EWsKPg"},{"type":"text","value":" if for any ","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"Z9NOB303oy"},{"type":"inlineMath","value":"x, y \\in X","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"html":"x,yXx, y \\in Xx,yX","key":"ebhrXc7QAk"},{"type":"text","value":",","position":{"start":{"line":1004,"column":1},"end":{"line":1004,"column":1}},"key":"RUM64zHWfn"}],"key":"us9MzF9qME"},{"type":"math","value":"\\|f(x) - f(y)\\| \\le \\gamma \\|x - y\\|","position":{"start":{"line":1007,"column":1},"end":{"line":1007,"column":1}},"html":"f(x)f(y)γxy\\|f(x) - f(y)\\| \\le \\gamma \\|x - y\\|f(x)f(y)γxy","enumerator":"1.34","key":"Ggdciyr8LC"},{"type":"paragraph","position":{"start":{"line":1009,"column":1},"end":{"line":1011,"column":1}},"children":[{"type":"text","value":"for some fixed ","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"jr9WUJjwCr"},{"type":"inlineMath","value":"\\gamma \\in (0, 1)","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"html":"γ(0,1)\\gamma \\in (0, 1)γ(0,1)","key":"Ud9FdXlR8r"},{"type":"text","value":".\nIntuitively, this means that if two points are ","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"p0bvc4Z3w0"},{"type":"text","value":"δ","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"puSmGI1hnV"},{"type":"text","value":" far apart,\nafter applying the mapping,","position":{"start":{"line":1009,"column":1},"end":{"line":1009,"column":1}},"key":"n79WOa2lPE"}],"key":"MfOIfPZl5i"}],"enumerator":"1.12","html_id":"contraction","key":"oIGXS8GT0r"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"mjbXjX6HC8"}],"key":"K9nsQgvHA6"},{"type":"paragraph","position":{"start":{"line":1016,"column":1},"end":{"line":1017,"column":1}},"children":[{"type":"text","value":"Show that for a contraction mapping ","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"DdiOb8wbdH"},{"type":"inlineMath","value":"f","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"html":"fff","key":"TEylMrIdSw"},{"type":"text","value":" with coefficient\n","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"C99G7Ka4Sj"},{"type":"text","value":"γ","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"kl0E1ceLmw"},{"type":"text","value":", for all ","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"G3ul3PrX8q"},{"type":"inlineMath","value":"t \\in \\mathbb{N}","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"html":"tNt \\in 
\\mathbb{N}tN","key":"oAchuLUNra"},{"type":"text","value":",","position":{"start":{"line":1016,"column":1},"end":{"line":1016,"column":1}},"key":"G1m1VK5voo"}],"key":"KEUKlUkhSS"},{"type":"math","value":"\\|f^{(t)}(x) - f^{(t)}(y)\\| \\le \\gamma^t \\|x - y\\|,","position":{"start":{"line":1019,"column":1},"end":{"line":1019,"column":1}},"html":"f(t)(x)f(t)(y)γtxy,\\|f^{(t)}(x) - f^{(t)}(y)\\| \\le \\gamma^t \\|x - y\\|,f(t)(x)f(t)(y)γtxy,","enumerator":"1.35","key":"JiqDqCkVw2"},{"type":"paragraph","position":{"start":{"line":1021,"column":1},"end":{"line":1023,"column":1}},"children":[{"type":"text","value":"i.e. that any\ntwo points will be pushed closer by at least a factor of ","position":{"start":{"line":1021,"column":1},"end":{"line":1021,"column":1}},"key":"UcEjnjQCP2"},{"type":"text","value":"γ","position":{"start":{"line":1021,"column":1},"end":{"line":1021,"column":1}},"key":"J3tpTJ5vtK"},{"type":"text","value":" at\neach iteration.","position":{"start":{"line":1021,"column":1},"end":{"line":1021,"column":1}},"key":"pcy0PosqP7"}],"key":"WbjBqCKmPP"}],"key":"CZbaIDu9Py"},{"type":"paragraph","position":{"start":{"line":1026,"column":1},"end":{"line":1029,"column":1}},"children":[{"type":"text","value":"It is a powerful fact (known as the ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"LkALmR4XDK"},{"type":"strong","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"children":[{"type":"text","value":"Banach fixed-point theorem","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"nraqJ23lmi"}],"key":"LtfNTU6DcC"},{"type":"text","value":") that\nevery contraction mapping has a unique ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"j6eVCIz5kn"},{"type":"strong","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"children":[{"type":"text","value":"fixed point","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"EMOgFtimAZ"}],"key":"oxwf2B2pW1"},{"type":"text","value":" ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"llqf0fLBKr"},{"type":"inlineMath","value":"x^\\star","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"xx^\\starx","key":"EgX1FeJPZn"},{"type":"text","value":" such\nthat ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"CIQYugZh1R"},{"type":"inlineMath","value":"f(x^\\star) = x^\\star","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"f(x)=xf(x^\\star) = x^\\starf(x)=x","key":"sORvwtc5KA"},{"type":"text","value":". 
This means that if we repeatedly apply ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"uHCakCMOtv"},{"type":"inlineMath","value":"f","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"fff","key":"SOcmHVuI0h"},{"type":"text","value":"\nto any starting point, we will eventually converge to ","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"i3AjizRRCs"},{"type":"inlineMath","value":"x^\\star","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"html":"xx^\\starx","key":"iH02uSfYhp"},{"type":"text","value":":","position":{"start":{"line":1026,"column":1},"end":{"line":1026,"column":1}},"key":"XLUnuZpHDD"}],"key":"wAZIJZqpIS"},{"type":"math","value":"\\|f^{(t)}(x) - x^\\star\\| \\le \\gamma^t \\|x - x^\\star\\|.","label":"contraction_convergence","identifier":"contraction_convergence","html":"f(t)(x)xγtxx.\\|f^{(t)}(x) - x^\\star\\| \\le \\gamma^t \\|x - x^\\star\\|.f(t)(x)xγtxx∥.","enumerator":"1.36","html_id":"contraction-convergence","key":"TbSukWlnHg"},{"type":"paragraph","position":{"start":{"line":1037,"column":1},"end":{"line":1040,"column":1}},"children":[{"type":"text","value":"Let’s return to the RL setting and apply this result to the Bellman\noperator. How can we measure the distance between two “value functions”\n","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"bSU0cYkHwq"},{"type":"inlineMath","value":"v, u : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"html":"v,u:SRv, u : \\mathcal{S} \\to \\mathbb{R}v,u:SR","key":"d2rbK6EoyJ"},{"type":"text","value":"? We’ll take the ","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"VPz4PBlawy"},{"type":"strong","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"children":[{"type":"text","value":"supremum norm","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"klCTyBxlHz"}],"key":"DRuTAO4DJB"},{"type":"text","value":" as our distance\nmetric:","position":{"start":{"line":1037,"column":1},"end":{"line":1037,"column":1}},"key":"youUpis09j"}],"key":"mrySTM3ETE"},{"type":"math","value":"\\| v - u \\|_{\\infty} := \\sup_{s \\in \\mathcal{S}} |v(s) - u(s)|,","position":{"start":{"line":1042,"column":1},"end":{"line":1042,"column":1}},"html":"vu:=supsSv(s)u(s),\\| v - u \\|_{\\infty} := \\sup_{s \\in \\mathcal{S}} |v(s) - u(s)|,vu:=sSsupv(s)u(s),","enumerator":"1.37","key":"JvAKPWxniC"},{"type":"paragraph","position":{"start":{"line":1044,"column":1},"end":{"line":1048,"column":1}},"children":[{"type":"text","value":"i.e.\nwe compare the “value functions” on the state that causes the biggest\ngap between them. 
Then ","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"oEbIJeIhBG"},{"type":"crossReference","kind":"equation","identifier":"contraction_convergence","label":"contraction_convergence","children":[{"type":"text","value":"(","key":"Ovb57Zh2m8"},{"type":"text","value":"1.36","key":"y7WDUDv9R6"},{"type":"text","value":")","key":"eMl46AyYL8"}],"template":"(%s)","enumerator":"1.36","resolved":true,"html_id":"contraction-convergence","key":"gOBebb2w2U"},{"type":"text","value":" implies that if we repeatedly\napply ","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"SYohWx9C9J"},{"type":"inlineMath","value":"\\mathcal{J}^\\pi","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"html":"Jπ\\mathcal{J}^\\piJπ","key":"jsMSuM3OdB"},{"type":"text","value":" to any starting “value function”, we will eventually\nconverge to ","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"pNEwBsCPg4"},{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"html":"VπV^\\piVπ","key":"I6gFLNq3LU"},{"type":"text","value":":","position":{"start":{"line":1044,"column":1},"end":{"line":1044,"column":1}},"key":"mReyg47NVJ"}],"key":"gSuTsWW9d5"},{"type":"math","value":"\\|(\\mathcal{J}^\\pi)^{(t)}(v) - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v - V^\\pi\\|_{\\infty}.","label":"bellman_convergence","identifier":"bellman_convergence","html":"(Jπ)(t)(v)VπγtvVπ.\\|(\\mathcal{J}^\\pi)^{(t)}(v) - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v - V^\\pi\\|_{\\infty}.(Jπ)(t)(v)VπγtvVπ.","enumerator":"1.38","html_id":"bellman-convergence","key":"SHXh8A3qMt"},{"type":"paragraph","position":{"start":{"line":1056,"column":1},"end":{"line":1057,"column":1}},"children":[{"type":"text","value":"We’ll use this useful fact to prove the convergence of several\nalgorithms later on.","position":{"start":{"line":1056,"column":1},"end":{"line":1056,"column":1}},"key":"N2YZ5ARliY"}],"key":"Q0c8jt89LT"},{"type":"proof","kind":"theorem","label":"bellman_contraction","identifier":"bellman_contraction","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"The Bellman operator is a contraction mapping","position":{"start":{"line":1059,"column":1},"end":{"line":1059,"column":1}},"key":"M4Kxz2h7t5"}],"key":"GMdvfTqIRs"},{"type":"math","value":"\\|\\mathcal{J}^{\\pi} (v) - \\mathcal{J}^{\\pi} (u) \\|_{\\infty} \\le \\gamma \\|v - u \\|_{\\infty}.","position":{"start":{"line":1062,"column":1},"end":{"line":1064,"column":1}},"html":"Jπ(v)Jπ(u)γvu.\\|\\mathcal{J}^{\\pi} (v) - \\mathcal{J}^{\\pi} (u) \\|_{\\infty} \\le \\gamma \\|v - u \\|_{\\infty}.Jπ(v)Jπ(u)γvu.","enumerator":"1.39","key":"AxRpqtEpRo"}],"enumerator":"1.4","html_id":"bellman-contraction","key":"s5sY9uGmRZ"},{"type":"proof","kind":"proof","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Proof of ","position":{"start":{"line":1067,"column":1},"end":{"line":1067,"column":1}},"key":"HU5aFchLgx"},{"type":"crossReference","kind":"proof:theorem","identifier":"bellman_contraction","label":"bellman_contraction","children":[{"type":"text","value":"Theorem ","key":"cgNh4eSiFq"},{"type":"text","value":"1.4","key":"gNRz5nPTZC"}],"template":"Theorem 
%s","enumerator":"1.4","resolved":true,"html_id":"bellman-contraction","key":"xGuvgYJaLt"}],"key":"YvYQVTd5kG"},{"type":"paragraph","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"children":[{"type":"text","value":"For all states ","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"key":"ZnzwEQfvRp"},{"type":"inlineMath","value":"s \\in \\mathcal{S}","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"html":"sSs \\in \\mathcal{S}sS","key":"pyKYEYBXQg"},{"type":"text","value":",","position":{"start":{"line":1069,"column":1},"end":{"line":1069,"column":1}},"key":"iGWGRZjZMZ"}],"key":"LYDmIZCkIz"},{"type":"math","value":"\\begin{aligned}\n|[\\mathcal{J}^{\\pi} (v)](s) - [\\mathcal{J}^{\\pi} (u)](s)|&= \\Big| \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[ r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} v(s') \\right] \\\\\n&\\qquad - \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} u(s') \\right] \\Big| \\\\\n&= \\gamma \\left|\\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} [v(s') - u(s')] \\right| \\\\\n&\\le \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}|v(s') - u(s')| \\qquad \\text{(Jensen's inequality)} \\\\\n&\\le \\gamma \\max_{s'} |v(s') - u(s')| \\\\\n&= \\gamma \\|v - u \\|_{\\infty}.\n\\end{aligned}","position":{"start":{"line":1071,"column":1},"end":{"line":1080,"column":1}},"html":"[Jπ(v)](s)[Jπ(u)](s)=Eaπ(s)[r(s,a)+γEsP(s,a)v(s)]Eaπ(s)[r(s,a)+γEsP(s,a)u(s)]=γEsP(s,a)[v(s)u(s)]γEsP(s,a)v(s)u(s)(Jensen’s inequality)γmaxsv(s)u(s)=γvu.\\begin{aligned}\n|[\\mathcal{J}^{\\pi} (v)](s) - [\\mathcal{J}^{\\pi} (u)](s)|&= \\Big| \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[ r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} v(s') \\right] \\\\\n&\\qquad - \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} u(s') \\right] \\Big| \\\\\n&= \\gamma \\left|\\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} [v(s') - u(s')] \\right| \\\\\n&\\le \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}|v(s') - u(s')| \\qquad \\text{(Jensen's inequality)} \\\\\n&\\le \\gamma \\max_{s'} |v(s') - u(s')| \\\\\n&= \\gamma \\|v - u \\|_{\\infty}.\n\\end{aligned}[Jπ(v)](s)[Jπ(u)](s)=Eaπ(s)[r(s,a)+γEsP(s,a)v(s)]Eaπ(s)[r(s,a)+γEsP(s,a)u(s)]=γEsP(s,a)[v(s)u(s)]γEsP(s,a)v(s)u(s)(Jensen’s inequality)γsmaxv(s)u(s)=γvu.","enumerator":"1.40","key":"AmcePkT6tc"}],"enumerator":"1.2","key":"Xi6CFO74vD"},{"type":"heading","depth":3,"position":{"start":{"line":1083,"column":1},"end":{"line":1083,"column":1}},"children":[{"type":"text","value":"Policy evaluation in infinite-horizon MDPs","position":{"start":{"line":1083,"column":1},"end":{"line":1083,"column":1}},"key":"TNLawe63ea"}],"identifier":"policy-evaluation-in-infinite-horizon-mdps","label":"Policy evaluation in infinite-horizon MDPs","html_id":"policy-evaluation-in-infinite-horizon-mdps","implicit":true,"enumerator":"1.5.2","key":"gAe8GTd54V"},{"type":"paragraph","position":{"start":{"line":1085,"column":1},"end":{"line":1087,"column":1}},"children":[{"type":"text","value":"The backwards DP technique we used in ","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"key":"ohIBNNv9Jk"},{"type":"crossReference","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"children":[{"type":"text","value":"the finite-horizon 
case","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"key":"RhkQ30q9LN"}],"identifier":"eval_dp","label":"eval_dp","kind":"heading","template":"Section %s","enumerator":"1.3.1","resolved":true,"html_id":"eval-dp","key":"KbAhztKn6P"},{"type":"text","value":" no\nlonger works since there is no “final timestep” to start from. We’ll\nneed another approach to policy evaluation.","position":{"start":{"line":1085,"column":1},"end":{"line":1085,"column":1}},"key":"N14H4fCc8m"}],"key":"zvECUepdcn"},{"type":"paragraph","position":{"start":{"line":1089,"column":1},"end":{"line":1092,"column":1}},"children":[{"type":"text","value":"The Bellman consistency conditions yield a system of equations we can\nsolve to evaluate a deterministic policy ","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"key":"U5epMIMgq2"},{"type":"emphasis","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"children":[{"type":"text","value":"exactly","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"key":"DLvlKiiS8p"}],"key":"zsnzFWofBH"},{"type":"text","value":". For a faster approximate solution,\nwe can iterate the policy’s Bellman operator, since we know that it has\na unique fixed point at the true value function.","position":{"start":{"line":1089,"column":1},"end":{"line":1089,"column":1}},"key":"q4w3UHhsjq"}],"key":"vsldZ2D71R"},{"type":"heading","depth":4,"position":{"start":{"line":1094,"column":1},"end":{"line":1094,"column":1}},"children":[{"type":"text","value":"Matrix inversion for deterministic policies","position":{"start":{"line":1094,"column":1},"end":{"line":1094,"column":1}},"key":"DJSun8Jqeh"}],"identifier":"matrix-inversion-for-deterministic-policies","label":"Matrix inversion for deterministic policies","html_id":"matrix-inversion-for-deterministic-policies","implicit":true,"enumerator":"1.5.2.1","key":"sjPKnHWkcH"},{"type":"paragraph","position":{"start":{"line":1096,"column":1},"end":{"line":1098,"column":1}},"children":[{"type":"text","value":"Note that when the policy ","position":{"start":{"line":1096,"column":1},"end":{"line":1096,"column":1}},"key":"swiuQVf86c"},{"type":"text","value":"π","position":{"start":{"line":1096,"column":1},"end":{"line":1096,"column":1}},"key":"BqYMuX9heI"},{"type":"text","value":" is deterministic, the actions can be\ndetermined from the states, and so we can chop off the action dimension\nfor the rewards and state transitions:","position":{"start":{"line":1096,"column":1},"end":{"line":1096,"column":1}},"key":"MOmjQjkRIy"}],"key":"YlwOKi9J6D"},{"type":"math","value":"\\begin{aligned}\n r^{\\pi} &\\in \\mathbb{R}^{|\\mathcal{S}|} & P^{\\pi} &\\in [0, 1]^{|\\mathcal{S}| \\times |\\mathcal{S}|} & \\mu &\\in [0, 1]^{|\\mathcal{S}|} \\\\\n \\pi &\\in \\mathcal{A}^{|\\mathcal{S}|} & V^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}|} & Q^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}.\n\\end{aligned}","position":{"start":{"line":1100,"column":1},"end":{"line":1105,"column":1}},"html":"rπRSPπ[0,1]S×Sμ[0,1]SπASVπRSQπRS×A.\\begin{aligned}\n r^{\\pi} &\\in \\mathbb{R}^{|\\mathcal{S}|} & P^{\\pi} &\\in [0, 1]^{|\\mathcal{S}| \\times |\\mathcal{S}|} & \\mu &\\in [0, 1]^{|\\mathcal{S}|} \\\\\n \\pi &\\in \\mathcal{A}^{|\\mathcal{S}|} & V^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}|} & Q^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}| \\times 
|\\mathcal{A}|}.\n\\end{aligned}rππRSASPπVπ[0,1]S×SRSμQπ[0,1]SRS×A.","enumerator":"1.41","key":"Ca2OdmCcP9"},{"type":"paragraph","position":{"start":{"line":1107,"column":1},"end":{"line":1109,"column":1}},"children":[{"type":"text","value":"For ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"Absqux4CHv"},{"type":"inlineMath","value":"P^\\pi","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"PπP^\\piPπ","key":"dVeVazeE5g"},{"type":"text","value":", we’ll treat the rows as the states and the\ncolumns as the next states. Then ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"dVWX9TNJqb"},{"type":"inlineMath","value":"P^\\pi_{s, s'}","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"Ps,sπP^\\pi_{s, s'}Ps,sπ","key":"HdmgGPNQCJ"},{"type":"text","value":" is the probability of\ntransitioning from state ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"CmqyPP2b11"},{"type":"inlineMath","value":"s","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"sss","key":"WH9VDcL6B9"},{"type":"text","value":" to state ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"gv7eRlhcwT"},{"type":"inlineMath","value":"s'","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"html":"ss's","key":"t6K8PaKqnr"},{"type":"text","value":" under policy ","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"agki5XShXP"},{"type":"text","value":"π","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"xmq5Jd6yKB"},{"type":"text","value":".","position":{"start":{"line":1107,"column":1},"end":{"line":1107,"column":1}},"key":"rRpnzHMl6X"}],"key":"DSwR6bQ9ij"},{"type":"proof","kind":"example","label":"tidy_tabular","identifier":"tidy_tabular","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying MDP","position":{"start":{"line":1111,"column":1},"end":{"line":1111,"column":1}},"key":"sNrvV6HOS1"}],"key":"dcxyvEAQWW"},{"type":"paragraph","position":{"start":{"line":1114,"column":1},"end":{"line":1116,"column":1}},"children":[{"type":"text","value":"The tabular MDP from before has ","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"G9Yz1LpS7o"},{"type":"inlineMath","value":"|\\mathcal{S}| = 2","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"html":"S=2|\\mathcal{S}| = 2S=2","key":"vB0KKYTSIR"},{"type":"text","value":" and ","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"XcgDQMXCfj"},{"type":"inlineMath","value":"|\\mathcal{A}| = 2","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"html":"A=2|\\mathcal{A}| = 2A=2","key":"n9SM2PkNnq"},{"type":"text","value":". 
Let’s write\ndown the quantities for the policy ","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"oL2jqc49ri"},{"type":"text","value":"π","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"ZMHfSv2ory"},{"type":"text","value":" that tidies if and only if the\nroom is messy:","position":{"start":{"line":1114,"column":1},"end":{"line":1114,"column":1}},"key":"dM69szh9LM"}],"key":"DgBqrMBFUc"},{"type":"math","value":"r^{\\pi} = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}, \\quad\n P^{\\pi} = \\begin{bmatrix} 0.7 & 0.3 \\\\ 1 & 0 \\end{bmatrix}, \\quad\n \\mu = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}","position":{"start":{"line":1118,"column":1},"end":{"line":1120,"column":1}},"html":"rπ=[10],Pπ=[0.70.310],μ=[10]r^{\\pi} = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}, \\quad\n P^{\\pi} = \\begin{bmatrix} 0.7 & 0.3 \\\\ 1 & 0 \\end{bmatrix}, \\quad\n \\mu = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}rπ=[10],Pπ=[0.710.30],μ=[10]","enumerator":"1.42","key":"edvPChOwVj"},{"type":"paragraph","position":{"start":{"line":1122,"column":1},"end":{"line":1123,"column":1}},"children":[{"type":"text","value":"We’ll see how to\nevaluate this policy in the next section.","position":{"start":{"line":1122,"column":1},"end":{"line":1122,"column":1}},"key":"HX5ob9hfyn"}],"key":"K8etgcX1PP"}],"enumerator":"1.5","html_id":"tidy-tabular","key":"spdx3Usv1Q"},{"type":"paragraph","position":{"start":{"line":1126,"column":1},"end":{"line":1127,"column":1}},"children":[{"type":"text","value":"The Bellman consistency equation for a deterministic policy can be\nwritten in tabular notation as","position":{"start":{"line":1126,"column":1},"end":{"line":1126,"column":1}},"key":"MqHAIJikXT"}],"key":"QJr4NERMYi"},{"type":"math","value":"V^\\pi = r^\\pi + \\gamma P^\\pi V^\\pi.","position":{"start":{"line":1129,"column":1},"end":{"line":1129,"column":1}},"html":"Vπ=rπ+γPπVπ.V^\\pi = r^\\pi + \\gamma P^\\pi V^\\pi.Vπ=rπ+γPπVπ.","enumerator":"1.43","key":"Ybdt5pbBBS"},{"type":"paragraph","position":{"start":{"line":1131,"column":1},"end":{"line":1133,"column":1}},"children":[{"type":"text","value":"(Unfortunately, this notation doesn’t simplify the expression for\n","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"key":"sLvUTWfLqP"},{"type":"inlineMath","value":"Q^\\pi","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"html":"QπQ^\\piQπ","key":"ut7IDV9rRA"},{"type":"text","value":".) 
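Filling in the intermediate algebra (added here for clarity): collecting the $V^\pi$ terms on the left-hand side gives

$$(I - \gamma P^\pi) V^\pi = r^\pi,$$

so whenever $I - \gamma P^\pi$ is invertible, $V^\pi$ is determined uniquely.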
This system of equations can be solved with a matrix\ninversion:","position":{"start":{"line":1131,"column":1},"end":{"line":1131,"column":1}},"key":"KGvESfCYOh"}],"key":"btzjhrwpnt"},{"type":"math","value":"V^\\pi = (I - \\gamma P^\\pi)^{-1} r^\\pi.","label":"matrix_inversion_pe","identifier":"matrix_inversion_pe","html":"Vπ=(IγPπ)1rπ.V^\\pi = (I - \\gamma P^\\pi)^{-1} r^\\pi.Vπ=(IγPπ)1rπ.","enumerator":"1.44","html_id":"matrix-inversion-pe","key":"kHgydkSCa3"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"TuNjG66mqA"}],"key":"X3p81cg1AP"},{"type":"paragraph","position":{"start":{"line":1142,"column":1},"end":{"line":1143,"column":1}},"children":[{"type":"text","value":"Note we’ve assumed that ","position":{"start":{"line":1142,"column":1},"end":{"line":1142,"column":1}},"key":"DJBMfqIasH"},{"type":"inlineMath","value":"I - \\gamma P^\\pi","position":{"start":{"line":1142,"column":1},"end":{"line":1142,"column":1}},"html":"IγPπI - \\gamma P^\\piIγPπ","key":"IffmcI2Aog"},{"type":"text","value":" is invertible. Can you see\nwhy this is the case?","position":{"start":{"line":1142,"column":1},"end":{"line":1142,"column":1}},"key":"SoJSaDvXbu"}],"key":"exKN1n6YfW"},{"type":"paragraph","position":{"start":{"line":1145,"column":1},"end":{"line":1149,"column":1}},"children":[{"type":"text","value":"(Recall that a linear operator, i.e. a square matrix, is invertible if\nand only if its null space is trivial; that is, it doesn’t map any\nnonzero vector to zero. In this case, we can see that ","position":{"start":{"line":1145,"column":1},"end":{"line":1145,"column":1}},"key":"jRSOrbGkkB"},{"type":"inlineMath","value":"I - \\gamma P^\\pi","position":{"start":{"line":1145,"column":1},"end":{"line":1145,"column":1}},"html":"IγPπI - \\gamma P^\\piIγPπ","key":"NaocssouhV"},{"type":"text","value":"\nis invertible because it maps any nonzero vector to a vector with at\nleast one nonzero element.)","position":{"start":{"line":1145,"column":1},"end":{"line":1145,"column":1}},"key":"YB4aIhWQhv"}],"key":"jS4Dt6Kw9e"}],"key":"mHx0OwiBV0"}],"key":"Gfea5Cn6k3"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def eval_deterministic_infinite(\n mdp: MDP, policy: Float[Array, \"S A\"]\n) -> Float[Array, \" S\"]:\n pi = jnp.argmax(policy, axis=1) # un-one-hot\n P_π = mdp.P[jnp.arange(mdp.S), pi]\n r_π = mdp.r[jnp.arange(mdp.S), pi]\n return jnp.linalg.solve(jnp.eye(mdp.S) - mdp.γ * P_π, r_π)","key":"WucxcN5eZ7"},{"type":"output","id":"_Pm4B6gAf8H-qqMWgx5Ib","data":[],"key":"djjD72Fxhx"}],"data":{},"key":"fKQQoqLRc0"},{"type":"block","children":[{"type":"proof","kind":"example","label":"tidy_eval_infinite","identifier":"tidy_eval_infinite","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tidying policy evaluation","position":{"start":{"line":1162,"column":1},"end":{"line":1162,"column":1}},"key":"QFNtFQqmkH"}],"key":"qnGd8Wz7sQ"},{"type":"paragraph","position":{"start":{"line":1165,"column":1},"end":{"line":1166,"column":1}},"children":[{"type":"text","value":"Let’s use the same policy ","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"ICx6fLWTFA"},{"type":"text","value":"π","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"KiLckzXb0I"},{"type":"text","value":" that tidies if and only if the room is\nmessy. 
Setting ","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"HK5hrGCV0I"},{"type":"inlineMath","value":"\\gamma = 0.95","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"html":"γ=0.95\\gamma = 0.95γ=0.95","key":"uAnmJANWQ5"},{"type":"text","value":", we must invert","position":{"start":{"line":1165,"column":1},"end":{"line":1165,"column":1}},"key":"WmdNPSkSRY"}],"key":"SL8PxxQEed"},{"type":"math","value":"I - \\gamma P^{\\pi} = \\begin{bmatrix} 1 - 0.95 \\times 0.7 & - 0.95 \\times 0.3 \\\\ - 0.95 \\times 1 & 1 - 0.95 \\times 0 \\end{bmatrix} = \\begin{bmatrix} 0.335 & -0.285 \\\\ -0.95 & 1 \\end{bmatrix}.","position":{"start":{"line":1168,"column":1},"end":{"line":1168,"column":1}},"html":"IγPπ=[10.95×0.70.95×0.30.95×110.95×0]=[0.3350.2850.951].I - \\gamma P^{\\pi} = \\begin{bmatrix} 1 - 0.95 \\times 0.7 & - 0.95 \\times 0.3 \\\\ - 0.95 \\times 1 & 1 - 0.95 \\times 0 \\end{bmatrix} = \\begin{bmatrix} 0.335 & -0.285 \\\\ -0.95 & 1 \\end{bmatrix}.IγPπ=[10.95×0.70.95×10.95×0.310.95×0]=[0.3350.950.2851].","enumerator":"1.45","key":"myhdQjsD5O"},{"type":"paragraph","position":{"start":{"line":1170,"column":1},"end":{"line":1170,"column":1}},"children":[{"type":"text","value":"The inverse to two decimal points is","position":{"start":{"line":1170,"column":1},"end":{"line":1170,"column":1}},"key":"KertewdHPL"}],"key":"ndhIXeXbEk"},{"type":"math","value":"(I - \\gamma P^{\\pi})^{-1} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix}.","position":{"start":{"line":1172,"column":1},"end":{"line":1172,"column":1}},"html":"(IγPπ)1=[15.564.4414.795.21].(I - \\gamma P^{\\pi})^{-1} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix}.(IγPπ)1=[15.5614.794.445.21].","enumerator":"1.46","key":"F7NBugf3e2"},{"type":"paragraph","position":{"start":{"line":1174,"column":1},"end":{"line":1174,"column":1}},"children":[{"type":"text","value":"Thus the value function is","position":{"start":{"line":1174,"column":1},"end":{"line":1174,"column":1}},"key":"vcjqe3cPCk"}],"key":"Q1vXHN9ZfE"},{"type":"math","value":"V^{\\pi} = (I - \\gamma P^{\\pi})^{-1} r^{\\pi} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix} \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix} = \\begin{bmatrix} 15.56 \\\\ 14.79 \\end{bmatrix}.","position":{"start":{"line":1176,"column":1},"end":{"line":1176,"column":1}},"html":"Vπ=(IγPπ)1rπ=[15.564.4414.795.21][10]=[15.5614.79].V^{\\pi} = (I - \\gamma P^{\\pi})^{-1} r^{\\pi} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix} \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix} = \\begin{bmatrix} 15.56 \\\\ 14.79 \\end{bmatrix}.Vπ=(IγPπ)1rπ=[15.5614.794.445.21][10]=[15.5614.79].","enumerator":"1.47","key":"KlFFa42L1W"},{"type":"paragraph","position":{"start":{"line":1178,"column":1},"end":{"line":1181,"column":1}},"children":[{"type":"text","value":"Let’s sanity-check this result. 
Since rewards are at most ","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"TNEObtn1xz"},{"type":"text","value":"1","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"ZS0V4XKc4W"},{"type":"text","value":", the\nmaximum cumulative return of a trajectory is at most\n","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"NAnEfVR79e"},{"type":"inlineMath","value":"1/(1-\\gamma) = 20","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"html":"1/(1γ)=201/(1-\\gamma) = 201/(1γ)=20","key":"Q4BjCCRwxs"},{"type":"text","value":". We see that the value function is indeed slightly\nlower than this.","position":{"start":{"line":1178,"column":1},"end":{"line":1178,"column":1}},"key":"XpesjS69nT"}],"key":"xckh7VAR6h"}],"enumerator":"1.6","html_id":"tidy-eval-infinite","key":"hdKnJDkgCV"}],"key":"n3992dGzBH"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"eval_deterministic_infinite(tidy_mdp_inf, tidy_policy_messy_only[0])","key":"QdNlcxiaWL"},{"type":"output","id":"LxBJnmA3QFog0Sq0jIAP6","data":[{"output_type":"execute_result","execution_count":19,"metadata":{},"data":{"text/plain":{"content":"Array([15.56419, 14.78598], dtype=float32)","content_type":"text/plain"}}}],"key":"Hi7SdZJVzg"}],"data":{},"key":"NvbEAzrN14"},{"type":"block","children":[{"type":"heading","depth":4,"position":{"start":{"line":1189,"column":1},"end":{"line":1189,"column":1}},"children":[{"type":"text","value":"Iterative policy evaluation","position":{"start":{"line":1189,"column":1},"end":{"line":1189,"column":1}},"key":"RENMoUf1Hk"}],"label":"iterative_pe","identifier":"iterative_pe","html_id":"iterative-pe","enumerator":"1.5.2.2","key":"BjozjFY7ZR"},{"type":"paragraph","position":{"start":{"line":1191,"column":1},"end":{"line":1194,"column":1}},"children":[{"type":"text","value":"The matrix inversion above takes roughly ","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"v3xBQ0hnSI"},{"type":"inlineMath","value":"O(|\\mathcal{S}|^3)","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"html":"O(S3)O(|\\mathcal{S}|^3)O(S3)","key":"fEOfaFeBDD"},{"type":"text","value":" time.\nIt also only works for deterministic policies.\nCan we trade off the requirement of finding the ","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"rhfufCsLSc"},{"type":"emphasis","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"children":[{"type":"text","value":"exact","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"RozUTNcLC1"}],"key":"EY8f2RPUZm"},{"type":"text","value":" value function for a faster\n","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"O9e5ihrAu2"},{"type":"emphasis","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"D9l8kq8nBg"}],"key":"ktIQb3l5N3"},{"type":"text","value":" algorithm that will also extend to stochastic policies?","position":{"start":{"line":1191,"column":1},"end":{"line":1191,"column":1}},"key":"iWn6odD2SS"}],"key":"NewbPNKA57"},{"type":"paragraph","position":{"start":{"line":1196,"column":1},"end":{"line":1199,"column":1}},"children":[{"type":"text","value":"Let’s use the Bellman operator to 
define an iterative algorithm for\ncomputing the value function. We’ll start with an initial guess\n","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"key":"aopLsN888U"},{"type":"inlineMath","value":"v^{(0)}","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"html":"v(0)v^{(0)}v(0)","key":"e8zoRYYZeN"},{"type":"text","value":" with elements in ","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"key":"BGVQVt6I4v"},{"type":"inlineMath","value":"[0, 1/(1-\\gamma)]","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"html":"[0,1/(1γ)][0, 1/(1-\\gamma)][0,1/(1γ)]","key":"WCVtFm330O"},{"type":"text","value":" and then iterate the\nBellman operator:","position":{"start":{"line":1196,"column":1},"end":{"line":1196,"column":1}},"key":"DETwzActoY"}],"key":"uKxamjsgXx"},{"type":"math","value":"v^{(t+1)} = \\mathcal{J}^{\\pi}(v^{(t)}),","position":{"start":{"line":1201,"column":1},"end":{"line":1201,"column":1}},"html":"v(t+1)=Jπ(v(t)),v^{(t+1)} = \\mathcal{J}^{\\pi}(v^{(t)}),v(t+1)=Jπ(v(t)),","enumerator":"1.48","key":"o0HeLeCGbf"},{"type":"paragraph","position":{"start":{"line":1203,"column":1},"end":{"line":1204,"column":1}},"children":[{"type":"text","value":"i.e. ","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"key":"AB3oDzNqTE"},{"type":"inlineMath","value":"v^{(t)} = (\\mathcal{J}^{\\pi})^{(t)} (v^{(0)})","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"html":"v(t)=(Jπ)(t)(v(0))v^{(t)} = (\\mathcal{J}^{\\pi})^{(t)} (v^{(0)})v(t)=(Jπ)(t)(v(0))","key":"BYiNwdp4o4"},{"type":"text","value":". Note that each iteration\ntakes ","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"key":"FhZj8D7Cor"},{"type":"inlineMath","value":"O(|\\mathcal{S}|^2)","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"html":"O(S2)O(|\\mathcal{S}|^2)O(S2)","key":"VVoS9kylPK"},{"type":"text","value":" time for the matrix-vector multiplication.","position":{"start":{"line":1203,"column":1},"end":{"line":1203,"column":1}},"key":"P998aPetQ5"}],"key":"K3DBJcHr6N"}],"key":"XWyLgYIE8D"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def supremum_norm(v):\n return jnp.max(jnp.abs(v)) # same as jnp.linalg.norm(v, jnp.inf)\n\n\ndef loop_until_convergence(op, v, ε=1e-6):\n \"\"\"Repeatedly apply op to v until convergence (in supremum norm).\"\"\"\n while True:\n v_new = op(v)\n if supremum_norm(v_new - v) < ε:\n return v_new\n v = v_new\n\n\ndef iterative_evaluation(mdp: MDP, pi: Float[Array, \"S A\"], ε=1e-6) -> Float[Array, \" S\"]:\n op = partial(bellman_operator, mdp, pi)\n return loop_until_convergence(op, jnp.zeros(mdp.S), ε)","key":"uOq5vNQEw3"},{"type":"output","id":"DBLZGOpIZxQ9WeFTb7WO7","data":[],"key":"k3ICfR0bnB"}],"data":{},"key":"A1Bz9qOeN9"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":1225,"column":1},"end":{"line":1225,"column":1}},"children":[{"type":"text","value":"Then, as we showed in 
","position":{"start":{"line":1225,"column":1},"end":{"line":1225,"column":1}},"key":"fDkszYxyB2"},{"type":"crossReference","kind":"equation","identifier":"bellman_convergence","label":"bellman_convergence","children":[{"type":"text","value":"(","key":"D4bnOACTep"},{"type":"text","value":"1.38","key":"KgYjjXyH1A"},{"type":"text","value":")","key":"VHFwVqTiS9"}],"template":"(%s)","enumerator":"1.38","resolved":true,"html_id":"bellman-convergence","key":"C9G5uraL1O"},{"type":"text","value":", by the Banach fixed-point theorem:","position":{"start":{"line":1225,"column":1},"end":{"line":1225,"column":1}},"key":"pypMIXMYvn"}],"key":"VhvCs5HkH7"},{"type":"math","value":"\\|v^{(t)} - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v^{(0)} - V^\\pi\\|_{\\infty}.","position":{"start":{"line":1227,"column":1},"end":{"line":1227,"column":1}},"html":"v(t)Vπγtv(0)Vπ.\\|v^{(t)} - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v^{(0)} - V^\\pi\\|_{\\infty}.v(t)Vπγtv(0)Vπ.","enumerator":"1.49","key":"gmw7TtzOJ0"}],"key":"nlkTdP1xn5"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"iterative_evaluation(tidy_mdp_inf, tidy_policy_messy_only[0])","key":"sAZW42Goex"},{"type":"output","id":"U66NCDPEsUyFbkeiBOjlE","data":[{"output_type":"execute_result","execution_count":21,"metadata":{},"data":{"text/plain":{"content":"Array([15.564166, 14.785956], dtype=float32)","content_type":"text/plain"}}}],"key":"e3plKZ5pgA"}],"data":{},"key":"HQx5FPFbDr"},{"type":"block","children":[{"type":"proof","kind":"remark","label":"iterations_vi","identifier":"iterations_vi","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Convergence of iterative policy evaluation","position":{"start":{"line":1233,"column":1},"end":{"line":1233,"column":1}},"key":"bcKl43Kr6A"}],"key":"HL23IbxPxo"},{"type":"paragraph","position":{"start":{"line":1236,"column":1},"end":{"line":1237,"column":1}},"children":[{"type":"text","value":"How many iterations do we need for an ","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"key":"te3oQYpuA0"},{"type":"text","value":"ε","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"key":"TT7g6oLd9l"},{"type":"text","value":"-accurate estimate? 
We\ncan work backwards to solve for ","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"key":"LPlafKAn4A"},{"type":"inlineMath","value":"t","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"html":"ttt","key":"ieQGSWq3Sl"},{"type":"text","value":":","position":{"start":{"line":1236,"column":1},"end":{"line":1236,"column":1}},"key":"Mfgh0j3ZFt"}],"key":"odTaDo9ytA"},{"type":"math","value":"\\begin{aligned}\n \\gamma^t \\|v^{(0)} - V^\\pi\\|_{\\infty} &\\le \\epsilon \\\\\n t &\\ge \\frac{\\log (\\epsilon / \\|v^{(0)} - V^\\pi\\|_{\\infty})}{\\log \\gamma} \\\\\n &= \\frac{\\log (\\|v^{(0)} - V^\\pi\\|_{\\infty} / \\epsilon)}{\\log (1 / \\gamma)},\n\\end{aligned}","position":{"start":{"line":1239,"column":1},"end":{"line":1245,"column":1}},"html":"γtv(0)Vπϵtlog(ϵ/v(0)Vπ)logγ=log(v(0)Vπ/ϵ)log(1/γ),\\begin{aligned}\n \\gamma^t \\|v^{(0)} - V^\\pi\\|_{\\infty} &\\le \\epsilon \\\\\n t &\\ge \\frac{\\log (\\epsilon / \\|v^{(0)} - V^\\pi\\|_{\\infty})}{\\log \\gamma} \\\\\n &= \\frac{\\log (\\|v^{(0)} - V^\\pi\\|_{\\infty} / \\epsilon)}{\\log (1 / \\gamma)},\n\\end{aligned}γtv(0)Vπtϵlogγlog(ϵ/∥v(0)Vπ)=log(1/γ)log(v(0)Vπ/ϵ),","enumerator":"1.50","key":"DvTonrdfq0"},{"type":"paragraph","position":{"start":{"line":1247,"column":1},"end":{"line":1248,"column":1}},"children":[{"type":"text","value":"and so the number of iterations required for an\n","position":{"start":{"line":1247,"column":1},"end":{"line":1247,"column":1}},"key":"r8xCx38m9o"},{"type":"text","value":"ε","position":{"start":{"line":1247,"column":1},"end":{"line":1247,"column":1}},"key":"uN5C4yMD2M"},{"type":"text","value":"-accurate estimate is","position":{"start":{"line":1247,"column":1},"end":{"line":1247,"column":1}},"key":"oluXGI5Jej"}],"key":"PkjnjITE4X"},{"type":"math","value":"T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).","position":{"start":{"line":1250,"column":1},"end":{"line":1252,"column":1}},"html":"T=O(11γlog(1ϵ(1γ))).T = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).T=O(1γ1log(ϵ(1γ)1)).","enumerator":"1.51","key":"ZEa19jL1Ks"},{"type":"paragraph","position":{"start":{"line":1254,"column":1},"end":{"line":1256,"column":1}},"children":[{"type":"text","value":"Note that we’ve applied the inequalities\n","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"key":"CMC30Picfy"},{"type":"inlineMath","value":"\\|v^{(0)} - V^\\pi\\|_{\\infty} \\le 1/(1-\\gamma)","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"html":"v(0)Vπ1/(1γ)\\|v^{(0)} - V^\\pi\\|_{\\infty} \\le 1/(1-\\gamma)v(0)Vπ1/(1γ)","key":"KEQ8oM225u"},{"type":"text","value":" and\n","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"key":"lV9AtP1pEo"},{"type":"inlineMath","value":"\\log (1/x) \\ge 1-x","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"html":"log(1/x)1x\\log (1/x) \\ge 1-xlog(1/x)1x","key":"sUp77AkDQk"},{"type":"text","value":".","position":{"start":{"line":1254,"column":1},"end":{"line":1254,"column":1}},"key":"VTAOgs5lgi"}],"key":"JUw4vNUgoC"}],"enumerator":"1.2","html_id":"iterations-vi","key":"CnuU5u2b5h"},{"type":"heading","depth":3,"position":{"start":{"line":1259,"column":1},"end":{"line":1259,"column":1}},"children":[{"type":"text","value":"Optimal policies in infinite-horizon 
### Optimal policies in infinite-horizon MDPs

Now let's move on to solving for an optimal policy in the infinite-horizon case. As in the finite-horizon case (Definition 1.10), an **optimal policy** $\pi^\star$ is one that does at least as well as any other policy in all situations. That is, for all policies $\pi$, states $s \in \mathcal{S}$, times $h \in \mathbb{N}$, and initial trajectories $\tau_h = (s_0, a_0, r_0, \dots, s_h)$ where $s_h = s$,

$$
\begin{aligned}
V^{\pi^\star}(s) &= \mathbb{E}_{\tau \sim \rho^{\pi^{\star}}}[r_h + \gamma r_{h+1} + \gamma^2 r_{h+2} + \cdots \mid s_h = s] \\
&\ge \mathbb{E}_{\tau \sim \rho^{\pi}}[r_h + \gamma r_{h+1} + \gamma^2 r_{h+2} + \cdots \mid \tau_h].
\end{aligned}
$$

Once again, all optimal policies share the same **optimal value function** $V^\star$, and the greedy policy with respect to this value function is optimal.

**Attention:** Verify this by modifying the proof of Theorem 1.3 from the finite-horizon case.

So how can we compute such an optimal policy? We can't use the backwards DP approach from the finite-horizon case (Definition 1.11) since there's no "final timestep" to start from. Instead, we'll exploit the fact that the Bellman consistency equation (1.32) for the optimal value function doesn't depend on any policy:

$$
V^\star(s) = \max_a \left[ r(s, a) + \gamma \mathbb{E}_{s' \sim P(s, a)} V^\star(s') \right]. \tag{1.53}
$$

**Attention:** Verify this by substituting the greedy policy into the Bellman consistency equation.

As before, thinking of the r.h.s. of (1.53) as an operator on value functions gives the **Bellman optimality operator**

$$
[\mathcal{J}^{\star}(v)](s) = \max_a \left[ r(s, a) + \gamma \mathbb{E}_{s' \sim P(s, a)} v(s') \right].
$$

```python
def bellman_optimality_operator(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, " S"]:
    return jnp.max(mdp.r + mdp.γ * mdp.P @ v, axis=1)


def check_optimal(v: Float[Array, " S"], mdp: MDP):
    # v is optimal iff it is a fixed point of the Bellman optimality operator
    return jnp.allclose(v, bellman_optimality_operator(mdp, v))
```
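As a quick numerical sanity check, we can verify on the running example that $\mathcal{J}^\star$ is a $\gamma$-contraction in the sup-norm, which is the property the next subsection relies on. This is a sketch, assuming the example MDP `tidy_mdp_inf` defined earlier in the chapter and the `bellman_optimality_operator` above.

```python
import jax
import jax.numpy as jnp

# two arbitrary value-function guesses for the example MDP
key1, key2 = jax.random.split(jax.random.PRNGKey(0))
u = jax.random.normal(key1, (tidy_mdp_inf.S,))
v = jax.random.normal(key2, (tidy_mdp_inf.S,))

# ‖J*(u) − J*(v)‖∞ should be at most γ ‖u − v‖∞
lhs = jnp.max(jnp.abs(
    bellman_optimality_operator(tidy_mdp_inf, u)
    - bellman_optimality_operator(tidy_mdp_inf, v)
))
rhs = tidy_mdp_inf.γ * jnp.max(jnp.abs(u - v))
assert lhs <= rhs + 1e-6
```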
#### Value iteration

Since the optimal policy is still a policy, our result that the Bellman operator is a contracting map still holds, and so we can repeatedly apply this operator to converge to the optimal value function! This algorithm is known as **value iteration**.

```python
def value_iteration(mdp: MDP, ε: float = 1e-6) -> Float[Array, " S"]:
    """Iterate the Bellman optimality operator until convergence."""
    op = partial(bellman_optimality_operator, mdp)
    return loop_until_convergence(op, jnp.zeros(mdp.S), ε)
```

```python
value_iteration(tidy_mdp_inf)
```

```
Array([15.564166, 14.785956], dtype=float32)
```
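The fixed point we converge to should satisfy the Bellman optimality equation, so a natural sanity check is to feed the result back through `check_optimal` from above. This is a small usage sketch, not a step of the algorithm itself:

```python
v_star = value_iteration(tidy_mdp_inf)
# v_star should be (numerically) a fixed point of the Bellman optimality operator;
# this should print True once the iteration has converged tightly enough
print(check_optimal(v_star, tidy_mdp_inf))
```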
Note that the runtime analysis for an ε-optimal value function is exactly the same as iterative policy evaluation (Section 1.5.2.2)! This is because value iteration is simply the special case of applying iterative policy evaluation to the *optimal* value function.

As the final step of the algorithm, to return an actual policy $\hat \pi$, we can simply act greedily with respect to the final iteration $v^{(T)}$ of our above algorithm:

$$
\hat \pi(s) = \arg\max_a \left[ r(s, a) + \gamma \mathbb{E}_{s' \sim P(s, a)} v^{(T)}(s') \right].
$$

We must be careful, though: the value function of this greedy policy, $V^{\hat \pi}$, is *not* the same as $v^{(T)}$, which need not even be a well-defined value function for some policy!
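In code, this greedy extraction step might look like the following sketch. The chapter's own helper for this is `v_to_greedy` (used by policy iteration below); the hypothetical name `greedy_policy_from_value` and the one-hot output format here are just for illustration, assuming the chapter's `MDP` structure and JAX imports.

```python
def greedy_policy_from_value(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, "S A"]:
    """Act greedily with respect to a value-function estimate v."""
    q = mdp.r + mdp.γ * mdp.P @ v          # q[s, a] = r(s, a) + γ E_{s'} v(s')
    greedy_actions = jnp.argmax(q, axis=1)  # best action in each state
    return jax.nn.one_hot(greedy_actions, mdp.A)  # one-hot (S, A) policy
```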
The bound on the policy's quality is actually quite loose: if $\|v^{(T)} - V^\star\|_{\infty} \le \epsilon$, then the greedy policy $\hat \pi$ satisfies $\|V^{\hat \pi} - V^\star\|_{\infty} \le \frac{2\gamma}{1-\gamma} \epsilon$, which might potentially be very large.

**Theorem 1.5 (Greedy policy value worsening).**

$$
\|V^{\hat \pi} - V^\star \|_{\infty} \le \frac{2 \gamma}{1-\gamma} \|v - V^\star\|_{\infty},
$$

where $\hat \pi(s) = \arg\max_a q(s, a)$ is the greedy policy with respect to

$$
q(s, a) = r(s, a) + \gamma \mathbb{E}_{s' \sim P(s, a)} v(s').
$$

**Proof.** We first have

$$
\begin{aligned}
V^{\star}(s) - V^{\hat \pi}(s) &= Q^{\star}(s,\pi^\star(s)) - Q^{\hat \pi}(s, \hat \pi(s))\\
&= [Q^{\star}(s,\pi^\star(s)) - Q^{\star}(s, \hat \pi(s))] + [Q^{\star}(s, \hat \pi(s)) - Q^{\hat \pi}(s, \hat \pi(s))].
\end{aligned}
$$

Let's bound these two quantities separately.

For the first quantity, note that by the definition of $\hat \pi$, we have

$$
q(s, \hat \pi(s)) \ge q(s,\pi^\star(s)).
$$

Let's add $q(s, \hat \pi(s)) - q(s,\pi^\star(s)) \ge 0$ to the first term to get

$$
\begin{aligned}
Q^{\star}(s,\pi^\star(s)) - Q^{\star}(s, \hat \pi(s)) &\le [Q^{\star}(s,\pi^\star(s))- q(s,\pi^\star(s))] + [q(s, \hat \pi(s)) - Q^{\star}(s, \hat \pi(s))] \\
&= \gamma \mathbb{E}_{s' \sim P(s, \pi^{\star}(s))} [ V^{\star}(s') - v(s') ] + \gamma \mathbb{E}_{s' \sim P(s, \hat \pi(s))} [ v(s') - V^{\star}(s') ] \\
&\le 2 \gamma \|v - V^{\star}\|_{\infty}.
\end{aligned}
$$

The second quantity is bounded by

$$
\begin{aligned}
Q^{\star}(s, \hat \pi(s)) - Q^{\hat \pi}(s, \hat \pi(s))
&= \gamma \mathbb{E}_{s'\sim P(s, \hat \pi(s))}\left[ V^\star(s') - V^{\hat \pi}(s') \right] \\
&\leq \gamma \|V^{\star} - V^{\hat \pi}\|_\infty,
\end{aligned}
$$

and thus

$$
\begin{aligned}
\|V^\star - V^{\hat \pi}\|_\infty &\le 2 \gamma \|v - V^{\star}\|_{\infty} + \gamma \|V^{\star} - V^{\hat \pi}\|_\infty \\
\|V^\star - V^{\hat \pi}\|_\infty &\le \frac{2 \gamma \|v - V^{\star}\|_{\infty}}{1-\gamma}.
\end{aligned}
$$

So in order to compensate and achieve $\|V^{\hat \pi} - V^{\star}\| \le \epsilon$, we must have

$$
\|v^{(T)} - V^\star\|_{\infty} \le \frac{1-\gamma}{2 \gamma} \epsilon.
$$

This means, using Remark 1.2, we need to run value iteration for

$$
T = O\left( \frac{1}{1-\gamma} \log\left(\frac{\gamma}{\epsilon (1-\gamma)^2}\right) \right)
$$

iterations to achieve an ε-accurate estimate of the optimal value function.

#### Policy iteration
Can we mitigate this "greedy worsening"? What if instead of approximating the optimal value function and then acting greedily by it at the very end, we iteratively improve the policy and value function *together*? This is the idea behind **policy iteration**. In each step, we simply set the policy to act greedily with respect to its own value function.

```python
def policy_iteration(mdp: MDP, ε=1e-6) -> Float[Array, "S A"]:
    """Iteratively improve the policy and value function."""
    def op(pi):
        return v_to_greedy(mdp, eval_deterministic_infinite(mdp, pi))
    π_init = jnp.ones((mdp.S, mdp.A)) / mdp.A  # uniform random policy
    return loop_until_convergence(op, π_init, ε)
```

```python
policy_iteration(tidy_mdp_inf)
```

```
Array([[1., 0.],
       [0., 1.]], dtype=float32)
```
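As a quick check, we can confirm that the value function of the policy returned by policy iteration satisfies the Bellman optimality equation. This is a usage sketch that reuses `eval_deterministic_infinite` and `check_optimal` from earlier in the chapter:

```python
π_pi = policy_iteration(tidy_mdp_inf)
v_pi = eval_deterministic_infinite(tidy_mdp_inf, π_pi)
# should print True once policy iteration has converged
print(check_optimal(v_pi, tidy_mdp_inf))
```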
Although PI appears more complex than VI, we'll use the same contraction property (Theorem 1.4) to show convergence. This will give us the same runtime bound as value iteration and iterative policy evaluation for an ε-optimal value function (Remark 1.2), although in practice, PI often converges much faster.

**Theorem 1.6 (Policy Iteration runtime and convergence).** We aim to show that the number of iterations required for an ε-accurate estimate of the optimal value function is

$$
T = O\left( \frac{1}{1-\gamma} \log\left(\frac{1}{\epsilon (1-\gamma)}\right) \right).
$$

This bound follows from the contraction property (1.38):

$$
\|V^{\pi^{t+1}} - V^\star \|_{\infty} \le \gamma \|V^{\pi^{t}} - V^\star \|_{\infty}.
$$

We'll prove that the iterates of PI respect the contraction property by showing that the policies improve monotonically:

$$
V^{\pi^{t+1}}(s) \ge V^{\pi^{t}}(s).
$$

Then we'll use this to show $V^{\pi^{t+1}}(s) \ge [\mathcal{J}^{\star}(V^{\pi^{t}})](s)$. Note that

$$
\begin{aligned}
[\mathcal{J}^{\star}(V^{\pi^{t}})](s) &= \max_a \left[ r(s, a) + \gamma \mathbb{E}_{s' \sim P(s, a)} V^{\pi^{t}}(s') \right] \\
&= r(s, \pi^{t+1}(s)) + \gamma \mathbb{E}_{s' \sim P(s, \pi^{t+1}(s))} V^{\pi^{t}}(s'),
\end{aligned}
$$

since $\pi^{t+1}$ is exactly the greedy policy with respect to $V^{\pi^{t}}$. Since $[\mathcal{J}^{\star}(V^{\pi^{t}})](s) \ge V^{\pi^{t}}(s)$, we then have

$$
\begin{aligned}
V^{\pi^{t+1}}(s) - V^{\pi^{t}}(s) &\ge V^{\pi^{t+1}}(s) - [\mathcal{J}^{\star} (V^{\pi^{t}})](s) \\
&= \gamma \mathbb{E}_{s' \sim P(s, \pi^{t+1}(s))} \left[V^{\pi^{t+1}}(s') - V^{\pi^{t}}(s') \right]. \tag{1.69}
\end{aligned}
$$

But note that the expression being averaged is the same as the expression on the l.h.s. with $s$ replaced by $s'$. So we can apply the same inequality recursively to get

$$
\begin{aligned}
V^{\pi^{t+1}}(s) - V^{\pi^{t}}(s) &\ge \gamma \mathbb{E}_{s' \sim P(s, \pi^{t+1}(s))} \left[V^{\pi^{t+1}}(s') - V^{\pi^{t}}(s') \right] \\
&\ge \gamma^2 \mathbb{E}_{\substack{s' \sim P(s, \pi^{t+1}(s)) \\ s'' \sim P(s', \pi^{t+1}(s'))}} \left[V^{\pi^{t+1}}(s'') - V^{\pi^{t}}(s'') \right]\\
&\ge \cdots
\end{aligned}
$$

which implies that $V^{\pi^{t+1}}(s) \ge V^{\pi^{t}}(s)$ for all $s$ (since the r.h.s. converges to zero). We can then plug this back into (1.69) to get the desired result:

$$
\begin{aligned}
V^{\pi^{t+1}}(s) - [\mathcal{J}^{\star} (V^{\pi^{t}})](s) &= \gamma \mathbb{E}_{s' \sim P(s, \pi^{t+1}(s))} \left[V^{\pi^{t+1}}(s') - V^{\pi^{t}}(s') \right] \ge 0, \\
V^{\pi^{t+1}}(s) &\ge [\mathcal{J}^{\star}(V^{\pi^{t}})](s).
\end{aligned}
$$

This means we can now apply the Bellman convergence result (1.38) to get

$$
\|V^{\pi^{t+1}} - V^\star \|_{\infty} \le \|\mathcal{J}^{\star} (V^{\pi^{t}}) - V^{\star}\|_{\infty} \le \gamma \|V^{\pi^{t}} - V^\star \|_{\infty}.
$$

## Summary
- Markov decision processes (MDPs) are a framework for sequential decision making under uncertainty. They consist of a state space $\mathcal{S}$, an action space $\mathcal{A}$, an initial state distribution $\mu \in \Delta(\mathcal{S})$, a transition function $P(s' \mid s, a)$, and a reward function $r(s, a)$. They can be finite-horizon (ends after $H$ timesteps) or infinite-horizon (where rewards scale by $\gamma \in (0, 1)$ at each timestep).
- Our goal is to find a policy $\pi$ that maximizes expected total reward. Policies can be **deterministic** or **stochastic**, **state-dependent** or **history-dependent**, **stationary** or **time-dependent**.
- A policy induces a distribution over **trajectories**.
- We can evaluate a policy by computing its **value function** $V^\pi(s)$, which is the expected total reward starting from state $s$ and following policy $\pi$. We can also compute the **state-action value function** $Q^\pi(s, a)$, which is the expected total reward starting from state $s$, taking action $a$, and then following policy $\pi$. In the finite-horizon setting, these also depend on the timestep $h$.
- The **Bellman consistency equation** is an equation that the value function must satisfy. It can be used to solve for the value functions exactly. Thinking of the r.h.s. of this equation as an operator on value functions gives the **Bellman operator**.
- In the finite-horizon setting, we can compute the optimal policy using **dynamic programming**.
- In the infinite-horizon setting, we can compute the optimal policy using **value iteration** or **policy iteration**.
Background"},"content":"","type":"content","url":"/background","position":1},{"hierarchy":{"lvl1":"Appendix: Background","lvl2":"O notation"},"type":"lvl2","url":"/background#o-notation","position":2},{"hierarchy":{"lvl1":"Appendix: Background","lvl2":"O notation"},"content":"Throughout this chapter and the rest of the book, we will describe the\nasymptotic behavior of a function using O notation.\n\nFor two functions f(t) and g(t), we say that f(t) \\le O(g(t)) if\nf is asymptotically upper bounded by g. Formally, this means that\nthere exists some constant C > 0 such that f(t) \\le C \\cdot g(t) for\nall t past some point t_0.\n\nWe say f(t) < o(g(t)) if asymptotically f grows strictly slower than\ng. Formally, this means that for any scalar C > 0, there exists\nsome t_0 such that f(t) \\le C \\cdot g(t) for all t > t_0.\nEquivalently, we say f(t) < o(g(t)) if\n\\lim_{t \\to \\infty} f(t)/g(t) = 0.\n\nf(t) = \\Theta(g(t)) means that f and g grow at the same rate\nasymptotically. That is, f(t) \\le O(g(t)) and g(t) \\le O(f(t)).\n\nFinally, we use f(t) \\ge \\Omega(g(t)) to mean that g(t) \\le O(f(t)),\nand f(t) > \\omega(g(t)) to mean that g(t) < o(f(t)).\n\nWe also use the notation \\tilde O(g(t)) to hide logarithmic factors.\nThat is, f(t) = \\tilde O(g(t)) if there exists some constant C such\nthat f(t) \\le C \\cdot g(t) \\cdot \\log^k(t) for some k and all t.\n\nOccasionally, we will also use O(f(t)) (or one of the other symbols)\nas shorthand to manipulate function classes. For example, we might write\nO(f(t)) + O(g(t)) = O(f(t) + g(t)) to mean that the sum of two\nfunctions in O(f(t)) and O(g(t)) is in O(f(t) + g(t)).","type":"content","url":"/background#o-notation","position":3},{"hierarchy":{"lvl1":"Appendix: Background","lvl2":"Python"},"type":"lvl2","url":"/background#python","position":4},{"hierarchy":{"lvl1":"Appendix: Background","lvl2":"Python"},"content":"","type":"content","url":"/background#python","position":5},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits"},"type":"lvl1","url":"/bandits","position":0},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits"},"content":"","type":"content","url":"/bandits","position":1},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Introduction"},"type":"lvl2","url":"/bandits#introduction","position":2},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Introduction"},"content":"The multi-armed bandits (MAB) setting is a simple setting for studying the basic challenges of sequential decision-making.\nIn this setting, an agent repeatedly chooses from a fixed set of actions, called arms, each of which has an associated reward distribution. The agent’s goal is to maximize the total reward it receives over some time period. \n| States | Actions | Rewards |\n| :----: | :-----: | :---------------------------------: |\n| None | Finite | $\\mathcal{A} \\to \\triangle([0, 1])$ |\n\n\nIn particular, we’ll spend a lot of time discussing the Exploration-Exploitation Tradeoff: should the agent choose new actions to learn more about the environment, or should it choose actions that it already knows to be good?\n\nOnline advertising\n\nLet’s suppose you, the agent, are an advertising company. You have K different ads that you can show to users; For concreteness, let’s suppose there’s just a single user. You receive 1 reward if the user clicks the ad, and 0 otherwise. Thus, the unknown reward distribution associated to each ad is a Bernoulli distribution defined by the probability that the user clicks on the ad. 
Your goal is to maximize the total number of clicks by the user.\n\nClinical trials\n\nSuppose you’re a pharmaceutical company, and you’re testing a new drug. You have K different dosages of the drug that you can administer to patients. You receive 1 reward if the patient recovers, and 0 otherwise. Thus, the unknown reward distribution associated to each dosage is a Bernoulli distribution defined by the probability that the patient recovers. Your goal is to maximize the total number of patients that recover.\n\nIn this chapter, we will introduce the multi-armed bandits setting, and discuss some of the challenges that arise when trying to solve problems in this setting. We will also introduce some of the key concepts that we will use throughout the book, such as regret and exploration-exploitation tradeoffs.\n\nfrom jaxtyping import Float, Array\nimport numpy as np\nimport latexify\nfrom typing import Callable, Union\nimport matplotlib.pyplot as plt\n\nimport solutions.bandits as solutions\n\nnp.random.seed(184)\n\ndef random_argmax(ary: Array) -> int:\n \"\"\"Take an argmax and randomize between ties.\"\"\"\n max_idx = np.flatnonzero(ary == ary.max())\n return np.random.choice(max_idx).item()\n\n\n# used as decorator\nlatex = latexify.algorithmic(\n prefixes={\"mab\"},\n identifiers={\"arm\": \"a_t\", \"reward\": \"r\", \"means\": \"mu\"},\n use_math_symbols=True,\n escape_underscores=False,\n)\n\nNamesake\n\nThe name “multi-armed bandits” comes from slot machines in casinos, which are often called “one-armed bandits” since they have one arm (the lever) and take money from the player.\n\nLet K denote the number of arms. We’ll label them 0, \\dots, K-1 and use superscripts to indicate the arm index; since we seldom need to raise a number to a power, this won’t cause much confusion. In this chapter, we’ll consider the Bernoulli bandit setting from the examples above, where arm k either returns reward 1 with probability \\mu^k or 0 otherwise. The agent gets to pull an arm T times in total. We can formalize the Bernoulli bandit in the following Python code:\n\nclass MAB:\n \"\"\"\n The Bernoulli multi-armed bandit environment.\n\n :param means: the means (success probabilities) of the reward distributions for each arm\n :param T: the time horizon\n \"\"\"\n\n def __init__(self, means: Float[Array, \" K\"], T: int):\n assert all(0 <= p <= 1 for p in means)\n self.means = means\n self.T = T\n self.K = self.means.size\n self.best_arm = random_argmax(self.means)\n\n def pull(self, k: int) -> int:\n \"\"\"Pull the `k`-th arm and sample from its (Bernoulli) reward distribution.\"\"\"\n reward = np.random.rand() < self.means[k].item()\n return +reward\n\n\n\nmab = MAB(means=np.array([0.1, 0.8, 0.4]), T=100)\n\nIn pseudocode, the agent’s interaction with the MAB environment can be\ndescribed by the following process:\n\n@latex\ndef mab_loop(mab: MAB, agent: \"Agent\") -> int:\n for t in range(mab.T):\n arm = agent.choose_arm() # in 0, ..., K-1\n reward = mab.pull(arm)\n agent.update_history(arm, reward)\n\n\nmab_loop\n\nThe Agent class stores the pull history and uses it to decide which arm to pull next. 
Since we are working with Bernoulli bandits, we can summarize the pull history concisely in a \\mathbb{N}^{K \\times 2} array.\n\nclass Agent:\n def __init__(self, K: int, T: int):\n \"\"\"The MAB agent that decides how to choose an arm given the past history.\"\"\"\n self.K = K\n self.T = T\n self.rewards = [] # for plotting\n self.choices = []\n self.history = np.zeros((K, 2), dtype=int)\n\n def choose_arm(self) -> int:\n \"\"\"Choose an arm of the MAB. Algorithm-specific.\"\"\"\n ...\n\n def count(self) -> int:\n \"\"\"The number of pulls made. Also the current step index.\"\"\"\n return len(self.rewards)\n\n def update_history(self, arm: int, reward: int):\n self.rewards.append(reward)\n self.choices.append(arm)\n self.history[arm, reward] += 1\n\nWhat’s the optimal strategy for the agent, i.e. the one that achieves\nthe highest expected reward? Convince yourself that the agent should try\nto always pull the arm with the highest expected reward:\\mu^\\star := \\max_{k \\in [K]} \\mu^k.\n\nThe goal, then, can be rephrased as to minimize the regret, defined\nbelow:\n\nRegret\n\nThe agent’s regret after T timesteps is defined as\\text{Regret}_T := \\sum_{t=0}^{T-1} \\mu^\\star - \\mu^{a_t}.\n\ndef regret_per_step(mab: MAB, agent: Agent):\n \"\"\"Get the difference from the average reward of the optimal arm. The sum of these is the regret.\"\"\"\n return [mab.means[mab.best_arm] - mab.means[arm] for arm in agent.choices]\n\nNote that this depends on the true means of the pulled arms, not the actual\nobserved rewards.\nWe typically think of this as a random variable where\nthe randomness comes from the agent’s strategy (i.e. the sequence of\nactions a_0, \\dots, a_{T-1}).\n\nThroughout the chapter, we will try to upper bound the regret of various\nalgorithms in two different senses:\n\nUpper bound the expected regret, i.e. show\n\\E[\\text{Regret}_T] \\le M_T.\n\nFind a high-probability upper bound on the regret, i.e. show\n\\pr(\\text{Regret}_T \\le M_{T, \\delta}) \\ge 1-\\delta.\n\nNote that these two different approaches say very different things about the regret. The first approach says that the average regret is at most M_T. However, the agent might still achieve higher regret on many runs. The second approach says that, with high probability, the agent will achieve regret at most M_{T, \\delta}. However, it doesn’t say anything about the regret in the remaining δ fraction of runs, which might be arbitrarily high.\n\nWe’d like to achieve sublinear regret in expectation, i.e. \\E[\\text{Regret}_T] = o(T). 
That is, as we learn more about the environment, we’d like to be able to exploit that knowledge to take the optimal arm as often as possible.\n\nThe rest of the chapter comprises a series of increasingly sophisticated\nMAB algorithms.\n\ndef plot_strategy(mab: MAB, agent: Agent):\n plt.figure(figsize=(10, 6))\n\n # plot reward and cumulative regret\n plt.plot(np.arange(mab.T), np.cumsum(agent.rewards), label=\"reward\")\n cum_regret = np.cumsum(regret_per_step(mab, agent))\n plt.plot(np.arange(mab.T), cum_regret, label=\"cumulative regret\")\n\n # draw colored circles for arm choices\n colors = [\"red\", \"green\", \"blue\"]\n color_array = [colors[k] for k in agent.choices]\n plt.scatter(np.arange(mab.T), np.zeros(mab.T), c=color_array, label=\"arm\")\n\n # labels and title\n plt.xlabel(\"timestep\")\n plt.legend()\n plt.title(f\"{agent.__class__.__name__} reward and regret\")\n plt.show()\n\n","type":"content","url":"/bandits#introduction","position":3},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Pure exploration (random guessing)"},"type":"lvl2","url":"/bandits#pure-exploration-random-guessing","position":4},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Pure exploration (random guessing)"},"content":"A trivial strategy is to always choose arms at random (i.e. “pure\nexploration”).\n\nclass PureExploration(Agent):\n def choose_arm(self):\n \"\"\"Choose an arm uniformly at random.\"\"\"\n return solutions.pure_exploration_choose_arm(self)\n\nNote that\\E_{a_t \\sim \\text{Unif}([K])}[\\mu^{a_t}] = \\bar \\mu = \\frac{1}{K} \\sum_{k=1}^K \\mu^k\n\nso the expected regret is simply\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\sum_{t=0}^{T-1} \\E[\\mu^\\star - \\mu^{a_t}] \\\\\n &= T (\\mu^\\star - \\bar \\mu) > 0.\n\\end{aligned}\n\nThis scales as \\Theta(T), i.e. linear in the number of timesteps T. There’s no learning here: the agent doesn’t use any information about the environment to improve its strategy. You can see that the distribution over its arm choices always appears “(uniformly) random”.\n\nagent = PureExploration(mab.K, mab.T)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)\n\n","type":"content","url":"/bandits#pure-exploration-random-guessing","position":5},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Pure greedy"},"type":"lvl2","url":"/bandits#pure-greedy","position":6},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Pure greedy"},"content":"How might we improve on pure exploration? Instead, we could try each arm\nonce, and then commit to the one with the highest observed reward. We’ll\ncall this the pure greedy strategy.\n\nclass PureGreedy(Agent):\n def choose_arm(self):\n \"\"\"Choose the arm with the highest observed reward on its first pull.\"\"\"\n return solutions.pure_greedy_choose_arm(self)\n\nNote we’ve used superscripts r^k during the exploration phase to\nindicate that we observe exactly one reward for each arm. Then we use\nsubscripts r_t during the exploitation phase to indicate that we\nobserve a sequence of rewards from the chosen greedy arm \\hat k.\n\nHow does the expected regret of this strategy compare to that of pure\nexploration? We’ll do a more general analysis in the following section.\nNow, for intuition, suppose there’s just K=2 arms, with Bernoulli\nreward distributions with means \\mu^0 > \\mu^1.\n\nLet’s let r^0 be the random reward from the first arm and r^1 be the\nrandom reward from the second. If r^0 > r^1, then we achieve zero\nregret. Otherwise, we achieve regret T(\\mu^0 - \\mu^1). 
Thus, the\nexpected regret is simply:\\begin{aligned}\n \\E[\\text{Regret}_T] &= \\pr(r^0 < r^1) \\cdot T(\\mu^0 - \\mu^1) + c \\\\\n &= (1 - \\mu^0) \\mu^1 \\cdot T(\\mu^0 - \\mu^1) + c\n\\end{aligned}\n\nWhich is still \\Theta(T), the same as pure exploration!\n\nagent = PureGreedy(mab.K, mab.T)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)\n\nThe cumulative regret is a straight line because the regret only depends on the arms chosen and not the actual reward observed. In fact, if the greedy algorithm happens to get lucky on the first set of pulls, it may act entirely optimally for that episode! But its average regret is what measures its effectiveness.\n\n","type":"content","url":"/bandits#pure-greedy","position":7},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Explore-then-commit"},"type":"lvl2","url":"/bandits#etc","position":8},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Explore-then-commit"},"content":"We can improve the pure greedy algorithm as follows: let’s reduce the variance of the reward estimates by pulling each arm N_{\\text{explore}}> 1 times before committing. This is called the explore-then-commit strategy. Note that the “pure greedy” strategy above is just the special case where\nN_{\\text{explore}}= 1.\n\nclass ExploreThenCommit(Agent):\n def __init__(self, K: int, T: int, N_explore: int):\n super().__init__(K, T)\n self.N_explore = N_explore\n\n def choose_arm(self):\n return solutions.etc_choose_arm(self)\n\n\n\nagent = ExploreThenCommit(mab.K, mab.T, mab.T // 15)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)\n\nNotice that now, the graphs are much more consistent, and the algorithm finds the true optimal arm and sticks with it much more frequently. We would expect ETC to then have a better (i.e. lower) average regret. Can we prove this?\n\n","type":"content","url":"/bandits#etc","position":9},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl3":"ETC regret analysis","lvl2":"Explore-then-commit"},"type":"lvl3","url":"/bandits#etc-regret-analysis","position":10},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl3":"ETC regret analysis","lvl2":"Explore-then-commit"},"content":"Let’s analyze the expected regret of the explore-then-commit strategy by splitting it up\ninto the exploration and exploitation phases.","type":"content","url":"/bandits#etc-regret-analysis","position":11},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl4":"Exploration phase.","lvl3":"ETC regret analysis","lvl2":"Explore-then-commit"},"type":"lvl4","url":"/bandits#exploration-phase","position":12},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl4":"Exploration phase.","lvl3":"ETC regret analysis","lvl2":"Explore-then-commit"},"content":"This phase takes N_{\\text{explore}}K timesteps. Since at each step we\nincur at most 1 regret, the total regret is at most\nN_{\\text{explore}}K.","type":"content","url":"/bandits#exploration-phase","position":13},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl4":"Exploitation phase.","lvl3":"ETC regret analysis","lvl2":"Explore-then-commit"},"type":"lvl4","url":"/bandits#exploitation-phase","position":14},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl4":"Exploitation phase.","lvl3":"ETC regret analysis","lvl2":"Explore-then-commit"},"content":"This will take a bit more effort. We’ll prove that for any total time T, we can choose N_{\\text{explore}} such that with arbitrarily high probability, the regret is sublinear.\n\nLet \\hat k denote the arm chosen after the exploration phase. 
We know the regret from the\nexploitation phase isT_{\\text{exploit}} (\\mu^\\star - \\mu^{\\hat k}) \\qquad \\text{where} \\qquad T_{\\text{exploit}} := T - N_{\\text{explore}}K.\n\nSo we’d like to bound \\mu^\\star - \\mu^{\\hat k} = o(1) (as a function\nof T) in order to achieve sublinear regret. How can we do this?\n\nLet’s define \\Delta^k = \\hat \\mu^k - \\mu^k to denote how far the mean\nestimate for arm k is from the true mean. How can we bound this\nquantity? We’ll use the following useful inequality for i.i.d. bounded\nrandom variables:\n\nHoeffding’s inequality\n\nLet X_0, \\dots, X_{n-1} be i.i.d. random variables with\nX_i \\in [0, 1] almost surely for each i \\in [n]. Then for any\n\\delta > 0,\\pr\\left( \\left| \\frac{1}{n} \\sum_{i=1}^n (X_i - \\E[X_i]) \\right| > \\sqrt{\\frac{\\ln(2/\\delta)}{2n}} \\right) \\le \\delta.\n\nThe proof of this inequality is beyond the scope of this book. See \n\nVershynin (2018) Chapter 2.2.\n\nWe can apply this directly to the rewards for a given arm k, since the rewards from that arm are i.i.d.:\\pr\\left(|\\Delta^k | > \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) \\le \\delta.\n\nBut note that we can’t apply this to arm \\hat k directly since\n\\hat k is itself a random variable. Instead, we need to “uniform-ize”\nthis bound across all the arms, i.e. bound the error across all the\narms simultaneously, so that the resulting bound will apply no matter\nwhat \\hat k “crystallizes” to.\n\nThe union bound provides a simple way to do this:\n\nUnion bound\n\nConsider a set of events A_0, \\dots, A_{n-1}. Then\\pr(\\exists i \\in [n]. A_i) \\le \\sum_{i=0}^{n-1} \\pr(A_i).\n\nIn\nparticular, if \\pr(A_i) \\ge 1 - \\delta for each i \\in [n], we have\\pr(\\forall i \\in [n]. A_i) \\ge 1 - n \\delta.\n\nExercise: Prove the second statement above.\n\nApplying the union bound across the arms for the l.h.s. event of \n\n(3.8), we have\\begin{aligned}\n \\pr\\left( \\forall k \\in [K], |\\Delta^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2N_{\\text{explore}}}} \\right) &\\ge 1-K\\delta\n\\end{aligned}\n\nThen to apply this bound to \\hat k in particular, we\ncan apply the useful trick of “adding zero”:\\begin{aligned}\n \\mu^{k^\\star} - \\mu^{\\hat k} &= \\mu^{k^\\star} - \\mu^{\\hat k} + (\\hat \\mu^{k^\\star} - \\hat \\mu^{k^\\star}) + (\\hat \\mu^{\\hat k} - \\hat \\mu^{\\hat k}) \\\\\n &= \\Delta^{\\hat k} - \\Delta^{k^*} + \\underbrace{(\\hat \\mu^{k^\\star} - \\hat \\mu^{\\hat k})}_{\\le 0 \\text{ by definition of } \\hat k} \\\\\n &\\le 2 \\sqrt{\\frac{\\ln(2K/\\delta')}{2N_{\\text{explore}}}} \\text{ with probability at least } 1-\\delta'\n\\end{aligned}\n\nwhere we’ve set \\delta' = K\\delta. Putting this all\ntogether, we’ve shown that, with probability 1 - \\delta',\\text{Regret}_T \\le N_{\\text{explore}}K + T_{\\text{exploit}} \\cdot \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}}}.\n\nNote that it suffices for N_{\\text{explore}} to be on the order of\n\\sqrt{T} to achieve sublinear regret. In particular, we can find the\noptimal N_{\\text{explore}} by setting the derivative of the r.h.s. 
to\nzero:\\begin{aligned}\n 0 &= K - T_{\\text{exploit}} \\cdot \\frac{1}{2} \\sqrt{\\frac{2\\ln(2K/\\delta')}{N_{\\text{explore}}^3}} \\\\\n N_{\\text{explore}}&= \\left( T_{\\text{exploit}} \\cdot \\frac{\\sqrt{\\ln(2K/\\delta')/2}}{K} \\right)^{2/3}\n\\end{aligned}\n\nPlugging this into the expression for the regret, we\nhave (still with probability 1-\\delta')\\begin{aligned}\n \\text{Regret}_T &\\le 3 T^{2/3} \\sqrt[3]{K \\ln(2K/\\delta') / 2} \\\\\n &= \\tilde{O}(T^{2/3} K^{1/3}).\n\\end{aligned}\n\nThe ETC algorithm is rather “abrupt” in that it switches from\nexploration to exploitation after a fixed number of timesteps. In\npractice, it’s often better to use a more gradual transition, which\nbrings us to the epsilon-greedy algorithm.\n\n","type":"content","url":"/bandits#exploitation-phase","position":15},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Epsilon-greedy"},"type":"lvl2","url":"/bandits#epsilon-greedy","position":16},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Epsilon-greedy"},"content":"Instead of doing all of the exploration and then all of the exploitation\nseparately – which additionally requires knowing the time horizon\nbeforehand – we can instead interleave exploration and exploitation by,\nat each timestep, choosing a random action with some probability. We\ncall this the epsilon-greedy algorithm.\n\nclass EpsilonGreedy(Agent):\n def __init__(\n self,\n K: int,\n T: int,\n ε_array: Float[Array, \" T\"],\n ):\n super().__init__(K, T)\n self.ε_array = ε_array\n\n def choose_arm(self):\n return solutions.epsilon_greedy_choose_arm(self)\n\n\n\nagent = EpsilonGreedy(mab.K, mab.T, np.full(mab.T, 0.1))\nmab_loop(mab, agent)\nplot_strategy(mab, agent)\n\nNote that we let ε vary over time. In particular, we might want to gradually decrease ε as we learn more about the reward distributions and no longer need to spend time exploring.\n\nAttention\n\nWhat is the expected regret of the algorithm if we set ε to be a constant?\n\nIt turns out that setting \\epsilon_t = \\sqrt[3]{K \\ln(t)/t} also achieves a regret of \\tilde O(t^{2/3} K^{1/3}) (ignoring the logarithmic factors). (We will not prove this here.) TODO ADD PROOF CITATION\n\nIn ETC, we had to set N_{\\text{explore}} based on the total number of timesteps T. But the epsilon-greedy algorithm actually handles the exploration automatically: the regret rate holds for any t, and doesn’t depend on the final horizon T.\n\nBut the way these algorithms explore is rather naive: we’ve been exploring uniformly across all the arms. But what if we could be smarter about it, and explore more for arms that we’re less certain about?\n\n","type":"content","url":"/bandits#epsilon-greedy","position":17},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Upper Confidence Bound (UCB)"},"type":"lvl2","url":"/bandits#ucb","position":18},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Upper Confidence Bound (UCB)"},"content":"To quantify how certain we are about the mean of each arm, we’ll\ncompute confidence intervals for our estimators, and then choose the\narm with the highest upper confidence bound. This operates on the\nprinciple of the benefit of the doubt (i.e. 
optimism in the face of\nuncertainty): we’ll choose the arm that we’re most optimistic about.\n\nIn particular, for each arm k at time t, we’d like to compute some\nupper confidence bound M^k_t such that \\hat \\mu^k_t \\le M^k_t with\nhigh probability, and then choose a_t := \\arg \\max_{k \\in [K]} M^k_t.\nBut how should we compute M^k_t?\n\nIn \n\nSection 3.4.1, we were able to compute this bound\nusing Hoeffding’s inequality, which assumes that the number of samples\nis fixed. This was the case in ETC (where we pull each arm\nN_{\\text{explore}} times), but in UCB, the number of times we pull\neach arm depends on the agent’s actions, which in turn depend on the\nrandom rewards and are therefore stochastic. So we can’t use\nHoeffding’s inequality directly.\n\nInstead, we’ll apply the same trick we used in the ETC analysis: we’ll\nuse the union bound to compute a looser bound that holds\nuniformly across all timesteps and arms. Let’s introduce some notation\nto discuss this.\n\nLet N^k_t denote the (random) number of times arm k has been pulled\nwithin the first t timesteps, and \\hat \\mu^k_t denote the sample\naverage of those pulls. That is,\\begin{aligned}\n N^k_t &:= \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} \\\\\n \\hat \\mu^k_t &:= \\frac{1}{N^k_t} \\sum_{\\tau=0}^{t-1} \\mathbf{1} \\{ a_\\tau = k \\} r_\\tau.\n\\end{aligned}\n\nTo achieve the “fixed sample size” assumption, we’ll\nneed to shift our index from time to number of samples from each\narm. In particular, we’ll define \\tilde r^k_n to be the nth sample\nfrom arm k, and \\tilde \\mu^k_n to be the sample average of the first\nn samples from arm k. Then, for a fixed n, this satisfies the\n“fixed sample size” assumption, and we can apply Hoeffding’s inequality\nto get a bound on \\tilde \\mu^k_n.\n\nSo how can we extend our bound on \\tilde\\mu^k_n to \\hat \\mu^k_t?\nWell, we know N^k_t \\le t (where equality would be the case if and\nonly if we had pulled arm k every time). So we can apply the same\ntrick as last time, where we uniform-ize across all possible values of\nN^k_t:\\begin{aligned}\n \\pr\\left( \\forall n \\le t, |\\tilde \\mu^k_n - \\mu^k | \\le \\sqrt{\\frac{\\ln(2/\\delta)}{2n}} \\right) &\\ge 1-t\\delta.\n\\end{aligned}\n\nIn particular, since N^k_t \\le t, and \\tilde \\mu^k_{N^k_t} = \\hat \\mu^k_t by definition, we have\\begin{aligned}\n \\pr\\left( |\\hat \\mu^k_t - \\mu^k | \\le \\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}} \\right) &\\ge 1-\\delta' \\text{ where } \\delta' := t \\delta.\n\\end{aligned}\n\nThis bound would then suffice for applying the UCB algorithm! That is, the upper confidence bound for arm k would beM^k_t := \\hat \\mu^k_t + \\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}},\n\nwhere we can choose \\delta' depending on how tight we want the interval to be.\n\nA smaller \\delta' would give us a larger and higher-confidence interval, emphasizing the exploration term.\n\nA larger \\delta' would give a tighter and lower-confidence interval, prioritizing the current sample averages.\n\nWe can now use this to define the UCB algorithm.\n\nclass UCB(Agent):\n def __init__(self, K: int, T: int, delta: float):\n super().__init__(K, T)\n self.delta = delta\n\n def choose_arm(self):\n return solutions.ucb_choose_arm(self)\n\nIntuitively, UCB prioritizes arms where:\n\n\\hat \\mu^k_t is large, i.e. the arm has a high sample average, and\nwe’d choose it for exploitation, and\n\n\\sqrt{\\frac{\\ln(2t/\\delta')}{2N^k_t}} is large, i.e. 
we’re still\nuncertain about the arm, and we’d choose it for exploration.\n\nAs desired, this explores in a smarter, adaptive way compared to the\nprevious algorithms. Does it achieve lower regret?\n\nagent = UCB(mab.K, mab.T, 0.9)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)\n\n","type":"content","url":"/bandits#ucb","position":19},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl3":"UCB regret analysis","lvl2":"Upper Confidence Bound (UCB)"},"type":"lvl3","url":"/bandits#ucb-regret-analysis","position":20},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl3":"UCB regret analysis","lvl2":"Upper Confidence Bound (UCB)"},"content":"First we’ll bound the regret incurred at each timestep. Then we’ll bound\nthe total regret across timesteps.\n\nFor the sake of analysis, we’ll use a slightly looser bound that applies\nacross the whole time horizon and across all arms. We’ll omit the\nderivation since it’s very similar to the above (walk through it\nyourself for practice).\\begin{aligned}\n \\pr\\left(\\forall k \\le K, t < T. |\\hat \\mu^k_t - \\mu^k | \\le B^k_t \\right) &\\ge 1-\\delta'' \\\\\n \\text{where} \\quad B^k_t &:= \\sqrt{\\frac{\\ln(2TK/\\delta'')}{2N^k_t}}.\n\\end{aligned}\n\nIntuitively, B^k_t denotes the width of the CI for arm k at time\nt. Then, assuming the above uniform bound holds (which occurs with\nprobability 1-\\delta''), we can bound the regret at each timestep as\nfollows:\\begin{aligned}\n \\mu^\\star - \\mu^{a_t} &\\le \\hat \\mu^{k^*}_t + B_t^{k^*} - \\mu^{a_t} && \\text{applying UCB to arm } k^\\star \\\\\n &\\le \\hat \\mu^{a_t}_t + B^{a_t}_t - \\mu^{a_t} && \\text{since UCB chooses } a_t = \\arg \\max_{k \\in [K]} \\hat \\mu^k_t + B_t^{k} \\\\\n &\\le 2 B^{a_t}_t && \\text{since } \\hat \\mu^{a_t}_t - \\mu^{a_t} \\le B^{a_t}_t \\text{ by definition of } B^{a_t}_t \\\\\n\\end{aligned}\n\nSumming this across timesteps gives\\begin{aligned}\n \\text{Regret}_T &\\le \\sum_{t=0}^{T-1} 2 B^{a_t}_t \\\\\n &= \\sqrt{2\\ln(2TK/\\delta'')} \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} \\\\\n \\sum_{t=0}^{T-1} (N^{a_t}_t)^{-1/2} &= \\sum_{t=0}^{T-1} \\sum_{k=1}^K \\mathbf{1}\\{ a_t = k \\} (N^k_t)^{-1/2} \\\\\n &= \\sum_{k=1}^K \\sum_{n=1}^{N_T^k} n^{-1/2} \\\\\n &\\le K \\sum_{n=1}^T n^{-1/2} \\\\\n \\sum_{n=1}^T n^{-1/2} &\\le 1 + \\int_1^T x^{-1/2} \\ \\mathrm{d}x \\\\\n &= 1 + (2 \\sqrt{x})_1^T \\\\\n &= 2 \\sqrt{T} - 1 \\\\\n &\\le 2 \\sqrt{T} \\\\\n\\end{aligned}\n\nPutting everything together gives\\begin{aligned}\n \\text{Regret}_T &\\le 2 K \\sqrt{2T \\ln(2TK/\\delta'')} && \\text{with probability } 1-\\delta'' \\\\\n &= \\tilde O(K\\sqrt{T})\n\\end{aligned}\n\nIn fact, we can do a more sophisticated analysis to trim off a factor of \\sqrt{K}\nand show \\text{Regret}_T = \\tilde O(\\sqrt{TK}).\n\n","type":"content","url":"/bandits#ucb-regret-analysis","position":21},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl3":"Lower bound on regret (intuition)","lvl2":"Upper Confidence Bound (UCB)"},"type":"lvl3","url":"/bandits#lower-bound-on-regret-intuition","position":22},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl3":"Lower bound on regret (intuition)","lvl2":"Upper Confidence Bound (UCB)"},"content":"Is it possible to do better than \\Omega(\\sqrt{T}) in general? In fact,\nno! We can show that any algorithm must incur \\Omega(\\sqrt{T}) regret\nin the worst case. We won’t rigorously prove this here, but the\nintuition is as follows.\n\nThe Central Limit Theorem tells us that with T i.i.d. 
samples from\nsome distribution, we can only learn the mean of the distribution to\nwithin \\Omega(1/\\sqrt{T}) (the standard deviation). Then, since we get\nT samples spread out across the arms, we can only learn each arm’s\nmean to an even looser degree.\n\nThat is, if two arms have means that are within about 1/\\sqrt{T}, we\nwon’t be able to confidently tell them apart, and will sample them about\nequally. But then we’ll incur regret\\Omega((T/2) \\cdot (1/\\sqrt{T})) = \\Omega(\\sqrt{T}).\n\n","type":"content","url":"/bandits#lower-bound-on-regret-intuition","position":23},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Thompson sampling and Bayesian bandits"},"type":"lvl2","url":"/bandits#thompson-sampling","position":24},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Thompson sampling and Bayesian bandits"},"content":"So far, we’ve treated the parameters \\mu^0, \\dots, \\mu^{K-1} of the\nreward distributions as fixed. Instead, we can take a Bayesian\napproach where we treat them as random variables from some prior\ndistribution. Then, upon pulling an arm and observing a reward, we can\nsimply condition on this observation to exactly describe the\nposterior distribution over the parameters. This fully describes the\ninformation we gain about the parameters from observing the reward.\n\nFrom this Bayesian perspective, the Thompson sampling algorithm\nfollows naturally: just sample from the distribution of the optimal arm,\ngiven the observations!\n\nclass Distribution:\n def sample(self) -> Float[Array, \" K\"]:\n \"\"\"Sample a vector of means for the K arms.\"\"\"\n ...\n\n def update(self, arm: int, reward: float):\n \"\"\"Condition on obtaining `reward` from the given arm.\"\"\"\n ...\n\n\n\nclass ThompsonSampling(Agent):\n def __init__(self, K: int, T: int, prior: Distribution):\n super().__init__(K, T)\n self.distribution = prior\n\n def choose_arm(self):\n means = self.distribution.sample()\n return random_argmax(means)\n\n def update_history(self, arm: int, reward: int):\n super().update_history(arm, reward)\n self.distribution.update(arm, reward)\n\nIn other words, we sample each arm proportionally to how likely we think\nit is to be optimal, given the observations so far. This strikes a good\nexploration-exploitation tradeoff: we explore more for arms that we’re\nless certain about, and exploit more for arms that we’re more certain\nabout. Thompson sampling is a simple yet powerful algorithm that\nachieves state-of-the-art performance in many settings.\n\nBayesian Bernoulli bandit\n\nWe’ve been working in the Bernoulli bandit setting, where arm k yields a reward of 1 with probability \\mu^k and no reward otherwise. The vector of success probabilities \\boldsymbol{\\mu} = (\\mu^1, \\dots, \\mu^K) thus describes the entire MAB.\n\nUnder the Bayesian perspective, we think of \\boldsymbol{\\mu} as a random vector drawn from some prior distribution \\pi(\\boldsymbol{\\mu}). For example, we might have π be the Uniform distribution over the unit hypercube [0, 1]^K, that is,\\pi(\\boldsymbol{\\mu}) = \\begin{cases}\n 1 & \\text{if } \\boldsymbol{\\mu}\\in [0, 1]^K \\\\\n 0 & \\text{otherwise}\n\\end{cases}\n\nIn this case, upon viewing some reward, we can exactly calculate the posterior distribution of \\boldsymbol{\\mu} using Bayes’s rule (i.e. 
the definition of conditional probability):\\begin{aligned}\n \\pr(\\boldsymbol{\\mu} \\mid a_0, r_0) &\\propto \\pr(r_0 \\mid a_0, \\boldsymbol{\\mu}) \\pr(a_0 \\mid \\boldsymbol{\\mu}) \\pr(\\boldsymbol{\\mu}) \\\\\n &\\propto (\\mu^{a_0})^{r_0} (1 - \\mu^{a_0})^{1-r_0}.\n\\end{aligned}\n\nThis is the PDF of the\n\\text{Beta}(1 + r_0, 1 + (1 - r_0)) distribution, which is a conjugate\nprior for the Bernoulli distribution. That is, if we start with a Beta\nprior on \\mu^k (note that \\text{Unif}([0, 1]) = \\text{Beta}(1, 1)),\nthen the posterior, after conditioning on samples from\n\\text{Bern}(\\mu^k), will also be Beta. This is a very convenient\nproperty, since it means we can simply update the parameters of the Beta\ndistribution upon observing a reward, rather than having to recompute\nthe entire posterior distribution from scratch.\n\nclass Beta(Distribution):\n def __init__(self, K: int, alpha: int = 1, beta: int = 1):\n self.alphas = np.full(K, alpha)\n self.betas = np.full(K, beta)\n\n def sample(self):\n return np.random.beta(self.alphas, self.betas)\n\n def update(self, arm: int, reward: int):\n self.alphas[arm] += reward\n self.betas[arm] += 1 - reward\n\n\n\nbeta_distribution = Beta(mab.K)\nagent = ThompsonSampling(mab.K, mab.T, beta_distribution)\nmab_loop(mab, agent)\nplot_strategy(mab, agent)\n\nIt turns out that asymptotically, Thompson sampling is optimal in the\nfollowing sense. \n\nLai & Robbins (1985) prove an\ninstance-dependent lower bound that says for any bandit algorithm,\\liminf_{T \\to \\infty} \\frac{\\E[N_T^k]}{\\ln(T)} \\ge \\frac{1}{\\text{KL}(\\mu^k \\parallel \\mu^\\star)}\n\nwhere\\text{KL}(\\mu^k \\parallel \\mu^\\star) = \\mu^k \\ln \\frac{\\mu^k}{\\mu^\\star} + (1 - \\mu^k) \\ln \\frac{1 - \\mu^k}{1 - \\mu^\\star}\n\nmeasures the Kullback-Leibler divergence from the Bernoulli\ndistribution with mean \\mu^k to the Bernoulli distribution with mean\n\\mu^\\star. It turns out that Thompson sampling achieves this lower\nbound with equality! That is, not only is the error rate optimal, but\nthe constant factor is optimal as well.\n\n","type":"content","url":"/bandits#thompson-sampling","position":25},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Contextual bandits"},"type":"lvl2","url":"/bandits#contextual-bandits","position":26},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Contextual bandits"},"content":"Note\n\nThis content is advanced material taught at the end of the course.\n\nIn the above MAB environment, the reward distributions of the arms\nremain constant. However, in many real-world settings, we might receive\nadditional information that affects these distributions. For example, in\nthe online advertising case where each arm corresponds to an ad we could\nshow the user, we might receive information about the user’s preferences\nthat changes how likely they are to click on a given ad. We can model\nsuch environments using contextual bandits.\n\nContextual bandit\n\nAt each timestep t, a new context\nx_t is drawn from some distribution \\nu_{\\text{x}}. The learner gets\nto observe the context, and choose an action a_t according to some\ncontext-dependent policy \\pi_t(x_t). Then, the learner observes the\nreward from the chosen arm r_t \\sim \\nu^{a_t}(x_t). The reward\ndistribution also depends on the context.\n\nAssuming our context is discrete, we can just perform the same\nalgorithms, treating each context-arm pair as its own arm. 
This gives us\nan enlarged MAB of K |\\mathcal{X}| arms.\n\nAttention\n\nWrite down the UCB algorithm for this enlarged MAB. That is, write an\nexpression for \\pi_t(x_t) = \\arg\\max_a \\dots.\n\nRecall that running UCB for T timesteps on an MAB with K arms\nachieves a regret bound of \\tilde{O}(\\sqrt{TK}). So in this problem,\nwe would achieve regret \\tilde{O}(\\sqrt{TK|\\mathcal{X}|}) in the\ncontextual MAB, which has a polynomial dependence on |\\mathcal{X}|.\nBut in a situation where we have large, or even infinitely many\ncontexts, e.g. in the case where our context is a continuous value, this\nbecomes intractable.\n\nNote that this “enlarged MAB” treats the different contexts as entirely\nunrelated to each other, while in practice, often contexts are related\nto each other in some way: for example, we might want to advertise\nsimilar products to users with similar preferences. How can we\nincorporate this structure into our solution?\n\n","type":"content","url":"/bandits#contextual-bandits","position":27},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl3":"Linear contextual bandits","lvl2":"Contextual bandits"},"type":"lvl3","url":"/bandits#lin-ucb","position":28},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl3":"Linear contextual bandits","lvl2":"Contextual bandits"},"content":"We want to model the mean reward of arm k as a function of the\ncontext, i.e. \\mu^k(x). One simple model is the linear one:\n\\mu^k(x) = x^\\top \\theta^k, where x \\in \\mathcal{X} = \\mathbb{R}^d and\n\\theta^k \\in \\mathbb{R}^d describes a feature direction for arm k. Recall\nthat supervised learning gives us a way to estimate a conditional\nexpectation from samples: We learn a least squares estimator from the\ntimesteps where arm k was selected:\\hat \\theta_t^k = \\arg\\min_{\\theta \\in \\mathbb{R}^d} \\sum_{\\{ i \\in [t] : a_i = k \\}} (r_i - x_i^\\top \\theta)^2.\n\nThis has the closed-form solution known as the ordinary least squares\n(OLS) estimator:\\begin{aligned}\n \\hat \\theta_t^k & = (A_t^k)^{-1} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i r_i \\\\\n \\text{where} \\quad A_t^k & = \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\top.\n\\end{aligned}\n\nWe can now apply the UCB algorithm in this environment in order to\nbalance exploration of new arms and exploitation of arms that we\nbelieve to have high reward. But how should we construct the upper\nconfidence bound? Previously, we treated the pulls of an arm as i.i.d.\nsamples and used Hoeffding’s inequality to bound the distance of the\nsample mean, our estimator, from the true mean. However, now our\nestimator is not a sample mean, but rather the OLS estimator above \n\n(3.30). Instead, we’ll use Chebyshev’s\ninequality to construct an upper confidence bound.\n\nChebyshev’s inequality\n\nFor a random variable Y such that\n\\E Y = 0 and \\E Y^2 = \\sigma^2,|Y| \\le \\beta \\sigma \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}\n\nSince the OLS estimator is known to be unbiased (try proving this\nyourself), we can apply Chebyshev’s inequality to\nx_t^\\top (\\hat \\theta_t^k - \\theta^k):\\begin{aligned}\n x_t^\\top \\theta^k \\le x_t^\\top \\hat \\theta_t^k + \\beta \\sqrt{x_t^\\top (A_t^k)^{-1} x_t} \\quad \\text{with probability} \\ge 1 - \\frac{1}{\\beta^2}\n\\end{aligned}\n\nAttention\n\nWe haven’t explained why x_t^\\top (A_t^k)^{-1} x_t is the correct\nexpression for the variance of x_t^\\top \\hat \\theta_t^k. 
This result\nfollows from some algebra on the definition of the OLS estimator \n\n(3.30).\n\nThe first term is exactly our predicted reward \\hat \\mu^k_t(x_t). To\ninterpret the second term, note thatx_t^\\top (A_t^k)^{-1} x_t = \\frac{1}{N_t^k} x_t^\\top (\\Sigma_t^k)^{-1} x_t,\n\nwhere\\Sigma_t^k = \\frac{1}{N_t^k} \\sum_{\\{ i \\in [t] : a_i = k \\}} x_i x_i^\\top\n\nis the empirical covariance matrix of the contexts (assuming that the\ncontext has mean zero). That is, the learner is encouraged to choose\narms when x_t is not aligned with the data seen so far, or if arm\nk has not been explored much and so N_t^k is small.\n\nWe can now substitute these quantities into UCB to get the LinUCB\nalgorithm:\n\nclass LinUCBPseudocode(Agent):\n def __init__(\n self, K: int, T: int, D: int, lam: float, get_c: Callable[[int], float]\n ):\n super().__init__(K, T)\n self.lam = lam\n self.get_c = get_c\n self.contexts = [None for _ in range(K)]\n self.A = np.repeat(lam * np.eye(D)[...], K)\n self.targets = np.zeros(K, D)\n self.w = np.zeros(K, D)\n\n def choose_arm(self, context: Float[Array, \" D\"]):\n c = self.get_c(self.count)\n scores = self.w @ context + c * np.sqrt(\n context.T @ np.linalg.solve(self.A, context)\n )\n return random_argmax(scores)\n\n def update_history(self, context: Float[Array, \" D\"], arm: int, reward: int):\n self.A[arm] += np.outer(context, context)\n self.targets[arm] += context * reward\n self.w[arm] = np.linalg.solve(self.A[arm], self.targets[arm])\n\nAttention\n\nNote that the matrix A_t^k above might not be invertible. When does this occur? One way to address this is to include a \\lambda I regularization term to ensure that A_t^k is invertible. This is equivalent to solving a ridge regression problem instead of the unregularized least squares problem. Implement this solution. TODO SOLUTION CURRENTLY SHOWN\n\nc_t is similar to the \\log (2t/\\delta') term of UCB: It controls the\nwidth of the confidence interval. Here, we treat it as a tunable\nparameter, though in a theoretical analysis, it would depend on A_t^k\nand the probability δ with which the bound holds.\n\nUsing similar tools for UCB, we can also prove an \\tilde{O}(\\sqrt{T})\nregret bound. The full details of the analysis can be found in Section 3 of \n\nAgarwal et al. (2022).","type":"content","url":"/bandits#lin-ucb","position":29},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Summary"},"type":"lvl2","url":"/bandits#summary","position":30},{"hierarchy":{"lvl1":"3 Multi-Armed Bandits","lvl2":"Summary"},"content":"In this chapter,\nwe explored the multi-armed bandit setting for analyzing sequential decision-making in an unknown environment.","type":"content","url":"/bandits#summary","position":31},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators"},"type":"lvl1","url":"/control","position":0},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators"},"content":"","type":"content","url":"/control","position":1},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"Introduction"},"type":"lvl2","url":"/control#introduction","position":2},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"Introduction"},"content":"Up to this point, we have considered decision problems with finitely\nmany states and actions. However, in many applications, states and\nactions may take on continuous values. For example, consider autonomous\ndriving, controlling a robot’s joints, and automated manufacturing. How\ncan we teach computers to solve these kinds of problems? 
This is the\ntask of continuous control.\n\n\n\nFigure 2.1:Solving a Rubik’s Cube with a robot hand.\n\n\n\nFigure 2.2:Boston Dynamics’s Spot robot.\n\nAside from the change in the state and action spaces, the general\nproblem setup remains the same: we seek to construct an optimal policy\nthat outputs actions to solve the desired task. We will see that many\nkey ideas and algorithms, in particular dynamic programming algorithms,\ncarry over to this new setting.\n\nThis chapter introduces a fundamental tool to solve a simple class of\ncontinuous control problems: the linear quadratic regulator. We will\nthen extend this basic method to more complex settings.\n\nCartPole\n\nTry to balance a pencil on its point on a flat surface. It’s much more\ndifficult than it may first seem: the position of the pencil varies\ncontinuously, and the state transitions governing the system, i.e. the\nlaws of physics, are highly complex. This task is equivalent to the\nclassic control problem known as CartPole:\n\nThe state \\st \\in \\mathbb{R}^4 can be described by:\n\nthe position of the cart;\n\nthe velocity of the cart;\n\nthe angle of the pole;\n\nthe angular velocity of the pole.\n\nWe can control the cart by applying a horizontal force \\act \\in \\mathbb{R}.\n\nGoal: Stabilize the cart around an ideal state and action\n(\\st^\\star, \\act^\\star).","type":"content","url":"/control#introduction","position":3},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"Optimal control"},"type":"lvl2","url":"/control#optimal-control","position":4},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"Optimal control"},"content":"Recall that an MDP is defined by its state space \\mathcal{S}, action space\n\\mathcal{A}, state transitions P, reward function r, and discount factor\nγ or time horizon \\hor. These have equivalents in the control\nsetting:\n\nThe state and action spaces are continuous rather than finite.\nThat is, \\mathcal{S} \\subseteq \\mathbb{R}^{n_\\st} and \\mathcal{A} \\subseteq \\mathbb{R}^{n_\\act},\nwhere n_\\st and n_\\act are the corresponding dimensions of these\nspaces, i.e. the number of coordinates to specify a single state or\naction respectively.\n\nWe call the state transitions the dynamics of the system. In the\nmost general case, these might change across timesteps and also\ninclude some stochastic noise w_\\hi at each timestep. We\ndenote these dynamics as the function f_\\hi such that\n\\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi). Of course, we can\nsimplify to cases where the dynamics are deterministic/noise-free\n(no w_\\hi term) and/or time-homogeneous (the same function f\nacross timesteps).\n\nInstead of maximizing the reward function, we seek to minimize the\ncost function c_\\hi: \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}. Often, the cost\nfunction describes how far away we are from a target\nstate-action pair (\\st^\\star, \\act^\\star). An important special\ncase is when the cost is time-homogeneous; that is, it remains the\nsame function c at each timestep h.\n\nWe seek to minimize the undiscounted cost within a finite time\nhorizon \\hor. Note that we end an episode at the final state\n\\st_\\hor -- there is no \\act_\\hor, and so we denote the cost for\nthe final state as c_\\hor(\\st_\\hor).\n\nWith all of these components, we can now formulate the optimal control\nproblem: compute a policy to minimize the expected undiscounted cost\nover \\hor timesteps. 
In this chapter, we will only consider\ndeterministic, time-dependent policies\n\\pi = (\\pi_0, \\dots, \\pi_{H-1}) where \\pi_h : \\mathcal{S} \\to \\mathcal{A} for each\n\\hi \\in [\\hor].\n\nGeneral optimal control problem\\begin{split}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[\n \\left( \\sum_{\\hi=0}^{\\hor-1} c_\\hi(\\st_\\hi, \\act_\\hi) \\right) + c_\\hor(\\st_\\hor)\n \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi), \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & w_\\hi \\sim \\text{noise}\n\\end{split}","type":"content","url":"/control#optimal-control","position":5},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"A first attempt: Discretization","lvl2":"Optimal control"},"type":"lvl3","url":"/control#a-first-attempt-discretization","position":6},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"A first attempt: Discretization","lvl2":"Optimal control"},"content":"Can we solve this problem using tools from the finite MDP setting? If\n\\mathcal{S} and \\mathcal{A} were finite, then we’d be able to work backwards using the DP algorithm for computing the optimal policy in an MDP (\n\nDefinition 1.11).\nThis inspires us to try discretizing the\nproblem.\n\nSuppose \\mathcal{S} and \\mathcal{A} are bounded, that is,\n\\max_{\\st \\in \\mathcal{S}} \\|\\st\\| \\le B_\\st and\n\\max_{\\act \\in \\mathcal{A}} \\|\\act\\| \\le B_\\act. To make \\mathcal{S} and \\mathcal{A} finite,\nlet’s choose some small positive ε, and simply round each\ncoordinate to the nearest multiple of ε. For example, if\n\\epsilon = 0.01, then we round each element of \\st and \\act to two\ndecimal spaces.\n\nHowever, the discretized \\widetilde{\\mathcal{S}} and \\widetilde{\\mathcal{A}} may be finite, but\nthey may be infeasibly large: we must divide each dimension into\nintervals of length \\varepsilon, resulting in\n|\\widetilde{\\mathcal{S}}| = (B_\\st/\\varepsilon)^{n_\\st} and\n|\\widetilde{\\mathcal{A}}| = (B_\\act/\\varepsilon)^{n_\\act}. To get a sense of how\nquickly this grows, consider \\varepsilon = 0.01, n_\\st = n_\\act = 10.\nThen the number of elements in the transition matrix would be\n|\\widetilde{\\mathcal{S}}|^2 |\\widetilde{\\mathcal{A}}| = (100^{10})^2 (100^{10}) = 10^{60}! (That’s\na trillion trillion trillion trillion trillion.)\n\nWhat properties of the problem could we instead make use of? Note that\nby discretizing the state and action spaces, we implicitly assumed that\nrounding each state or action vector by some tiny amount \\varepsilon\nwouldn’t change the behavior of the system by much; namely, that the\ncost and dynamics were relatively continuous. Can we use this\ncontinuous structure in other ways? This leads us to the linear\nquadratic regulator.","type":"content","url":"/control#a-first-attempt-discretization","position":7},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"The Linear Quadratic Regulator"},"type":"lvl2","url":"/control#lqr","position":8},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"The Linear Quadratic Regulator"},"content":"The optimal control problem \n\nDefinition 2.1 seems highly complex in general. 
Is there a relevant simplification that we can analyze?\nThe linear quadratic regulator (LQR) is a solvable case and a fundamental tool in control theory.\n\nThe linear quadratic regulator\n\nThe LQR problem is a special case of the \n\nGeneral optimal control problem with linear dynamics and an upward-curved quadratic cost function.\nSolving the LQR problem will additionally enable us to locally approximate more complex setups using Taylor approximations.\n\nLinear, time-homogeneous dynamics: for each timestep \\hi \\in [\\hor],\\begin{aligned}\n \\st_{\\hi+1} &= f(\\st_\\hi, \\act_\\hi, w_\\hi) = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n \\text{where } w_\\hi &\\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}\n\nHere, w_\\hi is a spherical Gaussian noise term that makes the dynamics random.\nSetting \\sigma = 0 gives us deterministic state transitions.\nWe will find that the optimal policy actually does not depend on the noise, although the optimal value function and Q-function do.\n\nUpward-curved quadratic, time-homogeneous cost function:c(\\st_\\hi, \\act_\\hi) = \\begin{cases}\n \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi & \\hi < \\hor \\\\\n \\st_\\hi^\\top Q \\st_\\hi & \\hi = \\hor\n\\end{cases}.\n\nThis cost function attempts to stabilize the state and action about (s^\\star, a^\\star) = (0, 0).\nWe require Q \\in \\R^{n_\\st \\times n_\\st} and R \\in \\R^{n_\\act \\times n_\\act} to both be positive definite matrices so that c has a well-defined unique minimum.\nWe can furthermore assume without loss of generality that they are both symmetric (see exercise below).\n\nThis results in the LQR optimization problem:\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = A \\st_\\hi + B \\act_\\hi + w_\\hi \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I) \\\\\n & \\st_0 \\sim \\mu_0.\n\\end{aligned}\n\nExercise\n\nHere we’ll show that we don’t lose generality by assuming that Q and R are symmetric.\nShow that replacing Q and R with (Q + Q^\\top) / 2 and (R + R^\\top) / 2 (which are symmetric) yields the same cost function.\n\nWe will henceforth abbreviate “symmetric positive definite” as s.p.d.\nand “positive definite” as p.d.\n\nIt will be helpful to reintroduce the value function notation for a policy to denote the average cost it incurs.\nThese will be instrumental in constructing the optimal policy via dynamic programming,\nas we did in \n\nSection 1.3.2 for MDPs.\n\nValue functions for LQR\n\nGiven a policy \\mathbf{\\pi} = (\\pi_0, \\dots, \\pi_{\\hor-1}),\nwe can define its value function V^\\pi_\\hi : \\mathcal{S} \\to \\mathbb{R} at time \\hi \\in [\\hor] as the average cost-to-go incurred by that policy:\\begin{split}\n V^\\pi_\\hi (\\st) &= \\E \\left[ \\left( \\sum_{i=\\hi}^{\\hor-1} c(\\st_i, \\act_i) \\right) + c(\\st_\\hor) \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\right] \\\\\n &= \\E \\left[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_i^\\top Q \\st_i + \\act_i^\\top R \\act_i \\right) + \\st_\\hor^\\top Q \\st_\\hor \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\right] \\\\\n\\end{split}\n\nThe Q-function additionally conditions on the first action we take:\\begin{split}\n Q^\\pi_\\hi 
(\\st, \\act) &= \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} c(\\st_i, \\act_i) \\right) + c(\\st_\\hor) \\\\\n &\\qquad\\qquad \\mid (\\st_\\hi, \\act_\\hi) = (\\st, \\act), \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n &= \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_i^\\top Q \\st_i + \\act_i^\\top R \\act_i \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\qquad\\qquad \\mid (\\st_\\hi, \\act_\\hi) = (\\st, \\act), \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}\n\nNote that since we use cost instead of reward,\nthe best policies are the ones with smaller values of the value function.","type":"content","url":"/control#lqr","position":9},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"Optimality and the Riccati Equation"},"type":"lvl2","url":"/control#optimal-lqr","position":10},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"Optimality and the Riccati Equation"},"content":"In this section,\nwe’ll compute the optimal value function V^\\star_h,\nQ-function Q^\\star_h,\nand policy \\pi^\\star_h in \n\nthe linear quadratic regulator using dynamic programming\nin a very similar way to the DP algorithms \n\nin the MDP setting.\nRecall the definition of the optimal value function:\n\nOptimal value function in LQR\n\nThe optimal value function is the one that,\nat any time and in any state,\nachieves minimum cost across all policies:\\begin{split}\n V^\\star_\\hi(\\st) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} V^\\pi_\\hi(\\st) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi \\le i < H \\bigg] \\\\\n\\end{split}\n\nThe optimal Q-function is defined similarly,\nconditioned on the starting action as well:\\begin{split}\n Q^\\star_\\hi(\\st, \\act) &= \\min_{\\pi_\\hi, \\dots, \\pi_{\\hor-1}} Q^\\pi_\\hi(\\st, \\act) \\\\\n &= \\min_{\\pi_{\\hi}, \\dots, \\pi_{\\hor-1}} \\E \\bigg[ \\left( \\sum_{i=\\hi}^{\\hor-1} \\st_\\hi^\\top Q \\st_\\hi + \\act_\\hi^\\top R \\act_\\hi \\right) + \\st_\\hor^\\top Q \\st_\\hor \\\\\n &\\hspace{8em} \\mid \\st_\\hi = \\st, \\act_\\hi = \\act, \\act_i = \\pi_i(\\st_i) \\quad \\forall \\hi < i < H \\bigg] \\\\\n\\end{split}\n\nBoth of the definitions above assume deterministic policies. Otherwise we would have to take an expectation over actions drawn from the policy, i.e. \\act_\\hi \\sim \\pi_\\hi (\\st_\\hi).\n\nWe will prove the striking fact that the solution has very simple structure:\nV_h^\\star and Q^\\star_h are upward-curved quadratics\nand \\pi_h^\\star is linear and furthermore does not depend on the noise!\n\nOptimal value function in LQR is an upward-curved quadratic\n\nAt each timestep \\hi \\in [\\hor],V^\\star_\\hi(\\st) = \\st^\\top P_\\hi \\st + p_\\hi\n\nfor some s.p.d. 
matrix P_\\hi \\in \\mathbb{R}^{n_\\st \\times n_\\st} and scalar\np_\\hi \\in \\mathbb{R}.\n\nOptimal policy in LQR is linear\n\nAt each timestep \\hi \\in [\\hor],\\pi^\\star_\\hi (\\st) = - K_\\hi \\st\n\nfor some K_\\hi \\in \\mathbb{R}^{n_\\act \\times n_\\st}.\n(The negative is due to convention.)\n\nThe construction (and inductive proof) proceeds similarly to the one \n\nin the MDP setting.\n\nWe’ll compute V_\\hor^\\star (at the end of the horizon) as our base case.\n\nThen we’ll work step-by-step backwards in time, using V_{\\hi+1}^\\star to compute Q_\\hi^\\star, \\pi_{\\hi}^\\star, and V_\\hi^\\star. TODO insert reference for proof by induction \n\nBase case:\nAt the final timestep,\nthere are no possible actions to take,\nand so V^\\star_\\hor(\\st) = c(\\st) = \\st^\\top Q \\st.\nThus V_\\hor^\\star(\\st) = \\st^\\top P_\\hor \\st + p_\\hor\nwhere P_\\hor = Q and p_\\hor = 0.\n\nInductive hypothesis:\nWe seek to show that the inductive step holds for both theorems:\nIf V^\\star_{\\hi+1}(\\st) is an upward-curved quadratic,\nthen V^\\star_\\hi(\\st) must also be an upward-curved quadratic,\nand \\pi^\\star_\\hi(\\st) must be linear.\nWe’ll break this down into the following steps:\n\nShow that Q^\\star_\\hi(\\st, \\act) is an upward-curved quadratic (in both\n\\st and \\act).\n\nDerive the optimal policy\n\\pi^\\star_\\hi(\\st) = \\arg \\min_\\act Q^\\star_\\hi(\\st, \\act) and show\nthat it’s linear.\n\nShow that V^\\star_\\hi(\\st) is an upward-curved quadratic.\n\nWe first assume the inductive hypothesis that our theorems are true at\ntime \\hi+1. That is,V^\\star_{\\hi+1}(\\st) = \\st^\\top P_{\\hi+1} \\st + p_{\\hi+1} \\quad \\forall \\st \\in \\mathcal{S}.\n\nQ^\\star_\\hi(\\st, \\act) is an upward-curved quadratic\n\nLet us decompose Q^\\star_\\hi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}\ninto the immediate reward plus the expected cost-to-go:Q^\\star_\\hi(\\st, \\act) = c(\\st, \\act) + \\E_{\\st' \\sim f(\\st, \\act, w_{\\hi+1})} [V^\\star_{\\hi+1}(\\st')].\n\nRecall c(\\st, \\act) := \\st^\\top Q \\st + \\act^\\top R \\act.\nLet’s consider the expectation over the next timestep.\nThe only randomness in the dynamics comes from the noise\nw_{\\hi+1} \\sim \\mathcal{N}(0, \\sigma^2 I),\nso we can expand the expectation as:\\begin{aligned}\n & \\E_{\\st'} [V^\\star_{\\hi+1}(\\st')] \\\\\n {} = {} & \\E_{w_{\\hi+1}} [V^\\star_{\\hi+1}(A \\st + B \\act + w_{\\hi+1})] & & \\text{definition of } f \\\\\n {} = {} & \\E_{w_{\\hi+1}} [ (A \\st + B \\act + w_{\\hi+1})^\\top P_{\\hi+1} (A \\st + B \\act + w_{\\hi+1}) + p_{\\hi+1} ]. & & \\text{inductive hypothesis}\n\\end{aligned}\n\nSumming and combining like terms, we get\\begin{aligned}\n Q^\\star_\\hi(\\st, \\act) & = \\st^\\top Q \\st + \\act^\\top R \\act + \\E_{w_{\\hi+1}} [(A \\st + B \\act + w_{\\hi+1})^\\top P_{\\hi+1} (A \\st + B \\act + w_{\\hi+1}) + p_{\\hi+1}] \\\\\n & = \\st^\\top (Q + A^\\top P_{\\hi+1} A)\\st + \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act + 2 \\st^\\top A^\\top P_{\\hi+1} B \\act \\\\\n & \\qquad + \\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] + p_{\\hi+1}.\n\\end{aligned}\n\nNote that the terms that are linear in w_\\hi have mean\nzero and vanish. 
Now consider the remaining expectation over the noise.\nBy expanding out the product and using linearity of expectation, we can\nwrite this out as\\begin{aligned}\n \\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] & = \\sum_{i=1}^d \\sum_{j=1}^d (P_{\\hi+1})_{ij} \\E_{w_{\\hi+1}} [(w_{\\hi+1})_i (w_{\\hi+1})_j] \\\\\n & = \\sigma^2 \\mathrm{Tr}(P_{\\hi + 1})\n\\end{aligned}\n\nQuadratic forms\n\nWhen solving quadratic forms, i.e. expressions of the form x^\\top A x,\nit’s often helpful to consider the terms on the diagonal (i = j) separately from those off the diagonal.\n\nIn this case, the expectation of each diagonal term becomes(P_{\\hi+1})_{ii} \\E (w_{\\hi+1})_i^2 = \\sigma^2 (P_{\\hi+1})_{ii}.\n\nOff the diagonal, since the elements of w_{\\hi+1} are independent, the\nexpectation factors, and since each element has mean zero, the term\nvanishes:(P_{\\hi+1})_{ij} \\E [(w_{\\hi+1})_i] \\E [(w_{\\hi+1})_j] = 0.\n\nThus,\nthe only terms left are the ones on the diagonal,\nso the sum of these can be expressed as the trace of \\sigma^2 P_{\\hi+1}:\\E_{w_{\\hi+1}} [w_{\\hi+1}^\\top P_{\\hi+1} w_{\\hi+1}] = \\sigma^2 \\mathrm{Tr}(P_{\\hi+1}).\n\nSubstituting this back into the expression for Q^\\star_\\hi, we have:\\begin{aligned}\n Q^\\star_\\hi(\\st, \\act) & = \\st^\\top (Q + A^\\top P_{\\hi+1} A) \\st + \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act\n + 2\\st^\\top A^\\top P_{\\hi+1} B \\act \\\\\n & \\qquad + \\sigma^2 \\mathrm{Tr}(P_{\\hi+1}) + p_{\\hi+1}.\n\\end{aligned}\n\nAs we hoped, this expression is quadratic in \\st and \\act.\nFurthermore,\nwe’d like to show that it also curves upwards\nwith respect to \\act\nso that its minimum with respect to \\act is well-defined.\nWe can do this by noting that the Hessian matrix of second derivatives is positive definite:\\nabla_{\\act \\act} Q_\\hi^\\star(\\st, \\act) = R + B^\\top P_{\\hi+1} B\n\nSince R is s.p.d. (by \n\nthe LQR definition),\nand P_{\\hi+1} is s.p.d. 
(by the inductive hypothesis),\nthis sum must also be s.p.d.,\nand so Q^\\star_\\hi is indeed an upward-curved quadratic with respect to \\act.\n(If this isn’t clear, try proving it as an exercise.)\nThe proof of its upward curvature with respect to \\st is equivalent.\n\n\\pi^\\star_\\hi is linear\n\nSince Q^\\star_\\hi is an upward-curved quadratic,\nfinding its minimum over \\act is easy:\nwe simply set the gradient with respect to \\act equal to zero and solve for \\act.\nFirst, we calculate the gradient:\\begin{aligned}\n \\nabla_\\act Q^\\star_\\hi(\\st, \\act) & = \\nabla_\\act [ \\act^\\top (R + B^\\top P_{\\hi+1} B) \\act + 2 \\st^\\top A^\\top P_{\\hi+1} B \\act ] \\\\\n & = 2 (R + B^\\top P_{\\hi+1} B) \\act + 2 (\\st^\\top A^\\top P_{\\hi+1} B)^\\top\n\\end{aligned}\n\nSetting this to zero, we get\\begin{aligned}\n 0 & = (R + B^\\top P_{\\hi+1} B) \\pi^\\star_\\hi(\\st) + B^\\top P_{\\hi+1} A \\st \\nonumber \\\\\n \\pi^\\star_\\hi(\\st) & = (R + B^\\top P_{\\hi+1} B)^{-1} (-B^\\top P_{\\hi+1} A \\st) \\nonumber \\\\\n & = - K_\\hi \\st,\n\\end{aligned}\n\nwhereK_\\hi = (R + B^\\top P_{\\hi+1} B)^{-1} B^\\top P_{\\hi+1} A.\n\nNote that this optimal policy doesn’t depend on the starting distribution \\mu_0.\nIt’s also fully deterministic and isn’t affected by the noise terms\nw_0, \\dots, w_{\\hor-1}.\n\nV^\\star_\\hi(\\st) is an upward-curved quadratic\n\nUsing the identity V^\\star_\\hi(\\st) = Q^\\star_\\hi(\\st, \\pi^\\star(\\st)), we have:\\begin{aligned}\n V^\\star_\\hi(\\st) & = Q^\\star_\\hi(\\st, \\pi^\\star(\\st)) \\\\\n & = \\st^\\top (Q + A^\\top P_{\\hi+1} A) \\st + (-K_\\hi \\st)^\\top (R + B^\\top P_{\\hi+1} B) (-K_\\hi \\st)\n + 2\\st^\\top A^\\top P_{\\hi+1} B (-K_\\hi \\st) \\\\\n & \\qquad + \\mathrm{Tr}(\\sigma^2 P_{\\hi+1}) + p_{\\hi+1}\n\\end{aligned}\n\nNote that with respect to \\st,\nthis is the sum of a quadratic term and a constant,\nwhich is exactly what we were aiming for!\nThe scalar term is clearlyp_\\hi = \\mathrm{Tr}(\\sigma^2 P_{\\hi+1}) + p_{\\hi+1}.\n\nWe can simplify the quadratic term by substituting in K_\\hi from \n\n(2.23).\nNotice that when we do this,\nthe (R+B^\\top P_{\\hi+1} B) term in the expression is cancelled out by its inverse,\nand the remaining terms combine to give the Riccati equation:\n\nRiccati equationP_\\hi = Q + A^\\top P_{\\hi+1} A - A^\\top P_{\\hi+1} B (R + B^\\top P_{\\hi+1} B)^{-1} B^\\top P_{\\hi+1} A.\n\nThere are several nice properties to note about the Riccati equation:\n\nIt’s defined recursively.\nGiven the dynamics defined by A and B, and the state cost matrix Q,\nwe can recursively calculate P_\\hi across all timesteps starting from P_\\hor = Q.\n\nP_\\hi often appears in calculations surrounding optimality,\nsuch as V^\\star_\\hi, Q^\\star_\\hi, and \\pi^\\star_\\hi.\n\nTogether with the dynamics given by A and B,\nand the action coefficients R in the lost function,\nit fully defines the optimal policy \n\nLemma 2.2.\n\nIt remains to prove that V^\\star_\\hi curves upwards, that is, that P_\\hi is s.p.d. 
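To make the Riccati recursion above concrete, here is a minimal sketch (not from the original notes) of the backward pass that computes the cost matrices P_h, the scalars p_h, and the gains K_h for the time-homogeneous LQR. It uses plain NumPy rather than the notes' JAX setup, and A, B, Q, R, sigma2, H are placeholder inputs.

```python
import numpy as np

def lqr_riccati(A, B, Q, R, sigma2, H):
    """Backward Riccati recursion for the finite-horizon, time-homogeneous LQR.

    Returns P_0..P_H, p_0..p_H, and gains K_0..K_{H-1} such that
    V*_h(s) = s^T P_h s + p_h and pi*_h(s) = -K_h s.
    """
    P = [None] * (H + 1)
    p = [0.0] * (H + 1)
    K = [None] * H
    P[H] = Q.copy()  # base case: V*_H(s) = s^T Q s
    for h in reversed(range(H)):
        BtP = B.T @ P[h + 1]
        # K_h = (R + B^T P_{h+1} B)^{-1} B^T P_{h+1} A
        K[h] = np.linalg.solve(R + BtP @ B, BtP @ A)
        # Riccati equation: P_h = Q + A^T P_{h+1} A - A^T P_{h+1} B K_h
        P[h] = Q + A.T @ P[h + 1] @ A - A.T @ P[h + 1] @ B @ K[h]
        # scalar term: p_h = sigma^2 Tr(P_{h+1}) + p_{h+1}
        p[h] = sigma2 * np.trace(P[h + 1]) + p[h + 1]
    return P, p, K
```

Given these outputs, the optimal value function and policy at each timestep can be read off directly as the quadratic s.T @ P[h] @ s + p[h] and the linear map -K[h] @ s.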
We will use the following fact about Schur complements:\n\nPositive definiteness of Schur complements\n\nLetD = \\begin{pmatrix}\nA & B \\\\\nB^\\top & C\n\\end{pmatrix}\n\nbe a symmetric (m+n) \\times (m+n) block matrix,\nwhere A \\in \\R^{m \\times m}, B \\in \\R^{m \\times n}, C \\in \\R^{n \\times n}.\nThe Schur complement of A is denotedD/A = C - B^\\top A^{-1} B.\n\nSchur complements have various uses in linear algebra and numerical computation.\n\nA useful fact for us is that\nif A is positive definite,\nthen D is positive semidefinite\nif and only if D/A is positive semidefinite.\n\nLet P denote P_{\\hi + 1} for brevity.\nWe already know Q is p.d.,\nso it suffices to show thatS = P - P B (R + B^\\top P B)^{-1} B^\\top P\n\nis p.s.d. (positive semidefinite),\nsince left- and right- multiplying by A^\\top and A respectively\npreserves p.s.d.\nWe note that S is the Schur complement D/(R + B^\\top P B), whereD = \\begin{pmatrix}\nR + B^\\top P B & B^\\top P \\\\\nP B & P\n\\end{pmatrix}.\n\nThus we must show that D is p.s.d..\nThis can be seen by computing\\begin{aligned}\n\\begin{pmatrix}\ny^\\top & z^\\top\n\\end{pmatrix}\nD\n\\begin{pmatrix}\ny \\\\ z\n\\end{pmatrix}\n&= y^\\top R y + y^\\top B^\\top P B y + 2 y^\\top B^\\top P z + z^\\top P z \\\\\n&= y^\\top R y + (By + z)^\\top P (By + z) \\\\\n&> 0.\n\\end{aligned}\n\nSince R + B^\\top P B is p.d. and D is p.s.d.,\nthen S = D / (R + B^\\top P B) must be p.s.d.,\nand P_\\hi = Q + A S A^\\top must be p.d.\n\nNow we’ve shown that V^\\star_\\hi(\\st) = \\st^\\top P_\\hi \\st + p_\\hi,\nwhere P_\\hi is s.p.d.,\nproving the inductive hypothesis and completing the proof of \n\nTheorem 2.2 and \n\nTheorem 2.1.\n\nIn summary, we just demonstrated that at each timestep \\hi \\in [\\hor],\nthe optimal value function V^\\star_\\hi\nand optimal Q-function Q^\\star_\\hi are both upward-curved quadratics\nand the optimal policy \\pi^\\star_\\hi is linear.\nWe also showed that all of these quantities can be calculated\nusing a sequence of s.p.d. matrices P_0, \\dots, P_H\nthat can be defined recursively using the Riccati equation \n\nDefinition 2.5.\n\nBefore we move on to some extensions of LQR, let’s consider how the\nstate at time \\hi behaves when we act according to this optimal\npolicy.","type":"content","url":"/control#optimal-lqr","position":11},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Expected state at time \\hi","lvl2":"Optimality and the Riccati Equation"},"type":"lvl3","url":"/control#expected-state-at-time-hi","position":12},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Expected state at time \\hi","lvl2":"Optimality and the Riccati Equation"},"content":"How can we compute the expected state at time \\hi when acting\naccording to the optimal policy? Let’s first express \\st_\\hi in a\ncleaner way in terms of the history. Note that having linear dynamics\nmakes it easy to expand terms backwards in time:\\begin{aligned}\n \\st_\\hi & = A \\st_{\\hi-1} + B \\act_{\\hi-1} + w_{\\hi-1} \\\\\n & = A (A\\st_{\\hi-2} + B \\act_{\\hi-2} + w_{\\hi-2}) + B \\act_{\\hi-1} + w_{\\hi-1} \\\\\n & = \\cdots \\\\\n & = A^\\hi \\st_0 + \\sum_{i=0}^{\\hi-1} A^i (B \\act_{\\hi-i-1} + w_{\\hi-i-1}).\n\\end{aligned}\n\nLet’s consider the average state at this time, given all the past\nstates and actions. 
Since we assume that \\E [w_\\hi] = 0 (this is the\nzero vector in d dimensions), when we take an expectation, the w_\\hi\nterm vanishes due to linearity, and so we’re left with\\E [\\st_\\hi \\mid \\st_{0:(\\hi-1)}, \\act_{0:(\\hi-1)}] = A^\\hi \\st_0 + \\sum_{i=0}^{\\hi-1} A^i B \\act_{\\hi-i-1}.\n\nExercise\n\nShow that if we choose actions according to the optimal policy \n\nLemma 2.2, \n\n(2.33) becomes\\E [\\st_\\hi \\mid \\st_0, \\act_i = \\pi^\\star_i(\\st_i)\\quad \\forall i \\le \\hi] = \\left( \\prod_{i=0}^{\\hi-1} (A - B K_i) \\right) \\st_0.\n\nThis introdces the quantity A - B K_i, which shows up frequently in\ncontrol theory. For example, one important question is: will \\st_\\hi\nremain bounded, or will it go to infinity as time goes on? To answer\nthis, let’s imagine for simplicity that these K_is are equal (call\nthis matrix K). Then the expression above becomes (A-BK)^\\hi \\st_0.\nNow consider the maximum eigenvalue \\lambda_{\\max} of A - BK. If\n|\\lambda_{\\max}| > 1, then there’s some nonzero initial state\n\\bar \\st_0, the corresponding eigenvector, for which\\lim_{\\hi \\to \\infty} (A - BK)^\\hi \\bar \\st_0\n = \\lim_{\\hi \\to \\infty} \\lambda_{\\max}^\\hi \\bar \\st_0\n = \\infty.\n\nOtherwise, if |\\lambda_{\\max}| < 1, then it’s impossible for your original state to explode as dramatically.","type":"content","url":"/control#expected-state-at-time-hi","position":13},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"Extensions"},"type":"lvl2","url":"/control#extensions","position":14},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"Extensions"},"content":"We’ve now formulated an optimal solution for the time-homogeneous LQR\nand computed the expected state under the optimal policy. However, real\nworld tasks rarely have such simple dynamics, and we may wish to design\nmore complex cost functions. In this section, we’ll consider more\ngeneral extensions of LQR where some of the assumptions we made above\nare relaxed. Specifically, we’ll consider:\n\nTime-dependency, where the dynamics and cost function might\nchange depending on the timestep.\n\nGeneral quadratic cost, where we allow for linear terms and a\nconstant term.\n\nTracking a goal trajectory rather than aiming for a single goal\nstate-action pair.\n\nCombining these will allow us to use the LQR solution to solve more\ncomplex setups by taking Taylor approximations of the dynamics and\ncost functions.","type":"content","url":"/control#extensions","position":15},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Time-dependent dynamics and cost function","lvl2":"Extensions"},"type":"lvl3","url":"/control#time-dep-lqr","position":16},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Time-dependent dynamics and cost function","lvl2":"Extensions"},"content":"So far, we’ve considered the time-homogeneous case, where the dynamics\nand cost function stay the same at every timestep. However, this might\nnot always be the case. As an example, in many sports, the rules and\nscoring system might change during an overtime period. To address these\nsorts of problems, we can loosen the time-homogeneous restriction, and\nconsider the case where the dynamics and cost function are\ntime-dependent. 
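As a small side illustration of the stability discussion above (an illustration only, not part of the original notes), the following NumPy sketch computes the expected state (A - BK)^h s_0 under a fixed placeholder gain K and checks whether the closed-loop matrix A - BK has spectral radius below one.

```python
import numpy as np

def expected_state(A, B, K, s0, h):
    """E[s_h | s_0] when acting with the fixed gain u_i = -K s_i:
    the zero-mean noise terms drop out, leaving (A - B K)^h s_0."""
    return np.linalg.matrix_power(A - B @ K, h) @ s0

def closed_loop_stable(A, B, K):
    """True iff A - B K has spectral radius < 1, so that the expected
    state stays bounded as h grows rather than blowing up."""
    return np.max(np.abs(np.linalg.eigvals(A - B @ K))) < 1.0
```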
Our analysis remains almost identical; in fact, we can\nsimply add a time index to the matrices A and B that determine the\ndynamics and the matrices Q and R that determine the cost.\n\nThe modified problem is now defined as follows:\n\nTime-dependent LQR\\begin{aligned}\n \\min_{\\pi_{0}, \\dots, \\pi_{\\hor-1}} \\quad & \\E \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} (\\st_\\hi^\\top Q_\\hi \\st_\\hi) + \\act_\\hi^\\top R_\\hi \\act_\\hi \\right) + \\st_\\hor^\\top Q_\\hor \\st_\\hor \\right] \\\\\n \\textrm{where} \\quad & \\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + w_\\hi \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & \\act_\\hi = \\pi_\\hi (\\st_\\hi) \\\\\n & w_\\hi \\sim \\mathcal{N}(0, \\sigma^2 I).\n\\end{aligned}\n\nThe derivation of the optimal value functions and the optimal policy\nremains almost exactly the same, and we can modify the Riccati equation\naccordingly:\n\nTime-dependent Riccati EquationP_\\hi = Q_\\hi + A_\\hi^\\top P_{\\hi+1} A_\\hi - A_\\hi^\\top P_{\\hi+1} B_\\hi (R_\\hi + B_\\hi^\\top P_{\\hi+1} B_\\hi)^{-1} B_\\hi^\\top P_{\\hi+1} A_\\hi.\n\nNote that this is just the time-homogeneous Riccati equation\n(\n\nDefinition 2.5), but with the time index added to each of the\nrelevant matrices.\n\nExercise\n\nWalk through the proof in \n\nSection 2.4 to verify that we can simply add \\hi for the time-dependent case.\n\nAdditionally, by allowing the dynamics to vary across time, we gain the\nability to locally approximate nonlinear dynamics at each timestep.\nWe’ll discuss this later in the chapter.","type":"content","url":"/control#time-dep-lqr","position":17},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"More general quadratic cost functions","lvl2":"Extensions"},"type":"lvl3","url":"/control#more-general-quadratic-cost-functions","position":18},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"More general quadratic cost functions","lvl2":"Extensions"},"content":"Our original cost function had only second-order terms with respect to\nthe state and action, incentivizing staying as close as possible to\n(\\st^\\star, \\act^\\star) = (0, 0). We can also consider more general\nquadratic cost functions that also have first-order terms and a constant\nterm. Combining this with time-dependent dynamics results in the\nfollowing expression, where we introduce a new matrix M_\\hi for the\ncross term, linear coefficients q_\\hi and r_\\hi for the state and\naction respectively, and a constant term c_\\hi:c_\\hi(\\st_\\hi, \\act_\\hi) = ( \\st_\\hi^\\top Q_\\hi \\st_\\hi + \\st_\\hi^\\top M_\\hi \\act_\\hi + \\act_\\hi^\\top R_\\hi \\act_\\hi ) + (\\st_\\hi^\\top q_\\hi + \\act_\\hi^\\top r_\\hi) + c_\\hi.\n\nSimilarly, we can also include a\nconstant term v_\\hi \\in \\mathbb{R}^{n_\\st} in the dynamics (note that this is\ndeterministic at each timestep, unlike the stochastic noise w_\\hi):\\st_{\\hi+1} = f_\\hi(\\st_\\hi, \\act_\\hi, w_\\hi) = A_\\hi \\st_\\hi + B_\\hi \\act_\\hi + v_\\hi + w_\\hi.\n\nexercise\n\nDerive the optimal solution. 
You will need to slightly modify the\nproof in \n\nSection 2.4.","type":"content","url":"/control#more-general-quadratic-cost-functions","position":19},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Tracking a predefined trajectory","lvl2":"Extensions"},"type":"lvl3","url":"/control#tracking-a-predefined-trajectory","position":20},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Tracking a predefined trajectory","lvl2":"Extensions"},"content":"Consider applying LQR to a task like autonomous driving, where the\ntarget state-action pair changes over time. We might want the vehicle to\nfollow a predefined trajectory of states and actions\n(\\st_\\hi^\\star, \\act_\\hi^\\star)_{\\hi=0}^{\\hor-1}. To express this as a\ncontrol problem, we’ll need a corresponding time-dependent cost\nfunction:c_\\hi(\\st_\\hi, \\act_\\hi) = (\\st_\\hi - \\st^\\star_\\hi)^\\top Q (\\st_\\hi - \\st^\\star_\\hi) + (\\act_\\hi - \\act^\\star_\\hi)^\\top R (\\act_\\hi - \\act^\\star_\\hi).\n\nNote that this punishes states and actions that are far from the\nintended trajectory. By expanding out these multiplications, we can see\nthat this is actually a special case of the more general quadratic cost\nfunction above \n\n(2.38):M_\\hi = 0, \\qquad q_\\hi = -2Q \\st^\\star_\\hi, \\qquad r_\\hi = -2R \\act^\\star_\\hi, \\qquad c_\\hi = (\\st^\\star_\\hi)^\\top Q (\\st^\\star_\\hi) + (\\act^\\star_\\hi)^\\top R (\\act^\\star_\\hi).","type":"content","url":"/control#tracking-a-predefined-trajectory","position":21},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"Approximating nonlinear dynamics"},"type":"lvl2","url":"/control#approx-nonlinear","position":22},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"Approximating nonlinear dynamics"},"content":"The LQR algorithm solves for the optimal policy when the dynamics are\nlinear and the cost function is an upward-curved quadratic. However,\nreal settings are rarely this simple! Let’s return to the CartPole\nexample from the start of the chapter\n(\n\nExample 2.1). The dynamics (physics) aren’t linear. How\ncan we approximate this by an LQR problem?\n\nConcretely, let’s consider a noise-free problem since, as we saw, the\nnoise doesn’t factor into the optimal policy. Let’s assume the dynamics\nand cost function are stationary, and ignore the terminal state for\nsimplicity:\n\nNonlinear control problem\\begin{aligned}\n \\min_{\\pi_0, \\dots, \\pi_{\\hor-1} : \\mathcal{S} \\to \\mathcal{A}} \\quad & \\E_{\\st_0} \\left[ \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\act_\\hi) \\right] \\\\\n \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\act_\\hi) \\\\\n & \\act_\\hi = \\pi_\\hi(\\st_\\hi) \\\\\n & \\st_0 \\sim \\mu_0 \\\\\n & c(\\st, \\act) = d(\\st, \\st^\\star) + d(\\act, \\act^\\star).\n\\end{aligned}\n\nHere, d denotes a function that measures the\n“distance” between its two arguments.\n\nThis is now only slightly simplified from the general optimal control\nproblem (see\n\n\nDefinition 2.1). Here, we don’t know an analytical form\nfor the dynamics f or the cost function c, but we assume that we’re\nable to query/sample/simulate them to get their values at a given\nstate and action. To clarify, consider the case where the dynamics are\ngiven by real world physics. We can’t (yet) write down an expression for\nthe dynamics that we can differentiate or integrate analytically.\nHowever, we can still simulate the dynamics and cost function by\nrunning a real-world experiment and measuring the resulting states and\ncosts. 
How can we adapt LQR to this more general nonlinear case?","type":"content","url":"/control#approx-nonlinear","position":23},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Local linearization","lvl2":"Approximating nonlinear dynamics"},"type":"lvl3","url":"/control#local-linearization","position":24},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Local linearization","lvl2":"Approximating nonlinear dynamics"},"content":"How can we apply LQR when the dynamics are nonlinear or the cost\nfunction is more complex? We’ll exploit the useful fact that we can take\na function that’s locally continuous around (s^\\star, a^\\star) and\napproximate it nearby with low-order polynomials (i.e. its Taylor\napproximation). In particular, as long as the dynamics f are\ndifferentiable around (\\st^\\star, \\act^\\star) and the cost function\nc is twice differentiable at (\\st^\\star, \\act^\\star), we can take a\nlinear approximation of f and a quadratic approximation of c to\nbring us back to the regime of LQR.\n\nLinearizing the dynamics around (\\st^\\star, \\act^\\star) gives:\\begin{gathered}\n f(\\st, \\act) \\approx f(\\st^\\star, \\act^\\star) + \\nabla_\\st f(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act f(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n (\\nabla_\\st f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\st_j}, \\quad i, j \\le n_\\st \\qquad (\\nabla_\\act f(\\st, \\act))_{ij} = \\frac{d f_i(\\st, \\act)}{d \\act_j}, \\quad i \\le n_\\st, j \\le n_\\act\n\\end{gathered}\n\nand quadratizing the cost function around\n(\\st^\\star, \\act^\\star) gives:\\begin{aligned}\n c(\\st, \\act) & \\approx c(\\st^\\star, \\act^\\star) \\quad \\text{constant term} \\\\\n & \\qquad + \\nabla_\\st c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) + \\nabla_\\act c(\\st^\\star, \\act^\\star) (a - \\act^\\star) \\quad \\text{linear terms} \\\\\n & \\left. \\begin{aligned}\n & \\qquad + \\frac{1}{2} (\\st - \\st^\\star)^\\top \\nabla_{\\st \\st} c(\\st^\\star, \\act^\\star) (\\st - \\st^\\star) \\\\\n & \\qquad + \\frac{1}{2} (\\act - \\act^\\star)^\\top \\nabla_{\\act \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star) \\\\\n & \\qquad + (\\st - \\st^\\star)^\\top \\nabla_{\\st \\act} c(\\st^\\star, \\act^\\star) (\\act - \\act^\\star)\n \\end{aligned} \\right\\} \\text{quadratic terms}\n\\end{aligned}\n\nwhere the gradients and Hessians are defined as\\begin{aligned}\n (\\nabla_\\st c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\st_i}, \\quad i \\le n_\\st\n & (\\nabla_\\act c(\\st, \\act))_{i} & = \\frac{d c(\\st, \\act)}{d \\act_i}, \\quad i \\le n_\\act \\\\\n (\\nabla_{\\st \\st} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\st_j}, \\quad i, j \\le n_\\st\n & (\\nabla_{\\act \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\act_i d \\act_j}, \\quad i, j \\le n_\\act \\\\\n (\\nabla_{\\st \\act} c(\\st, \\act))_{ij} & = \\frac{d^2 c(\\st, \\act)}{d \\st_i d \\act_j}. \\quad i \\le n_\\st, j \\le n_\\act\n\\end{aligned}\n\nExercise: Note that this cost can be expressed in the general\nquadratic form seen in\n\n\n(2.38). 
Derive the corresponding\nquantities Q, R, M, q, r, c.","type":"content","url":"/control#local-linearization","position":25},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Finite differencing","lvl2":"Approximating nonlinear dynamics"},"type":"lvl3","url":"/control#finite-differencing","position":26},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Finite differencing","lvl2":"Approximating nonlinear dynamics"},"content":"To calculate these gradients and Hessians in practice,\nwe use a method known as finite differencing for numerically computing derivatives.\nNamely, we can simply use the limit definition of the derivative, and\nsee how the function changes as we add or subtract a tiny δ to\nthe input.\\frac{d}{dx} f(x) = \\lim_{\\delta \\to 0} \\frac{f(x + \\delta) - f(x)}{\\delta}\n\nNote that this only requires us to be able to query the function, not\nto have an analytical expression for it, which is why it’s so useful in\npractice.","type":"content","url":"/control#finite-differencing","position":27},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Local convexification","lvl2":"Approximating nonlinear dynamics"},"type":"lvl3","url":"/control#local-convexification","position":28},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Local convexification","lvl2":"Approximating nonlinear dynamics"},"content":"However, simply taking the second-order approximation of the cost\nfunction is insufficient, since for the LQR setup we required that the\nQ and R matrices were positive definite, i.e. that all of their\neigenvalues were positive.\n\nOne way to naively force some symmetric matrix D to be positive definite\nis to set any non-positive eigenvalues to some small positive value \\varepsilon > 0.\nRecall that any real symmetric matrix D \\in \\mathbb{R}^{n \\times n} has an basis of eigenvectors u_1, \\dots, u_n\nwith corresponding eigenvalues \\lambda_1, \\dots, \\lambda_n\nsuch that D u_i = \\lambda_i u_i.\nThen we can construct the positive definite approximation by\\widetilde{D} = \\left( \\sum_{i=1, \\dots, n \\mid \\lambda_i > 0} \\lambda_i u_i u_i^\\top \\right) + \\varepsilon I.\n\nExercise: Convince yourself that \\widetilde{D} is indeed positive\ndefinite.\n\nNote that Hessian matrices are generally symmetric, so we can apply this\nprocess to Q and R to obtain the positive definite approximations\n\\widetilde{Q} and \\widetilde{R}.\nNow that we have an upward-curved\nquadratic approximation to the cost function, and a linear approximation\nto the state transitions, we can simply apply the time-homogenous LQR\nmethods from \n\nSection 2.4.\n\nBut what happens when we enter states far away from \\st^\\star or want\nto use actions far from \\act^\\star? 
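The finite differencing and eigenvalue-clipping steps described above can be sketched in a few lines of NumPy. This is only an illustration, assuming query access to the functions being differentiated; the step size delta and the floor eps are placeholder values.

```python
import numpy as np

def jacobian_fd(f, x, delta=1e-5):
    """Finite-difference Jacobian of f : R^n -> R^m at x,
    built column by column from the limit definition of the derivative."""
    y0 = np.asarray(f(x))
    J = np.zeros((y0.shape[0], x.shape[0]))
    for j in range(x.shape[0]):
        e = np.zeros_like(x, dtype=float)
        e[j] = delta
        J[:, j] = (np.asarray(f(x + e)) - y0) / delta
    return J

def make_psd(D, eps=1e-3):
    """Keep only the positive-eigenvalue part of a symmetric matrix D,
    then add eps * I so that the result is positive definite."""
    lam, U = np.linalg.eigh(D)
    lam = np.where(lam > 0, lam, 0.0)
    return (U * lam) @ U.T + eps * np.eye(D.shape[0])
```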
A Taylor approximation is only\naccurate in a local region around the point of linearization, so the\nperformance of our LQR controller will degrade as we move further away.\nWe’ll see how to address this in the next section using the iterative LQR algorithm.\n\n\n\nFigure 2.3:Local linearization might only be accurate in a small region around the\npoint of linearization.","type":"content","url":"/control#local-convexification","position":29},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Iterative LQR","lvl2":"Approximating nonlinear dynamics"},"type":"lvl3","url":"/control#iterative-lqr","position":30},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl3":"Iterative LQR","lvl2":"Approximating nonlinear dynamics"},"content":"To address these issues with local linearization, we’ll use an iterative\napproach, where we repeatedly linearize around different points to\ncreate a time-dependent approximation of the dynamics, and then solve\nthe resulting time-dependent LQR problem to obtain a better policy. This\nis known as iterative LQR or iLQR:\n\nIterative LQR\n\nFor each iteration of the algorithm:\n\nForm a time-dependent LQR problem around the current candidate\ntrajectory using local linearization.\n\nCompute the optimal policy using \n\nSection 2.5.1.\n\nGenerate a new series of actions using this policy.\n\nCompute a better candidate trajectory by interpolating between the\ncurrent and proposed actions.\n\nNow let’s go through the details of each step. We’ll use superscripts to\ndenote the iteration of the algorithm. We’ll also denote\n\\bar \\st_0 = \\E_{\\st_0 \\sim \\mu_0} [\\st_0] as the expected initial\nstate.\n\nAt iteration i of the algorithm, we begin with a candidate\ntrajectory\n\\bar \\tau^i = (\\bar \\st^i_0, \\bar \\act^i_0, \\dots, \\bar \\st^i_{\\hor-1}, \\bar \\act^i_{\\hor-1}).\n\nStep 1: Form a time-dependent LQR problem. At each timestep\n\\hi \\in [\\hor], we use the techniques from\n\n\nSection 2.6 to linearize the dynamics and\nquadratize the cost function around (\\bar \\st^i_\\hi, \\bar \\act^i_\\hi):\\begin{aligned}\n f_\\hi(\\st, \\act) & \\approx f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\nabla_{\\st } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\st - \\bar {\\st}^i_\\hi) + \\nabla_{\\act } f(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)(\\act - \\bar {\\act}^i_\\hi) \\\\\n c_\\hi(\\st, \\act) & \\approx c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) + \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} \\begin{bmatrix}\n \\nabla_{\\st } c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\\\\\n \\nabla_{\\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix} \\\\\n & \\qquad + \\frac{1}{2} \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi& \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix} \\begin{bmatrix}\n \\nabla_{\\st \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\st \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) \\\\\n \\nabla_{\\act \\st} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi) & \\nabla_{\\act \\act} c(\\bar {\\st}^i_\\hi, \\bar {\\act}^i_\\hi)\n \\end{bmatrix}\n \\begin{bmatrix}\n \\st - \\bar {\\st }^i_\\hi\\\\\n \\act - \\bar {\\act}^i_\\hi\n \\end{bmatrix}.\n\\end{aligned}\n\nStep 2: Compute the optimal policy. We can now solve the\ntime-dependent LQR problem using the Riccati equation from\n\n\nSection 2.5.1 to compute the optimal policy\n\\pi^i_0, \\dots, \\pi^i_{\\hor-1}.\n\nStep 3: Generate a new series of actions. 
We can then generate a new\nsample trajectory by taking actions according to this optimal policy:\\bar \\st^{i+1}_0 = \\bar \\st_0, \\qquad \\widetilde \\act_\\hi = \\pi^i_\\hi(\\bar \\st^{i+1}_\\hi), \\qquad \\bar \\st^{i+1}_{\\hi+1} = f(\\bar \\st^{i+1}_\\hi, \\widetilde \\act_\\hi).\n\nNote that the states are sampled according to the true dynamics, which\nwe assume we have query access to.\n\nStep 4: Compute a better candidate trajectory. Note that we’ve\ndenoted these actions as \\widetilde \\act_\\hi and aren’t directly using\nthem for the next iteration \\bar \\act^{i+1}_\\hi. Rather, we want to\ninterpolate between them and the actions from the previous iteration\n\\bar \\act^i_0, \\dots, \\bar \\act^i_{\\hor-1}. This is so that the cost\nwill decrease monotonically, since if the new policy turns out to\nactually be worse, we can stay closer to the previous trajectory. (Can\nyou think of an intuitive example where this might happen?)\n\nFormally, we want to find \\alpha \\in [0, 1] to generate the next\niteration of actions\n\\bar \\act^{i+1}_0, \\dots, \\bar \\act^{i+1}_{\\hor-1} such that the cost\nis minimized:\\begin{aligned}\n    \\min_{\\alpha \\in [0, 1]} \\quad & \\sum_{\\hi=0}^{\\hor-1} c(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n    \\text{where} \\quad & \\st_{\\hi+1} = f(\\st_\\hi, \\bar \\act^{i+1}_\\hi) \\\\\n    & \\bar \\act^{i+1}_\\hi = \\alpha \\bar \\act^i_\\hi + (1-\\alpha) \\widetilde \\act_\\hi \\\\\n    & \\st_0 = \\bar \\st_0.\n\\end{aligned}\n\nNote that this optimizes over the closed interval\n[0, 1], so by the Extreme Value Theorem, it’s guaranteed to attain its\nglobal minimum.\n\nThe final output of this algorithm is a policy \\pi^{n_\\text{steps}}\nderived after n_\\text{steps} of the algorithm. Though the proof is\nsomewhat complex, one can show that for many nonlinear control problems,\nthis solution converges to a locally optimal solution (in the policy\nspace).","type":"content","url":"/control#iterative-lqr","position":31},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"Summary"},"type":"lvl2","url":"/control#summary","position":32},{"hierarchy":{"lvl1":"2 Linear Quadratic Regulators","lvl2":"Summary"},"content":"This chapter introduced some approaches to solving different variants of\nthe optimal control problem\n\n\nDefinition 2.1. We began with the simple case of linear\ndynamics and an upward-curved quadratic cost. This model is called the\nLQR and we solved for the optimal policy using dynamic programming. We\nthen extended these results to the more general nonlinear case via local\nlinearization. We finally saw the iterative LQR algorithm for solving\nnonlinear control problems.","type":"content","url":"/control#summary","position":33},{"hierarchy":{"lvl1":"9 Exploration in MDPs"},"type":"lvl1","url":"/exploration","position":0},{"hierarchy":{"lvl1":"9 Exploration in MDPs"},"content":"","type":"content","url":"/exploration","position":1},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl2":"Introduction"},"type":"lvl2","url":"/exploration#introduction","position":2},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl2":"Introduction"},"content":"One of the key challenges of reinforcement learning is the exploration-exploitation tradeoff. Should we exploit actions we know will give high reward, or should we explore different actions to discover potentially better strategies? An algorithm that doesn’t explore effectively might easily overfit to certain areas of the state space, and fail to generalize once it enters a region it hasn’t yet seen. 
The algorithms we saw in the chapter on fitted DP \n\n5 Fitted Dynamic Programming Algorithms suffer from this issue.\n\nIn \n\n3 Multi-Armed Bandits, where the state never changes so all we care about are the actions, we saw algorithms like \n\nSection 3.6 and \n\nThompson sampling that incentivize the learner to explore arms that it is uncertain about. In this chapter, we will see how to generalize these ideas to the MDP setting.\n\nPer-episode regret\n\nTo quantify the performance of a learning algorithm, we will consider its per-episode regret over T timesteps/episodes:\\text{Regret}_T = \\E\\left[ \\sum_{t=0}^{T-1} V^\\star_0(s_0) - V^{\\pi^t}_0(s_0) \\right]\n\nwhere \\pi^t is the policy generated by the algorithm at the tth iteration.","type":"content","url":"/exploration#introduction","position":3},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Sparse reward","lvl2":"Introduction"},"type":"lvl3","url":"/exploration#sparse-reward","position":4},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Sparse reward","lvl2":"Introduction"},"content":"Exploration is especially crucial in sparse reward problems where reward doesn’t come until after many steps, and algorithms which do not systematically explore new states may fail to learn anything meaningful (within a reasonable amount of time).\n\nFor example, policy gradient algorithms require the gradient to be nonzero in order to learn. If we never observe any reward, the gradient will always be zero, and the policy will never change or improve.\n\nSparse Reward MDP\n\nHere’s a simple example of an MDP with sparse reward:\n\nThere are |\\mathcal{S}| states. The agent starts in the leftmost state. In every state, there are three possible actions, two of which move the agent left and one which moves the agent right. The reward function assigns r=1 to the rightmost cell.","type":"content","url":"/exploration#sparse-reward","position":5},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Exploration in deterministic MDPs","lvl2":"Introduction"},"type":"lvl3","url":"/exploration#exploration-in-deterministic-mdps","position":6},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Exploration in deterministic MDPs","lvl2":"Introduction"},"content":"Let us address the exploration problem in a deterministic MDP where taking action a in state s always leads to the state P(s, a) \\in \\mathcal{S}. In this simple setting, there will be no “automatic” exploration due to randomness, so our strategy must actively explore new states. One simple strategy is to visit every possible state-action pair to learn the entire MDP. Then, once the MDP is known, we can use DP to solve for the optimal policy. (This should remind you of the \n\nSection 3.4 algorithm.)\n\nExplore-then-exploit (for deterministic MDPs)\n\nWe’ll keep a set K of all the (s, a, r, s') pairs we’ve observed. Each episode, we’ll choose an unseen state-action pair for which the reward and the next state are unknown, and take the shortest path there. We assume that every state can be reached from the initial state within a single episode. :::{algorithmic}\n$K \\gets \\emptyset$ Using our known transitions $K$, compute the shortest path $\\tilde \\pi$ to $(s, a)$ Execute $\\tilde \\pi$ to visit $(s, a)$ and observe $r = r(s, a), s' = P(s, a)$ $K \\gets K \\cup \\{ (s, a, r, s') \\}$ Compute the optimal policy $\\pi^\\star$ in the MDP $K$ (e.g. using policy iteration). $\\pi^\\star$.\n::: \n\nThe shortest path computation can be implemented using DP. 
We leave this as an exercise.\n\nPerformance of explore-then-exploit\n\nAs long as every state can be reached from s_0 within a single episode, i.e. |\\mathcal{S}| \\le \\hor, this will eventually be able to explore all |\\mathcal{S}| |\\mathcal{A}| state-action pairs, adding one new transition per episode. We know it will take at most |\\mathcal{S}| |\\mathcal{A}| iterations to explore the entire MDP, after which \\pi^t = \\pi^\\star, incurring no additional regret.\nFor each \\pi^t up until then, corresponding to the shortest-path policies \\tilde \\pi, the value of policy \\pi^t will differ from that of \\pi^\\star by at most \\hor, since the policies will differ by at most 1 reward at each timestep. So,\\sum_{t=0}^{T-1} V^\\star_0 - V_0^{\\pi^t} \\le |\\mathcal{S}||\\mathcal{A}| \\hor.\n\n(Note that this MDP and algorithm are deterministic, so the regret is not random.)","type":"content","url":"/exploration#exploration-in-deterministic-mdps","position":7},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl2":"Treating an unknown MDP as a MAB"},"type":"lvl2","url":"/exploration#mdp-mab","position":8},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl2":"Treating an unknown MDP as a MAB"},"content":"We also explored the exploration-exploitation tradeoff in \n\n3 Multi-Armed Bandits. Recall tthat in the MAB setting, we have K arms, each of which has an unknown reward distribution, and we want to learn which of the arms is optimal, i.e. has the highest mean reward.\n\nOne algorithm that struck a good balance between exploration and exploitation was the upper confidence bound algorithm \n\nSection 3.6: For each arm, we construct a confidence interval for its true mean award, and then choose the arm with the highest upper confidence bound. In summary,k_{t+1} \\gets \\arg\\max_{k \\in [K]} \\frac{R^{k}_t}{N^{k}_t} + \\sqrt{\\frac{\\ln(2t/\\delta)}{2 N^{k}_t}}\n\nwhere N_t^k indicates the number of times arm k has been pulled up until time t, R_t^k indicates the total reward obtained by pulling arm k up until time t, and \\delta > 0 controls the width of the confidence interval. How might we extend UCB to the MDP case?\n\nLet us formally describe an unknown MDP as an MAB problem. In an unknown MDP, we want to learn which policy is optimal. So if we want to apply MAB techniques to solving an MDP, it makes sense to think of arms as policies. There are K = (|\\mathcal{A}|^{|\\mathcal{S}|})^\\hor deterministic policies in a finite MDP. Then, “pulling” arm π corresponds to using π to act through a trajectory in the MDP, and observing the total reward.\n\nAttention\n\nWhich quantity that we have seen so far equals the mean reward from arm π?\n\nRecall that UCB incurs regret \\tilde{O}(\\sqrt{TK}), where T is the number of pulls and K is the number of arms. So in the MDP-as-MAB problem, using UCB for T episodes would achieve regret\\tilde{O}(\\sqrt{|\\mathcal{A}|^{|\\mathcal{S}|\\hor} T})\n\nThis scales exponentially in |\\mathcal{S}| and \\hor, which quickly becomes intractable. Notably, this method doesn’t consider the information that we gain across different policies. We can illustrate this with the following example:\n\nTreating an MDP as a MAB\n\nConsider a “coin MDP” with two states “heads” and “tails”, two actions “Y” and “N”, and a time horizon of \\hor=2. The state transition flips the coin, and doesn’t depend on the action. 
The reward only depends on the action: Taking action Y gives reward 1, and taking action N gives reward 0.\n\nSuppose we collect data from the two constant policies \\pi_{\\text{Y}}(s) = \\text{Y} and \\pi_{\\text{N}}(s) = \\text{N}. Now we want to learn about the policy \\tilde{\\pi} that takes action Y and then N. Do we need to collect data from \\tilde{\\pi} to evaluate it? No: Since the reward only depends on the action, we can infer its value from our data on the policies \\pi_{\\text{Y}} and \\pi_{\\text{N}}. However, if we treat the MDP as a bandit in which \\tilde{\\pi} is a new, unknown arm, we ignore the known correlation between the action and the reward.","type":"content","url":"/exploration#mdp-mab","position":9},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl2":"UCB-VI"},"type":"lvl2","url":"/exploration#ucb-vi","position":10},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl2":"UCB-VI"},"content":"The approach above is inefficient: We shouldn’t need to consider all |\\mathcal{A}|^{|\\mathcal{S}| H} deterministic policies to achieve low regret. Rather, all we need to describe the optimal policy is Q^\\star, which has H |\\mathcal{S}||\\mathcal{A}| entries to be learned. Can we borrow ideas from UCB to reduce the regret to this order (i.e. polynomial in |\\mathcal{S}|, |\\mathcal{A}|, and H)?\n\nOne way to frame the UCB algorithm is that, when choosing arms, we optimize over a proxy reward that is the sum of the estimated mean reward and an exploration term. In the UCB-VI algorithm, we will extend this idea to the case of an unknown MDP \\mathcal{M}^{?} by modelling a proxy MDP \\tilde{\\mathcal{M}} with a reward function that encourages exploration. Then, we will use DP to solve for the optimal policy in \\tilde{\\mathcal{M}}.\n\nAssumptions: For simplicity, here we assume the reward function of \\mathcal{M}^{?} is known, so we only need to model the state transitions, though the rewards can be modelled similarly. We will also consider the more general case of a time-varying MDP, where the transition and reward functions can change over time. We take the convention that P_\\hi is the distribution of s_{h+1} \\mid s_{h}, a_{h} and r_\\hi is applied to s_\\hi, a_\\hi.\n\nAt a high level, the UCB-VI algorithm can be described as follows:\n\nModelling: Use previous data to model the transitions \\hat{P}_0, \\dots, \\hat{P}_{H-1}.\n\nReward bonus: Design a reward bonus b_\\hi(s, a) \\in \\mathbb{R} to encourage exploration, analogous to the UCB term.\n\nOptimistic planning: Use DP to compute the optimal policy \\hat \\pi_\\hi(s) in the modelled MDP\\tilde{\\mathcal{M}} = (\\mathcal{S}, \\mathcal{A}, \\{ \\hat{P}_\\hi \\}_{h \\in [H]}, \\{ r_\\hi + b_\\hi \\}_{h \\in [H]}, H).\n\nExecution: Use \\hat \\pi_\\hi(s) to collect a new trajectory, and repeat.\n\nWe detail each of these steps below. The full definition follows in \n\n(9.16).","type":"content","url":"/exploration#ucb-vi","position":11},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Modelling the transitions","lvl2":"UCB-VI"},"type":"lvl3","url":"/exploration#modelling-the-transitions","position":12},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Modelling the transitions","lvl2":"UCB-VI"},"content":"We seek to approximate P_\\hi(s_{h+1} \\mid s_\\hi, a_\\hi) = \\frac{\\pr(s_\\hi, a_\\hi, s_{h+1})}{\\pr(s_\\hi, a_\\hi)}. We can estimate these using their sample probabilities from the dataset. 
That is, define\\begin{aligned}\n N_\\hi^t(s, a, s') & := \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i, s_{h+1}^i) = (s, a, s') } \\\\\n N_\\hi^t(s, a) & := \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i) = (s, a) } \\\\\n\\end{aligned}\n\nThen we can model\\hat{P}_\\hi^t(s' \\mid s, a) = \\frac{N_\\hi^t(s, a, s')}{N_\\hi^t(s, a)}.\n\nNote that this is also a fairly naive, nonparametric estimator that doesn’t assume any underlying structure of the MDP. We’ll see how to incorporate assumptions about the MDP in the following section.","type":"content","url":"/exploration#modelling-the-transitions","position":13},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Reward bonus","lvl2":"UCB-VI"},"type":"lvl3","url":"/exploration#reward-bonus","position":14},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Reward bonus","lvl2":"UCB-VI"},"content":"To motivate the reward bonus term b_\\hi^t(s, a), recall how we designed the reward bonus term for UCB:\n\nWe used Hoeffding’s inequality to bound, with high probability, how far the sample mean \\hat \\mu_t^k deviated from the true mean \\mu^k.\n\nBy inverting this inequality, we obtained a (1-\\delta)-confidence interval for the true mean, centered at our estimate.\n\nTo make this bound uniform across all timesteps t \\in [T], we applied the union bound and multiplied δ by a factor of T.\n\nWe’d like to do the same for UCB-VI, and construct the bonus term such that V^\\star_\\hi(s) \\le \\hat{V}_\\hi^t(s) with high probability. However, our construction will be more complex than the MAB case, since \\hat{V}_\\hi^t(s) depends on the bonus b_\\hi^t(s, a) implicitly via DP. We claim that the bonus term that gives the proper bound isb_\\hi^t(s, a) = 2 H \\sqrt{\\frac{\\log( |\\mathcal{S}||\\mathcal{A}|H T/\\delta )}{N_\\hi^t(s, a)}}.\n\nWe will only provide a heuristic sketch of the proof; see \n\nAgarwal et al. (2022) (Section 7.3) for a full proof.\n\nUCB-VI reward bonus construction\n\nWe aim to show that, with high probability,V_\\hi^\\star(s) \\le \\hat{V}_\\hi^t(s) \\quad \\forall t \\in [T], h \\in [H], s \\in \\mathcal{S}.\n\nWe’ll do this by bounding the error incurred at each step of DP. Recall that DP solves for \\hat{V}_\\hi^t(s) recursively as follows:\\hat{V}_\\hi^t(s) = \\max_{a \\in \\mathcal{A}} \\left[ \\tilde r^t_\\hi(s, a) + \\E_{s' \\sim \\hat{P}_\\hi^t(\\cdot \\mid s, a)} \\left[ \\hat{V}_{h+1}^t(s') \\right] \\right]\n\nwhere \\tilde r^t_\\hi(s, a) = r_\\hi(s, a) + b_\\hi^t(s, a) is the reward function of our modelled MDP \\tilde{\\mathcal{M}}^t. On the other hand, we know that V^\\star must satisfyV^\\star_\\hi(s) = \\max_{a \\in \\mathcal{A}} \\left[ \\tilde r^t_\\hi(s, a) + \\E_{s' \\sim P^?_\\hi(\\cdot \\mid s, a)} [V^\\star_{\\hi+1}(s')] \\right]\n\nso it suffices to bound the difference between the two inner expectations. There are two sources of error:\n\nThe value functions \\hat{V}^t_{h+1} v.s. V^\\star_{h+1}\n\nThe transition probabilities \\hat{P}_\\hi^t v.s. P^?_\\hi.\n\nWe can bound these individually, and then combine them by the triangle inequality. For the former, we can simply bound the difference by H, assuming that the rewards are within [0, 1]. Now, all that is left is to bound the error from the transition probabilities:\\text{error} = \\left| \\E_{s' \\sim \\hat{P}_\\hi^t(\\cdot \\mid s, a)} \\left[ V^\\star_{h+1}(s') \\right] - \\E_{s' \\sim P^?_\\hi(\\cdot \\mid s, a)} \\left[ V^\\star_{h+1}(s') \\right]. \\right|\n\nLet us bound this term for a fixed s, a, h, t. 
(Later we can make this uniform across s, a, h, t using the union bound.) Note that expanding out the definition of \\hat{P}_\\hi^t gives\\begin{aligned}\n \\E_{s' \\sim \\hat{P}_\\hi^t(\\cdot \\mid s, a)} \\left[ V^\\star_{h+1}(s') \\right] & = \\sum_{s' \\in \\mathcal{S}} \\frac{N^t_\\hi(s, a, s')}{N^t_\\hi(s, a)} V^\\star_{h+1}(s') \\\\\n & = \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\sum_{s' \\in \\mathcal{S}} \\ind{ (s_\\hi^i, a_\\hi^i, s_{h+1}^i) = (s, a, s') } V^\\star_{h+1}(s') \\\\\n & = \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\underbrace{\\ind{ (s_\\hi^i, a_\\hi^i) = (s, a) } V^\\star_{h+1}(s_{h+1}^i)}_{X^i}\n\\end{aligned}\n\nsince the terms where s' \\neq s_{h+1}^i vanish.\n\nNow, in order to apply Hoeffding’s inequality, we would like to express the second term in \n\n(9.12) as a sum over t random variables as well. We will do this by redundantly averaging over all desired trajectories (i.e. where we visit state s and action a at time h):\\begin{aligned}\n \\E_{s' \\sim P^?_\\hi(\\cdot \\mid s, a)} \\left[ V^\\star_{h+1}(s') \\right]\n & = \\sum_{s' \\in \\mathcal{S}} P^?_\\hi(s' \\mid s, a) V^\\star_{h+1}(s') \\\\\n & = \\sum_{s' \\in \\mathcal{S}} \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i) = (s, a) } P^?_\\hi(s' \\mid s, a) V^\\star_{h+1}(s') \\\\\n & = \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\E_{s_{h+1}^i \\sim P^?_{h}(\\cdot \\mid s_\\hi^i, a_\\hi^i)} X^i.\n\\end{aligned}\n\nNow we can apply Hoeffding’s inequality to X^i - \\E_{s_{h+1}^i \\sim P^?_{h}(\\cdot \\mid s_\\hi^i, a_\\hi^i)} X^i, which is bounded by \\hor, to obtain that, with probability at least 1-\\delta,\\text{error} = \\left| \\frac{1}{N^t_\\hi(s, a)} \\sum_{i=0}^{t-1} \\left(X^i - \\E_{s_{h+1}^i \\sim P^?_{h}(\\cdot \\mid s_\\hi^i, a_\\hi^i)} X^i \\right) \\right| \\le 2 H \\sqrt{\\frac{\\ln(1/\\delta)}{N_\\hi^t(s, a)}}.\n\nApplying a union bound over all s \\in \\mathcal{S}, a \\in \\mathcal{A}, t \\in [T], h \\in [H] gives the b_\\hi^t(s, a) term above.","type":"content","url":"/exploration#reward-bonus","position":15},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Definition","lvl2":"UCB-VI"},"type":"lvl3","url":"/exploration#definition","position":16},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Definition","lvl2":"UCB-VI"},"content":"Putting these parts together, we can define the algorithm as follows:3 + 1 = 4 TODO :::{algorithmic}\n$N_\\hi(s, a, s') \\gets \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i, s_{h+1}^i) = (s, a, s') }$ $N_\\hi(s, a) \\gets \\sum_{i=0}^{t-1} \\ind{ (s_\\hi^i, a_\\hi^i) = (s, a) }$ $\\hat P_\\hi \\gets \\frac{N_\\hi(s, a, s')}{N_\\hi(s, a)}$ $b_\\hi(s, a) \\gets 2 H \\sqrt{\\frac{\\log( |\\mathcal{S}||\\mathcal{A}|H T/\\delta )}{N_\\hi(s, a)}}$ $\\tilde{\\mathcal{M}} \\gets (\\mathcal{S}, \\mathcal{A}, \\{ \\hat{P}_\\hi \\}_{h \\in [H-1]}, \\{ r_\\hi + b_\\hi \\}_{h \\in [H-1]}, H)$ $\\hat \\pi \\gets \\text{VI}(\\tilde{\\mathcal{M}})$ Use $\\hat \\pi_h(s)$ to collect a new trajectory $(s^t_\\hi, a^t_\\hi, s^t_{\\hi+1})_{\\hi \\in [\\hor]}$\n::: ","type":"content","url":"/exploration#definition","position":17},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Performance of UCB-VI","lvl2":"UCB-VI"},"type":"lvl3","url":"/exploration#performance-of-ucb-vi","position":18},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Performance of UCB-VI","lvl2":"UCB-VI"},"content":"How exactly does UCB-VI strike a good balance between exploration and exploitation? 
In UCB for MABs, the bonus exploration term is simple to interpret: It encourages the learner to take actions with a high exploration term. Here, the policy depends on the bonus term indirectly: The policy is obtained by planning in an MDP where the bonus term is added to the reward function. Note that the bonuses propagate backwards in DP, effectively enabling the learner to plan to explore unknown states. This effect takes some further interpretation.\n\nRecall we constructed b^t_\\hi so that, with high probability, V^\\star_\\hi(s) \\le \\hat{V}_\\hi^t(s) and soV^\\star_\\hi(s) - V^{\\pi^t}_\\hi(s) \\le \\hat{V}_\\hi^t(s) - V^{\\pi^t}_\\hi(s).\n\nThat is, the l.h.s. measures how suboptimal policy \\pi^t is in the true environment, while the r.h.s. is the difference in the policy’s value when acting in the modelled MDP \\tilde{\\mathcal{M}}^t instead of the true one \\mathcal{M}^{?}.\n\nIf the r.h.s. is small, this implies that the l.h.s. difference is also small, i.e. that \\pi^t is exploiting actions that are giving high reward.\n\nIf the r.h.s. is large, then we have overestimated the value: \\pi^t, the optimal policy of \\tilde{\\mathcal{M}}^t, does not perform well in the true environment \\mathcal{M}^{?}. This indicates that one of the b_h^t(s, a) terms must be large, or some \\hat P^t_\\hi(\\cdot \\mid s, a) must be inaccurate, indicating a state-action pair with a low visit count N^t_\\hi(s, a) that the learner was encouraged to explore.\n\nIt turns out that UCB-VI achieves a per-episode regret of\n\nUCB-VI regret\\E \\left[ \\sum_{t=0}^{T-1} \\left(V^\\star_0(s_0) - V^{\\pi^t}_0(s_0) \\right) \\right] = \\tilde{O}(H^2 \\sqrt{|\\mathcal{S}| |\\mathcal{A}| T})\n\nComparing this to the UCB regret bound \\tilde{O}(\\sqrt{T K}), where K is the number of arms of the MAB, we see that we’ve reduced the number of effective arms from |\\mathcal{A}|^{|\\mathcal{S}|\\hor} (in \n\n(9.4)) to H^4 |\\mathcal{S}||\\mathcal{A}|, which is indeed polynomial in |\\mathcal{S}|, |\\mathcal{A}|, and H, as desired. This is also roughly the number of episodes it takes to achieve constant-order average regret:\\frac{1}{T} \\E[\\text{Regret}_T] = \\tilde{O}\\left(\\sqrt{\\frac{H^4 |\\mathcal{S}||\\mathcal{A}|}{T}}\\right)\n\nNote that the time-dependent transition matrix has H |\\mathcal{S}|^2 |\\mathcal{A}| entries. Assuming H \\ll |\\mathcal{S}|, this shows that it’s possible to achieve low regret, and achieve a near-optimal policy, while only understanding a 1/|\\mathcal{S}| fraction of the world’s dynamics.","type":"content","url":"/exploration#performance-of-ucb-vi","position":19},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl2":"Linear MDPs"},"type":"lvl2","url":"/exploration#linear-mdps","position":20},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl2":"Linear MDPs"},"content":"A polynomial dependency on |\\mathcal{S}| and |\\mathcal{A}| is manageable when the state and action spaces are small. But for large or continuous state and action spaces, even this polynomial factor will become intractable. Can we find algorithms that don’t depend on |\\mathcal{S}| or |\\mathcal{A}| at all, effectively reducing the dimensionality of the MDP? 
In this section, we’ll explore linear MDPs: an example of a parameterized MDP where the rewards and state transitions depend only on some parameter space of dimension d that is independent from |\\mathcal{S}| or |\\mathcal{A}|.\n\nLinear MDP\n\nWe assume that the transition probabilities and rewards are linear in some feature vector\n\n\\phi(s, a) \\in \\mathbb{R}^d:\\begin{aligned}\n P_\\hi(s' \\mid s, a) & = \\phi(s, a)^\\top \\mu^\\star_\\hi(s') \\\\\n r_\\hi(s, a) & = \\phi(s, a)^\\top \\theta_\\hi^\\star\n\\end{aligned}\n\nNote that we can also think of P_\\hi(\\cdot \\mid s, a) = \\mu_\\hi^\\star as an |\\mathcal{S}| \\times d matrix, and think of \\mu^\\star_\\hi(s') as indexing into the s'-th row of this matrix (treating it as a column vector). Thinking of V^\\star_{\\hi+1} as an |\\mathcal{S}|-dimensional vector, this allows us to write\\E_{s' \\sim P_\\hi(\\cdot \\mid s, a)}[V^\\star_{\\hi+1}(s)] = (\\mu^\\star_\\hi \\phi(s, a))^\\top V^\\star_{\\hi+1}.\n\nThe ϕ feature mapping can be designed to capture interactions between the state s and action a. In this book, we’ll assume that the feature map \\phi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}^d and the reward function (described by \\theta_\\hi^\\star) are known to the learner.","type":"content","url":"/exploration#linear-mdps","position":21},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Planning in a linear MDP","lvl2":"Linear MDPs"},"type":"lvl3","url":"/exploration#planning-in-a-linear-mdp","position":22},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"Planning in a linear MDP","lvl2":"Linear MDPs"},"content":"It turns out that Q^\\star_\\hi is also linear with respect to this feature mapping. We can prove this by simply computing it using DP. We initialize V_{H}^\\star(s) = 0 \\forall s. Then we iterate:\\begin{aligned}\n Q^\\star_\\hi(s, a) & = r_\\hi(s, a) + \\E_{s' \\sim P_\\hi(\\cdot \\mid s, a)} [V^\\star_{h+1}(s')] \\\\\n & = \\phi(s, a)^\\top \\theta_\\hi^\\star + (\\mu_\\hi^\\star \\phi(s, a))^\\top V^\\star_{h+1} \\\\\n & = \\phi(s, a)^\\top \\underbrace{( \\theta_\\hi^\\star + (\\mu_\\hi^\\star)^\\top V^\\star_{h+1})}_{w_\\hi} \\\\\n V^\\star_\\hi(s) & = \\max_a Q^\\star_\\hi(s, a) \\\\\n \\pi^\\star_\\hi(s) & = \\arg\\max_a Q^\\star_\\hi(s, a)\n\\end{aligned}\n\nAttention\n\nShow that Q^\\pi_\\hi is also linear with respect to \\phi(s, a) for any policy π.","type":"content","url":"/exploration#planning-in-a-linear-mdp","position":23},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"UCB-VI in a linear MDP","lvl2":"Linear MDPs"},"type":"lvl3","url":"/exploration#lin-ucb-vi","position":24},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl3":"UCB-VI in a linear MDP","lvl2":"Linear MDPs"},"content":"","type":"content","url":"/exploration#lin-ucb-vi","position":25},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl4":"Modelling the transitions","lvl3":"UCB-VI in a linear MDP","lvl2":"Linear MDPs"},"type":"lvl4","url":"/exploration#modelling-the-transitions-1","position":26},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl4":"Modelling the transitions","lvl3":"UCB-VI in a linear MDP","lvl2":"Linear MDPs"},"content":"This linear assumption on the MDP will also allow us to model the unknown dynamics P^?_\\hi(s' \\mid s, a) with techniques from supervised learning (SL). Recall that SL is useful for estimating conditional expectations by minimizing mean squared error. 
We can rephrase the estimation of P^?_\\hi(s' \\mid s, a) as a least-squares problem as follows: Write \\delta_s to denote a one-hot vector in \\mathbb{R}^{|\\mathcal{S}|}, with a 1 in the s-th entry and 0 everywhere else. Note that\\E_{s' \\sim P_h(\\cdot \\mid s, a)} [\\delta_{s'}] = P_h(\\cdot \\mid s, a) = \\mu_h^\\star \\phi(s, a).\n\nFurthermore, since the expectation here is linear with respect to \\phi(s, a), we can directly apply least-squares multi-target linear regression to construct the estimate\\hat \\mu = \\arg\\min_{\\mu \\in \\mathbb{R}^{|\\mathcal{S}| \\times d}} \\sum_{t=0}^{T-1} \\|\\mu \\phi(s_h^i, a_h^i) - \\delta_{s_{h+1}^i} \\|_2^2.\n\nThis has a well-known closed-form solution:\\begin{aligned}\n \\hat \\mu^\\top & = (A_h^t)^{-1} \\sum_{i=0}^{t-1} \\phi(s_h^i, a_h^i) \\delta_{s_{h+1}^i}^\\top \\\\\n \\text{where} \\quad A_h^t & = \\sum_{i=0}^{t-1} \\phi(s_h^i, a_h^i) \\phi(s_h^i, a_h^i)^\\top + \\lambda I\n\\end{aligned}\n\nwhere we include a \\lambda I term to ensure that the matrix A^t_h is invertible. (This can also be derived by adding a \\lambda \\|\\mu\\|_{\\text{F}}^2 regularization term to the objective.) We can directly plug in this estimate into \\hat{P}^t_h(\\cdot \\mid s, a) = \\hat \\mu^t_h \\phi(s, a).","type":"content","url":"/exploration#modelling-the-transitions-1","position":27},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl4":"Reward bonus","lvl3":"UCB-VI in a linear MDP","lvl2":"Linear MDPs"},"type":"lvl4","url":"/exploration#reward-bonus-1","position":28},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl4":"Reward bonus","lvl3":"UCB-VI in a linear MDP","lvl2":"Linear MDPs"},"content":"Now, to design the reward bonus, we can’t apply Hoeffding anymore, since the terms no longer involve sample means of bounded random variables; Instead, we’re incorporating information across different states and actions. Rather, we can construct an upper bound using Chebyshev’s inequality in the same way we did for the LinUCB algorithm in the MAB setting \n\nSection 3.8.1:b^t_\\hi(s, a) = \\beta \\sqrt{\\phi(s, a)^\\top (A^t_h)^{-1} \\phi(s, a)}, \\quad \\beta = \\tilde O(d \\hor).\n\nNote that this isn’t explicitly inversely proportional to N_h^t(s, a) as in the original UCB-VI bonus term \n\n(9.8). Rather, it is inversely proportional to the amount that the direction \\phi(s, a) has been explored in the history. That is, if A_h^t has a large component in the direction \\phi(s, a), implying that this direction is well explored, then the bonus term will be small, and vice versa.\n\nWe can now plug in these transition estimates and reward bonuses into the UCB-VI algorithm \n\n(9.16).","type":"content","url":"/exploration#reward-bonus-1","position":29},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl4":"Performance","lvl3":"UCB-VI in a linear MDP","lvl2":"Linear MDPs"},"type":"lvl4","url":"/exploration#performance","position":30},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl4":"Performance","lvl3":"UCB-VI in a linear MDP","lvl2":"Linear MDPs"},"content":"LinUCB-VI regret\n\nThe LinUCB-VI algorithm achieves expected regret\\E[\\text{Regret}_T] = \\E\\left[\\sum_{t=0}^{T-1} V^\\star_0(s_0) - V^{\\pi^t}_0(s_0) \\right] \\le \\tilde O(H^2 d^{1.5} \\sqrt{T})\n\nComparing this to our bound for UCB-VI in an environment without this linear assumption, we see that we go from a sample complexity of \\tilde \\Omega(H^4 |\\mathcal{S}||\\mathcal{A}|) to \\tilde \\Omega(H^4 d^{3}). 
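As a rough sketch of the least-squares transition estimate and the elliptical bonus above (an illustration under placeholder shapes, not the notes' reference implementation), suppose that for a fixed timestep h we have stacked the observed feature vectors into a matrix Phi of shape (t, d) and recorded the indices of the observed next states:

```python
import numpy as np

def fit_transition_model(Phi, next_states, num_states, lam=1.0):
    """Regularized least-squares estimate of mu_h from features Phi (t x d)
    and the indices of the observed next states (length t)."""
    t, d = Phi.shape
    A = Phi.T @ Phi + lam * np.eye(d)               # A_h^t
    targets = np.eye(num_states)[next_states]       # one-hot vectors delta_{s'}
    mu_hat = np.linalg.solve(A, Phi.T @ targets).T  # shape (|S|, d)
    return mu_hat, A

def elliptical_bonus(phi_sa, A, beta):
    """Exploration bonus beta * sqrt(phi^T A^{-1} phi): small in feature
    directions that are already well explored, large in unexplored ones."""
    return beta * np.sqrt(phi_sa @ np.linalg.solve(A, phi_sa))
```

The estimated transition distribution at a new state-action pair is then mu_hat @ phi(s, a), and the bonus plays the role of the count-based UCB-VI bonus in the tabular case.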
This new sample complexity only depends on the feature dimension and not on the state or action space of the MDP!","type":"content","url":"/exploration#performance","position":31},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl2":"Summary"},"type":"lvl2","url":"/exploration#summary","position":32},{"hierarchy":{"lvl1":"9 Exploration in MDPs","lvl2":"Summary"},"content":"In this chapter, we’ve explored how to explore in an unknown MDP.\n\nWe first discussed the explore-then-exploit algorithm \n\nDefinition 9.2, a simple way to explore a deterministic MDP by visiting all state-action pairs.\n\nWe then discussed how to treat an unknown MDP as a MAB \n\nSection 9.2, and how this approach is inefficient since it doesn’t make use of relationships between policies.\n\nWe then introduced the UCB-VI algorithm \n\n(9.16), which models the unknown MDP by a proxy MDP with a reward bonus term that encourages exploration.\n\nFinally, assuming that the transitions and rewards are linear with respect to a feature transformation of the state and action, we introduced the LinUCB-VI algorithm \n\nSection 9.4.2, which has a sample complexity independent of the size of the state and action spaces.","type":"content","url":"/exploration#summary","position":33},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms"},"type":"lvl1","url":"/fitted-dp","position":0},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms"},"content":"","type":"content","url":"/fitted-dp","position":1},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms","lvl2":"Introduction"},"type":"lvl2","url":"/fitted-dp#introduction","position":2},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms","lvl2":"Introduction"},"content":"We borrow these definitions from the \n\n1 Markov Decision Processes chapter:\n\nfrom typing import NamedTuple, Callable, Optional\nfrom jaxtyping import Float, Array\nimport jax.numpy as np\nfrom jax import grad, vmap\nimport jax.random as rand\nfrom tqdm import tqdm\nimport gymnasium as gym\n\nkey = rand.PRNGKey(184)\n\n\nclass Transition(NamedTuple):\n s: int\n a: int\n r: float\n\n\nTrajectory = list[Transition]\n\n\ndef get_num_actions(trajectories: list[Trajectory]) -> int:\n \"\"\"Get the number of actions in the dataset. Assumes actions range from 0 to A-1.\"\"\"\n return max(max(t.a for t in τ) for τ in trajectories) + 1\n\n\nState = Float[Array, \"...\"] # arbitrary shape\n\n# assume finite `A` actions and f outputs an array of Q-values\n# i.e. Q(s, a, h) is implemented as f(s, h)[a]\nQFunction = Callable[[State, int], Float[Array, \" A\"]]\n\n\ndef Q_zero(A: int) -> QFunction:\n \"\"\"A Q-function that always returns zero.\"\"\"\n return lambda s, a: np.zeros(A)\n\n\n# a deterministic time-dependent policy\nPolicy = Callable[[State, int], int]\n\n\ndef q_to_greedy(Q: QFunction) -> Policy:\n \"\"\"Get the greedy policy for the given state-action value function.\"\"\"\n return lambda s, h: np.argmax(Q(s, h))\n\nThe \n\n1 Markov Decision Processes chapter discussed the case of finite MDPs, where the state and action spaces \\mathcal{S} and \\mathcal{A} were finite.\nThis gave us a closed-form expression for computing the r.h.s. 
of \n\nthe Bellman one-step consistency equation.\nIn this chapter, we consider the case of large or continuous state spaces, where the state space is too large to be enumerated.\nIn this case, we need to approximate the value function and Q-function using methods from supervised learning.\n\nWe will first take a quick detour to introduce the empirical risk minimization framework for function approximation.\nWe will then see its application to fitted RL algorithms,\nwhich attempt to learn the optimal value function (and the optimal policy) from a dataset of trajectories.","type":"content","url":"/fitted-dp#introduction","position":3},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms","lvl2":"Empirical risk minimization"},"type":"lvl2","url":"/fitted-dp#erm","position":4},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms","lvl2":"Empirical risk minimization"},"content":"The supervised learning task is as follows:\nWe seek to learn the relationship between some input variables x and some output variable y\n(drawn from their joint distribution).\nPrecisely, we want to find a function \\hat f : x \\mapsto y that minimizes the\nsquared error of the prediction:\\hat f = \\arg\\min_{f} \\E[(y - f(x))^2]\n\nAn equivalent framing is that we seek to approximate the conditional expectation of y given x:\n\nConditional expectation minimizes mean squared error\\arg\\min_{f} \\E[(y - f(x))^2] = (x \\mapsto \\E[y \\mid x])\n\nWe can decompose the mean squared error as\\begin{aligned}\n\\E[(y - f(x))^2] &= \\E[ (y - \\E[y \\mid x] + \\E[y \\mid x] - f(x))^2 ] \\\\\n&= \\E[ (y - \\E[y \\mid x])^2 ] + \\E[ (\\E[y \\mid x] - f(x))^2 ] + 2 \\E[ (y - \\E[y \\mid x])(\\E[y \\mid x] - f(x)) ] \\\\\n\\end{aligned}\n\nAttention\n\nUse the law of iterated expectations to show that the last term is zero.\n\nThe first term is the irreducible error, and the second term is the error due to the approximation,\nwhich is minimized at 0 when f(x) = \\E[y \\mid x].\n\nIn most applications, the joint distribution of x, y is unknown or extremely complex, and so we can’t\nanalytically evaluate \\E [y \\mid x].\nInstead, our strategy is to draw N samples (x_i, y_i) from the joint distribution of x and y,\nand then use the sample average \\sum_{i=1}^N (y_i - f(x_i))^2 / N to approximate the mean squared error.\nThen we use a fitting method to find a function \\hat f that minimizes this objective\nand thus approximates the conditional expectation.\nThis approach is called empirical risk minimization.\n\nEmpirical risk minimization\n\nGiven a dataset of samples (x_1, y_1), \\dots, (x_N, y_N), empirical risk minimization seeks to find a function f (from some class of functions \\mathcal{F}) that minimizes the empirical risk:\\hat f = \\arg\\min_{f \\in \\mathcal{F}} \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2\n\nWe will cover the details of the minimization process in [](#the next section ).\n\nAttention\n\nWhy is it important that we constrain our search to a class of functions \\mathcal{F}?\n\nHint: Consider the function f(x) = \\sum_{i=1}^N y_i \\mathbb{1}_{\\{ x = x_i \\}}. What is the empirical risk of this function? 
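For concreteness, here is a toy sketch of empirical risk minimization in which the function class \mathcal{F} is restricted to linear functions f(x) = x^\top w. That restriction is an illustrative choice made only for this sketch, not one made by the text.

```python
import jax.numpy as jnp

def erm_linear(X, y):
    """Fit f(x) = x @ w by minimizing the empirical risk (1/N) sum_i (y_i - f(x_i))^2."""
    w, *_ = jnp.linalg.lstsq(X, y)
    return lambda x: x @ w

# usage sketch: given N samples (x_i, y_i) from the joint distribution,
# f_hat = erm_linear(X, y) approximates x -> E[y | x] within the linear class.
```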
Would you consider it a good approximation of the conditional expectation?","type":"content","url":"/fitted-dp#erm","position":5},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms","lvl2":"Fitted value iteration"},"type":"lvl2","url":"/fitted-dp#fitted-value-iteration","position":6},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms","lvl2":"Fitted value iteration"},"content":"Let us apply ERM to the RL problem of computing the optimal policy / value function.\n\nHow did we compute the optimal value function in MDPs with finite state and action spaces?\n\nIn a [](#finite-horizon MDP ), we can use \n\ndynamic programming, working backwards from the end of the time horizon, to compute the optimal value function exactly.\n\nIn an [](#infinite-horizon MDP ), we can use [](#value iteration ), which iterates the Bellman optimality operator \n\n(1.54) to approximately compute the optimal value function.\n\nOur existing approaches represent the value function, and the MDP itself,\nin matrix notation.\nBut what happens if the state space is extremely large, or even infinite (e.g. real-valued)?\nThen computing a weighted sum over all possible next states, which is required to compute the Bellman operator,\nbecomes intractable.\n\nInstead, we will need to use function approximation methods from supervised learning to solve for the value function in an alternative way.\n\nIn particular, suppose we have a dataset of N trajectories \\tau_1, \\dots, \\tau_N \\sim \\rho_{\\pi} from some policy π (called the data collection policy) acting in the MDP of interest.\nLet us indicate the trajectory index in the superscript, so that\\tau_i = \\{ s_0^i, a_0^i, r_0^i, s_1^i, a_1^i, r_1^i, \\dots, s_{\\hor-1}^i, a_{\\hor-1}^i, r_{\\hor-1}^i \\}.\n\ndef collect_data(\n env: gym.Env, N: int, H: int, key: rand.PRNGKey, π: Optional[Policy] = None\n) -> list[Trajectory]:\n \"\"\"Collect a dataset of trajectories from the given policy (or a random one).\"\"\"\n trajectories = []\n seeds = [rand.bits(k).item() for k in rand.split(key, N)]\n for i in tqdm(range(N)):\n τ = []\n s, _ = env.reset(seed=seeds[i])\n for h in range(H):\n # sample from a random policy\n a = π(s, h) if π else env.action_space.sample()\n s_next, r, terminated, truncated, _ = env.step(a)\n τ.append(Transition(s, a, r))\n if terminated or truncated:\n break\n s = s_next\n trajectories.append(τ)\n return trajectories\n\nenv = gym.make(\"LunarLander-v2\")\ntrajectories = collect_data(env, 100, 300, key)\ntrajectories[0][:5] # show first five transitions from first trajectory\n\nCan we view the dataset of trajectories as a “labelled dataset” in order to apply supervised learning to approximate the optimal Q-function? Yes!\nRecall that we can characterize the optimal Q-function using the \n\nBellman optimality equations,\nwhich don’t depend on an actual policy:Q_\\hi^\\star(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [\\max_{a'} Q_{\\hi+1}^\\star(s', a')]\n\nWe can think of the arguments to the Q-function -- i.e. the current state, action, and timestep \\hi --\nas the inputs x, and the r.h.s. of the above equation as the label f(x). Note that the r.h.s. 
can also be expressed as a conditional expectation:f(x) = \\E [y \\mid x] \\quad \\text{where} \\quad y = r(s_\\hi, a_\\hi) + \\max_{a'} Q^\\star_{\\hi + 1}(s', a').\n\nApproximating the conditional expectation is precisely the task that \n\nSection 5.2 is suited for!\n\nOur above dataset would give us N \\cdot \\hor samples in the dataset:x_{i \\hi} = (s_\\hi^i, a_\\hi^i, \\hi) \\qquad y_{i \\hi} = r(s_\\hi^i, a_\\hi^i) + \\max_{a'} Q^\\star_{\\hi + 1}(s_{\\hi + 1}^i, a')\n\ndef get_X(trajectories: list[Trajectory]):\n \"\"\"\n We pass the state and timestep as input to the Q-function\n and return an array of Q-values.\n \"\"\"\n rows = [(τ[h].s, τ[h].a, h) for τ in trajectories for h in range(len(τ))]\n return [np.stack(ary) for ary in zip(*rows)]\n\n\ndef get_y(\n trajectories: list[Trajectory],\n f: Optional[QFunction] = None,\n π: Optional[Policy] = None,\n):\n \"\"\"\n Transform the dataset of trajectories into a dataset for supervised learning.\n If `π` is None, instead estimates the optimal Q function.\n Otherwise, estimates the Q function of π.\n \"\"\"\n f = f or Q_zero(get_num_actions(trajectories))\n y = []\n for τ in trajectories:\n for h in range(len(τ) - 1):\n s, a, r = τ[h]\n Q_values = f(s, h + 1)\n y.append(r + (Q_values[π(s, h + 1)] if π else Q_values.max()))\n y.append(τ[-1].r)\n return np.array(y)\n\ns, a, h = get_X(trajectories[:1])\nprint(\"states:\", s[:5])\nprint(\"actions:\", a[:5])\nprint(\"timesteps:\", h[:5])\n\nget_y(trajectories[:1])[:5]\n\nThen we can use empirical risk minimization to find a function \\hat f that approximates the optimal Q-function.\n\n# We will see some examples of fitting methods in the next section\nFittingMethod = Callable[[Float[Array, \"N D\"], Float[Array, \" N\"]], QFunction]\n\nBut notice that the definition of y_{i \\hi} depends on the Q-function itself!\nHow can we resolve this circular dependency?\nRecall that we faced the same issue \n\nwhen evaluating a policy in an infinite-horizon MDP. There, we iterated the \n\nDefinition 1.8 since we knew that the policy’s value function was a fixed point of the policy’s Bellman operator.\nWe can apply the same strategy here, using the \\hat f from the previous iteration to compute the labels y_{i \\hi},\nand then using this new dataset to fit the next iterate.\n\nFitted Q-function iteration\n\nInitialize some function \\hat f(s, a, h) \\in \\mathbb{R}.\n\nIterate the following:\n\nGenerate a supervised learning dataset X, y from the trajectories and the current estimate f, where the labels come from the r.h.s. 
of the Bellman optimality operator \n\n(1.54)\n\nSet \\hat f to the function that minimizes the empirical risk:\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2.\n\ndef fitted_q_iteration(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n epochs: int,\n Q_init: Optional[QFunction] = None,\n) -> QFunction:\n \"\"\"\n Run fitted Q-function iteration using the given dataset.\n Returns an estimate of the optimal Q-function.\n \"\"\"\n Q_hat = Q_init or Q_zero(get_num_actions(trajectories))\n X = get_X(trajectories)\n for _ in range(epochs):\n y = get_y(trajectories, Q_hat)\n Q_hat = fit(X, y)\n return Q_hat\n\n","type":"content","url":"/fitted-dp#fitted-value-iteration","position":7},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms","lvl2":"Fitted policy evaluation"},"type":"lvl2","url":"/fitted-dp#fitted-pi-eval","position":8},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms","lvl2":"Fitted policy evaluation"},"content":"We can also use this fixed-point interation to evaluate a policy using the dataset (not necessarily the one used to generate the trajectories):\n\nFitted policy evaluation\n\nInput: Policy \\pi : \\mathcal{S} \\times [H] \\to \\Delta(\\mathcal{A}) to be evaluated.\n\nOutput: An approximation of the value function Q^\\pi of the policy.\n\nInitialize some function \\hat f(s, a, h) \\in \\mathbb{R}.\n\nIterate the following:\n\nGenerate a supervised learning dataset X, y from the trajectories and the current estimate f, where the labels come from the r.h.s. of the \n\nBellman consistency equation for the given policy.\n\nSet \\hat f to the function that minimizes the empirical risk:\\hat f \\gets \\arg\\min_f \\frac{1}{N} \\sum_{i=1}^N (y_i - f(x_i))^2.\n\ndef fitted_evaluation(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n π: Policy,\n epochs: int,\n Q_init: Optional[QFunction] = None,\n) -> QFunction:\n \"\"\"\n Run fitted policy evaluation using the given dataset.\n Returns an estimate of the Q-function of the given policy.\n \"\"\"\n Q_hat = Q_init or Q_zero(get_num_actions(trajectories))\n X = get_X(trajectories)\n for _ in tqdm(range(epochs)):\n y = get_y(trajectories, Q_hat, π)\n Q_hat = fit(X, y)\n return Q_hat\n\nAttention\n\nSpot the difference between fitted_evaluation and fitted_q_iteration. (See the definition of get_y.)\nHow would you modify this algorithm to evaluate the data collection policy?","type":"content","url":"/fitted-dp#fitted-pi-eval","position":9},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms","lvl2":"Fitted policy iteration"},"type":"lvl2","url":"/fitted-dp#fitted-policy-iteration","position":10},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms","lvl2":"Fitted policy iteration"},"content":"We can use this policy evaluation algorithm to adapt the [](#policy iteration algorithm ) to this new setting. The algorithm remains exactly the same -- repeatedly make the policy greedy w.r.t. its own value function -- except now we must evaluate the policy (i.e. 
compute its value function) using the iterative fitted_evaluation algorithm.\n\ndef fitted_policy_iteration(\n trajectories: list[Trajectory],\n fit: FittingMethod,\n epochs: int,\n evaluation_epochs: int,\n π_init: Optional[Policy] = lambda s, h: 0, # constant zero policy\n):\n \"\"\"Run fitted policy iteration using the given dataset.\"\"\"\n π = π_init\n for _ in range(epochs):\n Q_hat = fitted_evaluation(trajectories, fit, π, evaluation_epochs)\n π = q_to_greedy(Q_hat)\n return π\n\n","type":"content","url":"/fitted-dp#fitted-policy-iteration","position":11},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms","lvl2":"Summary"},"type":"lvl2","url":"/fitted-dp#summary","position":12},{"hierarchy":{"lvl1":"5 Fitted Dynamic Programming Algorithms","lvl2":"Summary"},"content":"","type":"content","url":"/fitted-dp#summary","position":13},{"hierarchy":{"lvl1":"7 Imitation Learning"},"type":"lvl1","url":"/imitation-learning","position":0},{"hierarchy":{"lvl1":"7 Imitation Learning"},"content":"","type":"content","url":"/imitation-learning","position":1},{"hierarchy":{"lvl1":"7 Imitation Learning","lvl2":"Introduction"},"type":"lvl2","url":"/imitation-learning#introduction","position":2},{"hierarchy":{"lvl1":"7 Imitation Learning","lvl2":"Introduction"},"content":"Imagine you are tasked with learning how to drive. How do, or did, you go about it?\nAt first, this task might seem insurmountable: there are a vast array of controls, and the cost of making a single mistake could be extremely high, making it hard to explore by trial and error.\nLuckily, there are already people in the world who know how to drive who can get you started.\nIn almost every challenge we face,\nwe “stand on the shoulders of giants” and learn skills from experts who have already mastered them.\n\nNow in machine learning,\nwe are often trying to teach machines to accomplish tasks that humans are already proficient at.\nIn such cases, the machine learning algorithm is the one learning the new skill, and humans are the “experts” that can demonstrate how to perform the task.\nImitation learning is a strategy for getting the learner to perform at least as well as the expert.\nWe’ll see that the most naive form of imitation learning, called behavioral cloning, is really an application of supervised learning to interactive tasks.\nWe’ll then explore dataset aggregation (DAgger) as a way to query an expert and learn even more effectively.","type":"content","url":"/imitation-learning#introduction","position":3},{"hierarchy":{"lvl1":"7 Imitation Learning","lvl2":"Behavioral cloning"},"type":"lvl2","url":"/imitation-learning#behavioral-cloning","position":4},{"hierarchy":{"lvl1":"7 Imitation Learning","lvl2":"Behavioral cloning"},"content":"This notion of “learning from human-provided data” may remind you of the basic premise of \n\n4 Supervised learning.\nIn supervised learning,\nthere is some mapping from inputs to outputs,\nsuch as the task of assigning the correct label to an image,\nthat humans can implicitly compute.\nTo teach a machine to calculate this mapping,\nwe first collect a large training dataset by getting people to label a lot of inputs,\nand then use some optimization algorithm to produce a predictor that maps from the inputs to the outputs as closely as possible.\n\nHow does this relate to interactive tasks?\nHere, the input is the observation seen by the agent and the output is the action it selects,\nso the mapping is the agent’s policy.\nWhat’s stopping us from applying supervised learning techniques 
to mimic the expert’s policy?\nIn principle, nothing!\nThis is called behavioral cloning.\n\nBehavioral cloning\n\nCollect a training dataset of trajectories \\mathcal{D} = (s^n, a^n)_{n=1}^{N} generated by an expert policy \\pi_\\text{expert}. (For example, if the dataset contains M trajectories, each with a finite horizon H, then N = M \\times H.)\n\nUse a SL algorithm \\texttt{fit} : \\mathcal{D} \\mapsto \\widetilde{\\pi} to extract a policy \\widetilde{\\pi} that approximates the expert policy.\n\nTypically, this second task can be framed as empirical loss minimization:\\widetilde{\\pi} = \\arg\\min_{\\pi \\in \\Pi} \\sum_{n=0}^{N-1} \\text{loss}(\\pi(s^n), a^n)\n\nwhere Π is some class of possible policies, \\text{loss} is the loss function to measure how different the policy’s prediction is from the true observed action,\nand the SL algorithm itself, also known as the fitting method, tells us how to compute this \\arg\\min.\n\nHow should we choose the loss function?\nIn supervised learning, we saw that the mean squared error is a good choice for continuous outputs.\nHowever, how should we measure the difference between two actions in a discrete action space?\nIn this setting, the policy acts more like a classifier that picks the best action in a given state.\nRather than considering a deterministic policy that just outputs a single action,\nwe’ll consider a stochastic policy π that outputs a distribution over actions.\nThis allows us to assign a likelihood to observing the entire dataset \\mathcal{D} under the policy π,\nassuming the state-action pairs are independent:\\pr_\\pi (\\mathcal{D}) = \\prod_{n=1}^{N} \\pi(a_n \\mid s_n)\n\nNote that the states and actions are not, however, actually independent! A key property of interactive tasks is that the agent’s output -- the action that it takes -- may influence its next observation.\nWe want to find a policy under which the training dataset \\mathcal{D} is the most likely.\nThis is called the maximum likelihood estimate of the policy that generated the dataset:\\widetilde{\\pi} = \\arg\\max_{\\pi \\in \\Pi} \\pr_{\\pi}(\\mathcal{D})\n\nThis is also equivalent to picking the negative log likelihood as the loss function:\\begin{align*}\n\\widetilde{\\pi} &= \\arg\\min_{\\pi \\in \\Pi} - \\log \\pr_\\pi(\\mathcal{D}) \\\\\n&= \\arg\\min_{\\pi \\in \\Pi} \\sum_{n=1}^N - \\log \\pi(a_n \\mid s_n)\n\\end{align*}","type":"content","url":"/imitation-learning#behavioral-cloning","position":5},{"hierarchy":{"lvl1":"7 Imitation Learning","lvl3":"Performance of behavioral cloning","lvl2":"Behavioral cloning"},"type":"lvl3","url":"/imitation-learning#performance-of-behavioral-cloning","position":6},{"hierarchy":{"lvl1":"7 Imitation Learning","lvl3":"Performance of behavioral cloning","lvl2":"Behavioral cloning"},"content":"Can we quantify how well this algorithm works?\nFor simplicity, let’s consider the case where the action space is finite and both the expert policy and learned policy are deterministic.\nSuppose the learned policy obtains \\varepsilon classification error.\nThat is, for trajectories drawn from the expert policy,\nthe learned policy chooses a different action at most \\varepsilon of the time:\\mathbb{E}_{\\tau \\sim \\rho_{\\pi_{\\text{expert}}}} \\left[ \\frac 1 \\hor \\sum_{\\hi=0}^{\\hor-1} \\ind{ \\widetilde{\\pi}(s_\\hi) \\ne \\pi_{\\text{expert}} (s_\\hi) } \\right] \\le \\varepsilon\n\nThen, their value functions differ by| V^{\\pi_{\\text{expert}}} - V^{\\widetilde{\\pi}} | \\le H^2 \\varepsilon\n\nwhere H is the 
horizon.\n\nPerformance of behavioral cloning\n\nRecall the \n\nTheorem 1 allows us to express the difference between \\pi_{\\text{expert}} and \\widetilde{\\pi} asV_0^{\\pi_{\\text{expert}}}(s) - V_0^{\\widetilde{\\pi}} (s) = \\E_{\\tau \\sim \\rho^{\\pi_{\\text{expert}}} \\mid s_0 = s} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\widetilde{\\pi}} (s_\\hi, a_\\hi) \\right].\n\nNow since the expert policy is deterministic, we can substitute a_\\hi = \\pi_{\\text{expert}}(s_\\hi).\nThis allows us to make a further simplification:\nsince \\pi_{\\text{expert}} is deterministic,\nthe advantage of the chosen action is exactly zero:A^{\\pi_{\\text{expert}}}(s, \\pi_{\\text{expert}}(s)) = Q^{\\pi_{\\text{expert}}}(s, \\pi_{\\text{expert}}(s)) - V^{\\pi_{\\text{expert}}}(s) = 0.\n\nBut the right-hand-side of \n\n(7.7) uses A^{\\widetilde{\\pi}}, not A^{\\pi_{\\text{expert}}}.\nTo bridge this gap,\nwe now use the assumption that \\widetilde{\\pi} obtains \\varepsilon classification error.\nNote that A_\\hi^{\\widetilde{\\pi}}(s_\\hi, \\pi_{\\text{expert}}(s_\\hi)) = 0 when \\pi_{\\text{expert}}(s_\\hi) = \\widetilde{\\pi}(s_\\hi).\nIn the case where the two policies differ on s_\\hi, which occurs with probability \\varepsilon, the advantage is naively upper bounded by H (assuming rewards are bounded between 0 and 1).\nTaking the final sum gives the desired bound. TODO ADD DISTRIBUTION SHIFT EXAMPLE FROM SLIDES ","type":"content","url":"/imitation-learning#performance-of-behavioral-cloning","position":7},{"hierarchy":{"lvl1":"7 Imitation Learning","lvl2":"Distribution shift"},"type":"lvl2","url":"/imitation-learning#distribution-shift","position":8},{"hierarchy":{"lvl1":"7 Imitation Learning","lvl2":"Distribution shift"},"content":"Let us return to the driving analogy. Suppose you have taken some driving lessons and now feel comfortable in your neighbourhood. But today you have to travel to an area you haven’t visited before, such as a highway, where it would be dangerous to try and apply the techniques you’ve already learned.\nThis is the issue of distribution shift: a policy learned under a certain distribution of states may not perform well if this distribution changes.\n\nThis is already a common issue in supervised learning, where the training dataset for a model might not resemble the environment where it gets deployed.\nIn interactive environments, this issue is further exacerbated by the dependency between the observations and the agent’s behavior; if you take a wrong turn early on, it may be difficult or impossible to recover in that trajectory.\n\nHow could you learn a strategy for these new settings?\nIn the driving example, you might decide to install a dashcam to record the car’s surroundings. 
That way, once you make it back to safety, you can show the recording to an expert, who can provide feedback at each step of the way.\nThen the next time you go for a drive, you can remember the expert’s advice, and take a safer route.\nYou could then repeat this training as many times as desired, thereby collecting the expert’s feedback over a diverse range of locations.\nThis is the key idea behind dataset aggregation.","type":"content","url":"/imitation-learning#distribution-shift","position":9},{"hierarchy":{"lvl1":"7 Imitation Learning","lvl2":"Dataset aggregation (DAgger)"},"type":"lvl2","url":"/imitation-learning#dataset-aggregation-dagger","position":10},{"hierarchy":{"lvl1":"7 Imitation Learning","lvl2":"Dataset aggregation (DAgger)"},"content":"The DAgger algorithm is due to \n\nRoss et al. (2010).\nIt assumes that we have query access to the expert policy.\nThat is, for a given state s,\nwe can ask for the expert’s action \\pi_{\\text{expert}}(s) in that state.\nWe also need access to the environment for rolling out policies.\nThis makes DAgger an online algorithm,\nas opposed to pure behavioral cloning,\nwhich is offline since we don’t need to act in the environment at all.\n\nYou can think of DAgger as a specific way of collecting the dataset \\mathcal{D}.\n\nDAgger\n\nInputs: \\pi_{\\text{expert}}, an initial policy \\pi_{\\text{init}}, the number of iterations T, and the number of trajectories N to collect per iteration.\n\nInitialize \\mathcal{D} = \\{\\} (the empty set) and \\pi = \\pi_{\\text{init}}.\n\nFor t = 1, \\dots, T:\n\nCollect N trajectories \\tau_1, \\dots, \\tau_N using the current policy π.\n\nFor each trajectory \\tau_n:\n\nReplace each action a_h in \\tau_n with the expert action \\pi_{\\text{expert}}(s_h).\n\nCall the resulting trajectory \\tau^{\\text{expert}}_n.\n\n\\mathcal{D} \\gets \\mathcal{D} \\cup \\{ \\tau^{\\text{expert}}_1, \\dots, \\tau^{\\text{expert}}_n \\}.\n\nLet \\pi \\gets \\texttt{fit}(\\mathcal{D}), where \\texttt{fit} is a behavioral cloning algorithm.\n\nReturn π.\n\nHow well does DAgger perform?\nWe omit a proof here, but under certain assumptions,\nthe DAgger algorithm can better approximate the expert policy:|V^{\\pi_{\\text{expert}}} - V^{\\pi_{\\text{DAgger}}}| \\le H \\varepsilon\n\nwhere \\varepsilon is the “classification error” guaranteed by the supervised learning algorithm. 
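The loop below is a schematic sketch of the DAgger procedure just described. It assumes placeholder helpers that are not defined in the text: a Gymnasium-style `env` with `reset`/`step`, an `expert` function mapping states to actions (the query access mentioned above), and a behavioral-cloning routine `fit_bc` that maps a dataset of (state, expert action) pairs to a policy.

```python
def dagger(env, expert, fit_bc, pi_init, T, N, H):
    """Roll out the current policy, relabel visited states with expert actions, and refit."""
    D = []                                   # aggregated dataset of (state, expert action) pairs
    pi = pi_init
    for _ in range(T):
        for _ in range(N):
            s, _ = env.reset()
            for h in range(H):
                D.append((s, expert(s)))     # query the expert at the *visited* state
                a = pi(s, h)                 # but follow the learner's own action
                s, r, terminated, truncated, _ = env.step(a)
                if terminated or truncated:
                    break
        pi = fit_bc(D)                       # behavioral cloning on the aggregated dataset
    return pi
```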
TODO ","type":"content","url":"/imitation-learning#dataset-aggregation-dagger","position":11},{"hierarchy":{"lvl1":"7 Imitation Learning","lvl2":"Summary"},"type":"lvl2","url":"/imitation-learning#summary","position":12},{"hierarchy":{"lvl1":"7 Imitation Learning","lvl2":"Summary"},"content":"For tasks where it is too difficult or expensive to learn from scratch,\nwe can instead start off with a collection of expert demonstrations.\nThen we can use supervised learning techniques to find a policy that imitates the expert demonstrations.\n\nThe simplest way to do this is to apply a supervised learning algorithm to an already-collected dataset of expert state-action pairs.\nThis is called behavioral cloning.\nHowever, given query access to the expert policy,\nwe can do better by integrating its feedback in an online loop.\nThe DAgger algorithm is one way of doing this,\nwhere we use the expert policy to augment trajectories and then learn from this augmented dataset using behavioral cloning.","type":"content","url":"/imitation-learning#summary","position":13},{"hierarchy":{"lvl1":"Introduction"},"type":"lvl1","url":"/","position":0},{"hierarchy":{"lvl1":"Introduction"},"content":"Welcome to the study of reinforcement learning!\nThis textbook accompanies the undergraduate course \n\nCS 1840/STAT 184 taught at Harvard.\nIt is intended to be a friendly yet rigorous introduction to this active subfield of machine learning.\n\n","type":"content","url":"/","position":1},{"hierarchy":{"lvl1":"Introduction","lvl2":"Prerequisites"},"type":"lvl2","url":"/#prerequisites","position":2},{"hierarchy":{"lvl1":"Introduction","lvl2":"Prerequisites"},"content":"This book assumes the same prerequisites as the course: You should be familiar with multivariable calculus, linear algebra, and probability.\nFor Harvard undergraduates, this is fulfilled by Math 21a, Math 21b, and Stat 110, or their equivalents.\nStat 111 is strongly recommended but not required.\nSpecifically, we will assume that you know the following topics. 
The italicized terms have brief re-introductions in the text or in the \n\nAppendix: Background:\n\nLinear Algebra: Vectors and matrices, matrix multiplication, matrix\ninversion, eigenvalues and eigenvectors.\n\nMultivariable Calculus: Partial derivatives, the chain rule, Taylor series, gradients, directional derivatives, Lagrange multipliers.\n\nProbability: Random variables, probability distributions,\nexpectation and variance, the law of iterated expectations (Adam’s rule), covariance, conditional probability, Bayes’s rule, and the law of total probability.\n\nYou should also be comfortable with programming in Python.\nSee \n\nSection 6 for more about this textbook’s philosophy regarding programming.\n\n","type":"content","url":"/#prerequisites","position":3},{"hierarchy":{"lvl1":"Introduction","lvl2":"Reinforcement learning in a nutshell"},"type":"lvl2","url":"/#reinforcement-learning-in-a-nutshell","position":4},{"hierarchy":{"lvl1":"Introduction","lvl2":"Reinforcement learning in a nutshell"},"content":"Broadly speaking,\nRL studies sequential decision-making in dynamic environments.\nAn RL algorithm finds a strategy, called a policy, that maximizes the reward it obtains from the environment.\n\nRL provides a powerful framework for attacking a wide variety of problems,\nincluding robotic control, video games and board games, resource management, language modelling, and more.\nIt also provides an interdisciplinary paradigm for studying animal and human behavior.\nMany of the most stunning results in machine learning, ranging from AlphaGo to ChatGPT, are built using RL algorithms.\n\nHow does RL compare to the other two core machine learning paradigms,\nsupervised learning and unsupervised learning?\n\nSupervised learning (SL) concerns itself with learning a mapping from inputs to outputs.\nTypically the data takes the form of statistically independent input-output pairs.\nIn RL, however, the data is generated by the agent interacting with the environment,\nmeaning the sequential observations of the state are not independent from each other.\n\nConversely, SL is a well-studied field that provides many useful tools for RL.\n\nUnsupervised learning concerns itself with learning the structure of data without the use of outside feedback or labels.\nIn RL, though, the agent receives a reward signal from the environment,\nwhich can be thought of as a sort of feedback.\n\nUnsupervised learning is crucial in many real-world applications of RL for dimensionality reduction and other purposes.\n\n","type":"content","url":"/#reinforcement-learning-in-a-nutshell","position":5},{"hierarchy":{"lvl1":"Introduction","lvl2":"Core tasks of reinforcement learning"},"type":"lvl2","url":"/#core-tasks-of-reinforcement-learning","position":6},{"hierarchy":{"lvl1":"Introduction","lvl2":"Core tasks of reinforcement learning"},"content":"What tasks, exactly, does RL comprise?\nAn RL algorithm must typically solve two main subtasks:\n\nPolicy evaluation (prediction):\nHow ‘good’ is a specific state, or state-action pair (under a given policy)?\nThat is, how much reward does it lead to in the long run?\n\nPolicy optimization (control):\nSuppose we fully understand how the environment behaves.\nWhat is the best action to take in every scenario? **Recursion (bootstrapping):** How can we \"reuse\" our current predictions to generate new information? **Exploration-exploitation tradeoff:** Should we try new actions, or capitalize on actions that we currently believe to be good? 
\n\n","type":"content","url":"/#core-tasks-of-reinforcement-learning","position":7},{"hierarchy":{"lvl1":"Introduction","lvl2":"Course overview"},"type":"lvl2","url":"/#course-overview","position":8},{"hierarchy":{"lvl1":"Introduction","lvl2":"Course overview"},"content":"The course will progress through the following units:\n\n1 Markov Decision Processes introduces Markov Decision Processes,\nthe core mathematical framework for describing a large class of interactive environments.\n\n2 Linear Quadratic Regulators is a standalone chapter on the linear quadratic regulator (LQR),\nan important tool for continuous control,\nin which the state and action spaces are no longer finite but rather continuous.\nThis has widespread applications in robotics.\n\n3 Multi-Armed Bandits introduces the multi-armed bandit (MAB) model for stateless sequential decision-making tasks.\nIn exploring a number of algorithms,\nwe will see how each of them strikes a different balance between exploring new options and exploiting known options.\nThis exploration-exploitation tradeoff is a core consideration in RL algorithm design.\n\n4 Supervised learning is a standalone crash course on some tools from supervised learning that we will use in later chapters.\n\n5 Fitted Dynamic Programming Algorithms introduces fitted dynamic programming (fitted DP) algorithms for solving MDPs.\nThese algorithms use supervised learning to approximately evaluate policies when they cannot be evaluated exactly.\n\n6 Policy Gradient Methods explores an important class of algorithms based on iteratively improving a policy.\nWe will also encounter the use of deep neural networks to express more complicated policies and approximate complicated functions.\n\n7 Imitation Learning attempts to learn a good policy from expert demonstrations.\nAt its most basic, this is an application of supervised learning to RL tasks.\n\n8 Tree Search Methods looks at ways to explicitly plan ahead when the environment’s dynamics are known.\nWe will study the Monte Carlo Tree Search heuristic,\nwhich has been used to great success in the famous AlphaGo algorithm and its successors.\n\n9 Exploration in MDPs continues to investigate the exploration-exploitation tradeoff.\nWe will extend ideas from multi-armed bandits to the MDP setting.\n\nAppendix: Background contains an overview of selected background mathematical content and programming content. \n| Chapter | States | Actions | Rewards (or costs) |\n|:-------:|:------:|:-------:|:-------:|\n| [](#bandits) | N/A | Finite | Stochastic |\n| [](#mdps) | Finite | Finite | Deterministic |\n| [](#fitted_dp) | Large or continuous | Finite | Deterministic |\n| [](#lqr) | Continuous | Continuous | Deterministic |\n\n\n","type":"content","url":"/#course-overview","position":9},{"hierarchy":{"lvl1":"Introduction","lvl2":"Notation"},"type":"lvl2","url":"/#notation","position":10},{"hierarchy":{"lvl1":"Introduction","lvl2":"Notation"},"content":"We will use the following notation throughout the book.\nThis notation is inspired by \n\nSutton & Barto (2018) and \n\nAgarwal et al. 
(2022).\nWe use [N] as shorthand for the set \\{ 0, 1, \\dots, N-1 \\}.\n\nElement\n\nSpace\n\nDefinition (of element)\n\ns\n\n\\mathcal{S}\n\nA state.\n\na\n\n\\mathcal{A}\n\nAn action.\n\nr\n\n\n\nA reward.\n\nγ\n\n\n\nA discount factor.\n\nτ\n\n\\mathcal{T}\n\nA trajectory.\n\nπ\n\nΠ\n\nA policy.\n\nV^\\pi\n\n\\mathcal{S} \\to \\mathbb{R}\n\nThe value function of policy π.\n\nQ^\\pi\n\n\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}\n\nThe action-value function (a.k.a. Q-function) of policy π.\n\nA^\\pi\n\n\\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}\n\nThe advantage function of policy π.\n\n\n\n\\triangle(\\mathcal{X})\n\nA distribution supported on \\mathcal{X}.\n\n\\hi\n\n[\\hor]\n\nTime horizon index of an MDP (subscript).\n\nk\n\n[K]\n\nArm index of a multi-armed bandit (superscript).\n\nt\n\n[T]\n\nIteration index of an algorithm (subscript).\n\nθ\n\nΘ\n\nA set of parameters.\n\nNote that throughout the text, certain symbols will stand for either random variables or fixed values.\nWe aim to clarify in ambiguous settings.\nBe warned that\n\n","type":"content","url":"/#notation","position":11},{"hierarchy":{"lvl1":"Introduction","lvl2":"Programming"},"type":"lvl2","url":"/#programming","position":12},{"hierarchy":{"lvl1":"Introduction","lvl2":"Programming"},"content":"Why include code in a textbook?\nWe believe that implementing an algorithm is a strong test of your understanding of it;\nmathematical notation can often abstract away details,\nwhile a computer must be given every single instruction.\nWe have sought to write readable Python code that is self-contained within each file.\nThis approach is inspired by \n\nSussman et al. (2013).\nThere are some ways in which the code style differs from typical software projects:\n\nWe keep use of language features to a minimum,\neven if it leads to code that could otherwise be more concisely or idiomatically expressed.\n\nThe variable names used in the code match those used in the main text.\nFor example, the variable s will be used instead of the more explicit state.\n\nWe also make extensive use of Python type annotations to explicitly specify variable types, including shapes of vectors and matrices using the \n\njaxtyping library.\n\nThis is an interactive book built with \n\nJupyter Book.\nIt uses \n\nPython 3.11.\nIt uses the \n\nJAX library for numerical computing.\nJAX was chosen for the clarity of its functional style and due to its mature RL ecosystem,\nsustained in large part by the Google DeepMind research group and a large body of open-source contributors.\nWe use the standard \n\nGymnasium library for interfacing with RL environments.\n\nThe following names are exported from the utils module:import matplotlib.pyplot as plt\n\n# convenient class builder\nfrom typing import NamedTuple\n\n# function typings\nfrom collections.abc import Callable\n\n# array typings\nfrom jaxtyping import Float, Array\n\n# convenient function composition\nfrom functools import partial\n\n# numerical computing and linear algebra\nimport jax\nimport jax.numpy as jnp\n\n# print functions as latex\nimport latexify\n\nplt.style.use(\"fivethirtyeight\")","type":"content","url":"/#programming","position":13},{"hierarchy":{"lvl1":"1 Markov Decision Processes"},"type":"lvl1","url":"/mdps","position":0},{"hierarchy":{"lvl1":"1 Markov Decision Processes"},"content":"","type":"content","url":"/mdps","position":1},{"hierarchy":{"lvl1":"1 Markov Decision 
Processes","lvl2":"Introduction"},"type":"lvl2","url":"/mdps#introduction","position":2},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl2":"Introduction"},"content":"The field of RL studies how an agent can learn to make sequential decisions in an interactive environment.\nThis is a very general problem!\nHow can we formalize this task in a way that is both sufficiently general yet also tractable enough for fruitful analysis?\n\nLet’s consider some examples of sequential decision problems to identify the key common properties we’d like to capture:\n\nBoard games and video games, where a player takes actions in a virtual environment.\n\nInventory management, where a company must efficiently move resources from producers to consumers.\n\nRobotic control, where a robot can move and interact with the real world to complete some task.\n\nIn these environments and many others, the state transitions,\nthe “rules” of the environment,\nonly depend on the most recent state and action (generally speaking).\nFor example, if you want to take a break while playing a game of chess,\nyou could take a picture of the board,\nand later on reset the board to that state and continue playing;\nthe past history of moves doesn’t matter (generally speaking).\nThis is called the Markov property.\n\nMarkov property\n\nAn interactive environment satisfies the Markov property if the\nprobability of transitioning to a new state only depends on the current\nstate and action:\\pr(s_{\\hi+1} \\mid s_0, a_0, \\dots, s_\\hi, a_\\hi) = P(s_{\\hi+1} \\mid s_\\hi, a_\\hi)\n\nwhere P : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S}) describes the state transitions.\n(We’ll elaborate on this notation later in the chapter.)\n\nEnvironments that satisfy the Markov property are called Markov decision processes (MDPs).\nThis chapter will focus on introducing core vocabulary for MDPs that will be useful throughout the book.\n\nAttention\n\nWhat information might be encoded in the state for each of the above examples?\nWhat might the valid set of actions be?\nDescribe the state transitions heuristically and verify that they satisfy the Markov property.\n\nMDPs are usually classified as finite-horizon, where the interactions end after some finite number of time steps,\nor infinite-horizon, where the interactions can continue indefinitely.\nWe’ll begin with the finite-horizon case and discuss the infinite-horizon case in the second half of the chapter.\n\nWe’ll describe how to evaluate different strategies, called policies, and how to compute (or approximate)\nthe optimal policy for a given MDP.\nWe’ll introduce the Bellman consistency condition, which allows us to analyze the whole sequence of interactions in terms of individual timesteps.\n\nfrom utils import NamedTuple, Float, Array, partial, jax, jnp, latexify\n\n","type":"content","url":"/mdps#introduction","position":3},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl2":"Finite-horizon MDPs"},"type":"lvl2","url":"/mdps#finite-horizon-mdps","position":4},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl2":"Finite-horizon MDPs"},"content":"","type":"content","url":"/mdps#finite-horizon-mdps","position":5},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Definition","lvl2":"Finite-horizon MDPs"},"type":"lvl3","url":"/mdps#definition","position":6},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Definition","lvl2":"Finite-horizon MDPs"},"content":"Finite-horizon Markov decision process\n\nThe components of a finite-horizon 
Markov decision process are:\n\nThe state that the agent interacts with. We use \\mathcal{S} to denote\nthe set of possible states, called the state space.\n\nThe actions that the agent can take. We use \\mathcal{A} to denote the\nset of possible actions, called the action space.\n\nSome initial state distribution \\mu \\in \\triangle(\\mathcal{S}).\n\nThe state transitions (a.k.a. dynamics)\nP : \\mathcal{S} \\times \\mathcal{A} \\to \\triangle(\\mathcal{S}) that describe what state the agent\ntransitions to after taking an action.\n\nThe reward signal. In this course we’ll take it to be a\ndeterministic function on state-action pairs,\nr : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}, but in general many results will\nextend to a stochastic reward signal.\n\nA time horizon \\hor \\in \\mathbb{N} that specifies the number of\ninteractions in an episode.\n\nCombined together, these objects specify a finite-horizon Markov\ndecision process:M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\hor).\n\nWhen there are finitely many states and actions, i.e.\n|\\mathcal{S}|, |\\mathcal{A}| < \\infty, we can express\nthe relevant quantities as vectors and matrices (i.e. tables of\nvalues):\\begin{aligned}\n \\mu &\\in [0, 1]^{|\\mathcal{S}|} &\n P &\\in [0, 1]^{(|\\mathcal{S} \\times \\mathcal{A}|) \\times |\\mathcal{S}|} &\n r &\\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}\n\\end{aligned}\n\nAttention\n\nVerify that the types and shapes provided above make sense!\n\nclass MDP(NamedTuple):\n \"\"\"A description of a Markov decision process with finitely many states and actions.\"\"\"\n S: int # number of states\n A: int # number of actions\n μ: Float[Array, \" S\"]\n P: Float[Array, \"S A S\"] # \"current\" state, \"current\" action, \"next\" state\n r: Float[Array, \"S A\"]\n H: int\n γ: float = 1.0 # discount factor (used later)\n\nTidying MDP\n\nLet’s consider a simple decision problem throughout this chapter:\nthe task of keeping your room tidy!\n\nYour room has the possible states\n\\mathcal{S} = \\{ \\text{orderly}, \\text{messy} \\}.\nYou can take either of the actions \\mathcal{A} = \\{ \\text{ignore}, \\text{tidy} \\}.\nThe room starts off orderly.\n\nThe state transitions are as follows:\nif you tidy the room, it becomes (or remains) orderly;\nif you ignore the room, it might become messy (see table below).\n\nThe rewards are as follows: You get penalized for tidying an orderly room (a waste of time) or ignoring a messy room,\nbut you get rewarded for ignoring an orderly room (since you can enjoy your additional time).\nTidying a messy room is a chore that gives no reward.\n\nThese are summarized in the following table:\\begin{array}{ccccc}\n s & a & P(\\text{orderly} \\mid s, a) & P(\\text{messy} \\mid s, a) & r(s, a) \\\\\n \\text{orderly} & \\text{ignore} & 0.7 & 0.3 & 1 \\\\\n \\text{orderly} & \\text{tidy} & 1 & 0 & -1 \\\\\n \\text{messy} & \\text{ignore} & 0 & 1 & -1 \\\\\n \\text{messy} & \\text{tidy} & 1 & 0 & 0 \\\\\n\\end{array}\n\nConsider a time horizon of \\hor = 7 days (one interaction per day). 
Let\nt = 0 correspond to Monday and t = 6 correspond to Sunday.\n\ntidy_mdp = MDP(\n S=2, # 0 = orderly, 1 = messy\n A=2, # 0 = ignore, 1 = tidy\n μ=jnp.array([1.0, 0.0]), # start in orderly state\n P=jnp.array([\n [\n [0.7, 0.3], # orderly, ignore\n [1.0, 0.0], # orderly, tidy\n ],\n [\n [0.0, 1.0], # messy, ignore\n [1.0, 0.0], # messy, tidy\n ],\n ]),\n r=jnp.array([\n [\n 1.0, # orderly, ignore\n -1.0, # orderly, tidy\n ],\n [\n -1.0, # messy, ignore\n 0.0, # messy, tidy\n ]\n ]),\n H=7,\n)\n\n","type":"content","url":"/mdps#definition","position":7},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Policies","lvl2":"Finite-horizon MDPs"},"type":"lvl3","url":"/mdps#policies","position":8},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Policies","lvl2":"Finite-horizon MDPs"},"content":"Policies\n\nA policy π describes the agent’s strategy:\nwhich actions it takes in a given situation.\nA key goal of RL is to find the optimal policy that maximizes the total reward on average.\n\nThere are three axes along which policies can vary: their outputs,\ninputs, and time-dependence.\n\nDeterministic or stochastic. A deterministic policy outputs\nactions while a stochastic policy outputs distributions over\nactions.\n\n\n\nA deterministic policy.\n\n\n\nA stochastic policy.\n\nState-dependent or history-dependent. A state-dependent (a.k.a.\n“Markovian”) policy only depends on the current state, while a\nhistory-dependent policy depends on the sequence of past states,\nactions, and rewards. We’ll only consider state-dependent policies\nin this course.\n\nStationary or time-dependent. A stationary (a.k.a. time-homogeneous) policy\nremains the same function at all time steps, while a time-dependent policy can depend on the current timestep.\nFor consistency with states and actions, we will denote the timestep as a subscript,\ni.e. 
\\pi = \\{ \\pi_0, \\dots, \\pi_{\\hor-1} \\}.\n\nNote that for finite state and action spaces,\nwe can represent a randomized mapping \\mathcal{S} \\to \\Delta(\\mathcal{A})\nas a matrix \\pi \\in [0, 1]^{\\mathcal{S} \\times \\mathcal{A}} where each row describes\nthe policy’s distribution over actions for the corresponding state.\n\nA fascinating result is that every finite-horizon MDP has an optimal deterministic time-dependent policy!\nIntuitively, the Markov property implies that the current state contains all the information we need to make the optimal decision.\nWe’ll prove this result constructively later in the chapter.\n\nPolicies for the tidying MDP\n\nHere are some possible policies for the tidying MDP \n\nExample 1.1:\n\nAlways tidy: \\pi(s) = \\text{tidy}.\n\nOnly tidy on weekends: \\pi_\\hi(s) = \\text{tidy} if\n\\hi \\in \\{ 5, 6 \\} and \\pi_\\hi(s) = \\text{ignore} otherwise.\n\nOnly tidy if the room is messy: \\pi_\\hi(\\text{messy}) = \\text{tidy}\nand \\pi_\\hi(\\text{orderly}) = \\text{ignore} for all \\hi.\n\n# arrays of shape (H, S, A) represent time-dependent policies\ntidy_policy_always_tidy = (\n jnp.zeros((7, 2, 2))\n .at[:, :, 1].set(1.0)\n)\ntidy_policy_weekends = (\n jnp.zeros((7, 2, 2))\n .at[5:7, :, 1].set(1.0)\n .at[0:5, :, 0].set(1.0)\n)\ntidy_policy_messy_only = (\n jnp.zeros((7, 2, 2))\n .at[:, 1, 1].set(1.0)\n .at[:, 0, 0].set(1.0)\n)\n\nNote\n\nArray objects in Jax are immutable, that is, they cannot be changed.\nThis might seem inconvenient, but in larger projects,\nimmutability makes code much easier to reason about.\n\n","type":"content","url":"/mdps#policies","position":9},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Trajectories","lvl2":"Finite-horizon MDPs"},"type":"lvl3","url":"/mdps#trajectories","position":10},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Trajectories","lvl2":"Finite-horizon MDPs"},"content":"Trajectories\n\nA sequence of states, actions, and rewards is called a trajectory:\\tau = (s_0, a_0, r_0, \\dots, s_{H-1}, a_{H-1}, r_{H-1})\n\nwhere r_\\hi = r(s_\\hi, a_\\hi).\n(Note that some sources omit the reward at the final time step. This is a minor detail.)\n\nclass Transition(NamedTuple):\n \"\"\"A single state-action-reward interaction with the environment.\n\n A trajectory comprises a sequence of transitions.\n \"\"\"\n s: int\n a: int\n r: float\n\nOnce we’ve chosen a policy,\nwe can sample trajectories by repeatedly choosing actions according to the policy,\ntransitioning according to the state transitions, and observing the rewards.\n\nThat is, a policy induces a distribution \\rho^{\\pi} over trajectories.\n(We assume that μ and P are clear from context.)\n\nTrajectories in the tidying environment\n\nHere is a possible trajectory for the tidying example:\n\n\\hi\n\n0\n\n1\n\n2\n\n3\n\n4\n\n5\n\n6\n\ns\n\norderly\n\norderly\n\norderly\n\nmessy\n\nmessy\n\norderly\n\norderly\n\na\n\ntidy\n\nignore\n\nignore\n\nignore\n\ntidy\n\nignore\n\nignore\n\nr\n\n-1\n\n1\n\n1\n\n-1\n\n0\n\n1\n\n1\n\nCould any of the policies in \n\nExample 1.2 have generated this trajectory?\n\nNote that for a state-dependent policy, using the Markov property \n\nDefinition 1.1,\nwe can write down the likelihood function of this probability distribution in an autoregressive way (i.e. 
one timestep at a time):\n\nAutoregressive trajectory distribution\\rho^{\\pi}(\\tau) := \\mu(s_0) \\pi_0(a_0 \\mid s_0) P(s_1 \\mid s_0, a_0) \\cdots P(s_{\\hor-1} \\mid s_{\\hor-2}, a_{\\hor-2}) \\pi_{\\hor-1}(a_{\\hor-1} \\mid s_{\\hor-1})\n\ndef trajectory_log_likelihood(\n mdp: MDP,\n τ: list[Transition],\n π: Float[Array, \"S A\"],\n) -> float:\n \"\"\"Compute the log-likelihood of a trajectory under a given MDP and policy.\"\"\"\n\n # initial distribution and action\n total = jnp.log(mdp.μ[τ[0].s])\n total += jnp.log(π[τ[0].s, τ[0].a])\n\n # remaining state transitions and actions\n for i in range(1, mdp.H):\n total += jnp.log(mdp.P[τ[i - 1].s, τ[i - 1].a, τ[i].s])\n total += jnp.log(π[τ[i].s, τ[i].a])\n\n return total\n\nAttention\n\nHow would you modify this to include stochastic rewards?\n\nFor a deterministic policy π, we have that \\pi_\\hi(a \\mid s) = \\mathbb{I}[a = \\pi_\\hi(s)];\nthat is, the probability of taking an action is 1 if it’s the unique action prescribed by the policy for that state and 0 otherwise.\nIn this case, the only randomness in sampling trajectories comes from the initial state distribution μ and the state transitions P.\n\n","type":"content","url":"/mdps#trajectories","position":11},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Value functions","lvl2":"Finite-horizon MDPs"},"type":"lvl3","url":"/mdps#value-functions","position":12},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Value functions","lvl2":"Finite-horizon MDPs"},"content":"The main goal of RL is to find a policy that maximizes the expected total\nreward \\E [r_0 + \\cdots + r_{\\hor-1}].\n\nAttention\n\nNote that r_0 + \\cdots + r_{\\hor-1} is a random variable.\nWhat sources of randomness does it depend on?\nDescribe the generating process.\n\nLet’s introduce some notation for analyzing this quantity.\n\nA policy’s value function at time \\hi is its expected remaining reward from a given state:\n\nValue functionV_\\hi^\\pi(s) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s]\n\nSimilarly, we can define the action-value function (aka the\nQ-function) at time h as the expected remaining reward from a given state and taking a given action:\n\nAction-value functionQ_\\hi^\\pi(s, a) := \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s, a_\\hi = a]\n\n","type":"content","url":"/mdps#value-functions","position":13},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl4":"Relating the value function and action-value function","lvl3":"Value functions","lvl2":"Finite-horizon MDPs"},"type":"lvl4","url":"/mdps#relating-the-value-function-and-action-value-function","position":14},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl4":"Relating the value function and action-value function","lvl3":"Value functions","lvl2":"Finite-horizon MDPs"},"content":"Note that the value function is just the expected action-value over\nactions drawn from the policy:V_\\hi^\\pi(s) = \\E_{a \\sim \\pi_\\hi(s)} [Q_\\hi^\\pi(s, a)]\n\ndef q_to_v(\n policy: Float[Array, \"S A\"],\n q: Float[Array, \"S A\"],\n) -> Float[Array, \" S\"]:\n \"\"\"\n Compute the value function for a given policy in a known finite MDP\n at a single timestep from its action-value function.\n \"\"\"\n return jnp.average(q, weights=policy, axis=1)\n\nand the action-value is the sum of the immediate reward and the expected value of the following\nstate:Q_\\hi^\\pi(s, a) = r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\pi(s')]\n\ndef v_to_q(\n mdp: MDP,\n v_next: 
Float[Array, \" S\"],\n) -> Float[Array, \"S A\"]:\n \"\"\"\n Compute the action-value function in a known finite MDP\n at a single timestep from the corresponding value function.\n \"\"\"\n # the discount factor is relevant later\n return mdp.r + mdp.γ * mdp.P @ v_next\n\n\n# convert a list of v functions to a list of q functions\nv_ary_to_q_ary = jax.vmap(v_to_q, in_axes=(None, 0))\n\n","type":"content","url":"/mdps#relating-the-value-function-and-action-value-function","position":15},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl4":"Greedy policies","lvl3":"Value functions","lvl2":"Finite-horizon MDPs"},"type":"lvl4","url":"/mdps#greedy-policies","position":16},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl4":"Greedy policies","lvl3":"Value functions","lvl2":"Finite-horizon MDPs"},"content":"For any given Q \\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}, we can define the greedy policy \\hat \\pi_Q as the deterministic policy that selects the action with the highest Q-value at each state:\\hat \\pi_Q(s) = \\arg\\max_{a} Q_{sa}\n\ndef q_to_greedy(q: Float[Array, \"S A\"]) -> Float[Array, \"S A\"]:\n \"\"\"\n Get the (deterministic) greedy policy with respect to an action-value function.\n Return the policy as a matrix of shape (S, A) where each row is a one-hot vector.\n \"\"\"\n A = q.shape[1]\n a_ary = jnp.argmax(q, axis=1)\n return jnp.eye(A)[a_ary]\n\n\ndef v_to_greedy(mdp: MDP, v: Float[Array, \" S\"]) -> Float[Array, \"S A\"]:\n \"\"\"Get the (deterministic) greedy policy with respect to a value function.\"\"\"\n return q_to_greedy(v_to_q(mdp, v))\n\n","type":"content","url":"/mdps#greedy-policies","position":17},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"The one-step (Bellman) consistency equation","lvl2":"Finite-horizon MDPs"},"type":"lvl3","url":"/mdps#the-one-step-bellman-consistency-equation","position":18},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"The one-step (Bellman) consistency equation","lvl2":"Finite-horizon MDPs"},"content":"Note that by simply considering the cumulative reward as the sum of the\ncurrent reward and the future cumulative reward, we can describe the\nvalue function recursively (in terms of itself). 
This is named the\nBellman consistency equation after Richard Bellman (1920--1984),\nwho is credited with introducing dynamic programming in 1953.\n\nBellman consistency equation for the value functionV_\\hi^\\pi(s) = \\E_{\\substack{a \\sim \\pi_\\hi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + V_{\\hi+1}^\\pi(s')]\n\ndef check_bellman_consistency_v(\n mdp: MDP,\n policy: Float[Array, \"H S A\"],\n v_ary: Float[Array, \"H S\"],\n) -> bool:\n \"\"\"\n Check that the given (time-dependent) \"value function\"\n satisfies the Bellman consistency equation.\n \"\"\"\n return all(\n jnp.allclose(\n # lhs\n v_ary[h],\n # rhs\n jnp.sum(policy[h] * (mdp.r + mdp.γ * mdp.P @ v_ary[h + 1]), axis=1),\n )\n for h in range(mdp.H - 1)\n )\n\nAttention\n\nVerify that this equation holds by expanding V_\\hi^\\pi(s)\nand V_{\\hi+1}^\\pi(s').\n\nOne can analogously derive the Bellman consistency equation for the\naction-value function:\n\nBellman consistency equation for action-valuesQ_\\hi^\\pi(s, a) = r(s, a) + \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi_{\\hi+1}(s')}} [Q_{\\hi+1}^\\pi(s', a')]\n\nAttention\n\nWrite a check_bellman_consistency_q function for the action-value function.\n\nThe Bellman consistency equation for deterministic policies\n\nNote that for deterministic policies, the Bellman consistency equation\nsimplifies to\\begin{aligned}\n V_\\hi^\\pi(s) &= r(s, \\pi_\\hi(s)) + \\E_{s' \\sim P(s, \\pi_\\hi(s))} [V_{\\hi+1}^\\pi(s')] \\\\\n Q_\\hi^\\pi(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [Q_{\\hi+1}^\\pi(s', \\pi_{\\hi+1}(s'))]\n\\end{aligned}\n\n","type":"content","url":"/mdps#the-one-step-bellman-consistency-equation","position":19},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"The one-step Bellman operator","lvl2":"Finite-horizon MDPs"},"type":"lvl3","url":"/mdps#the-one-step-bellman-operator","position":20},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"The one-step Bellman operator","lvl2":"Finite-horizon MDPs"},"content":"Fix a policy π. Consider the higher-order operator that takes in a\n“value function” v : \\mathcal{S} \\to \\mathbb{R} and returns the r.h.s. 
of the Bellman\nequation for that “value function”:\n\nBellman operator[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + v(s')].\n\nThis is a crucial tool for reasoning about MDPs.\nIntuitively, it answers the following question:\nif we evaluate the next state using v,\nhow good is the current state, according to the given policy?\n\ndef bellman_operator_looping(\n mdp: MDP,\n policy: Float[Array, \"S A\"],\n v: Float[Array, \" S\"],\n) -> Float[Array, \" S\"]:\n \"\"\"\n Looping definition of the Bellman operator.\n Concise version is below\n \"\"\"\n v_new = jnp.zeros(mdp.S)\n for s in range(mdp.S):\n for a in range(mdp.A):\n for s_next in range(mdp.S):\n # JAX arrays are immutable, so accumulate with .at[].add() instead of in-place assignment\n v_new = v_new.at[s].add(\n policy[s, a]\n * mdp.P[s, a, s_next]\n * (mdp.r[s, a] + mdp.γ * v[s_next])\n )\n return v_new\n\nNote that we can concisely implement this using the q_to_v and v_to_q utilities from above:\n\ndef bellman_operator(\n mdp: MDP,\n policy: Float[Array, \"S A\"],\n v: Float[Array, \" S\"],\n) -> Float[Array, \" S\"]:\n \"\"\"For a known finite MDP, the Bellman operator can be exactly evaluated.\"\"\"\n return q_to_v(policy, v_to_q(mdp, v)) # equivalently: jnp.sum(policy * (mdp.r + mdp.γ * mdp.P @ v), axis=1)\n\nWe’ll call \\mathcal{J}^\\pi : \\mathbb{R}^\\mathcal{S} \\to \\mathbb{R}^\\mathcal{S} the Bellman\noperator of π.\nNote that it’s defined on any “value function” mapping states to real numbers;\nv doesn’t have to be a well-defined value function for some policy (hence the lowercase notation).\nThe Bellman operator also gives us a concise way to express \n\nTheorem 1.1 for the value function:V_\\hi^\\pi = \\mathcal{J}^{\\pi}(V_{\\hi+1}^\\pi)\n\nIntuitively, the output of the Bellman operator, a new “value function”,\nevaluates states as follows: from a given state, take one action\naccording to π, observe the reward, and then evaluate the next state\nusing the input “value function”.\n\nWhen we discuss infinite-horizon MDPs, the Bellman operator will turn\nout to be more than just a notational convenience: We’ll use it to\nconstruct algorithms for computing the optimal policy.","type":"content","url":"/mdps#the-one-step-bellman-operator","position":21},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl2":"Solving finite-horizon MDPs"},"type":"lvl2","url":"/mdps#finite-horizon-mdps-1","position":22},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl2":"Solving finite-horizon MDPs"},"content":"","type":"content","url":"/mdps#finite-horizon-mdps-1","position":23},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Policy evaluation in finite-horizon MDPs","lvl2":"Solving finite-horizon MDPs"},"type":"lvl3","url":"/mdps#eval-dp","position":24},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Policy evaluation in finite-horizon MDPs","lvl2":"Solving finite-horizon MDPs"},"content":"How can we actually compute the value function of a given policy? This\nis the task of policy evaluation.\n\nDP algorithm to evaluate a policy in a finite-horizon MDP\n\nThe Bellman consistency equation\n\n\nTheorem 1.1\ngives us a convenient algorithm for\nevaluating stationary policies: it expresses the value function at\ntimestep \\hi as a function of the value function at timestep \\hi+1. 
This\nmeans we can start at the end of the time horizon, where the value is\nknown, and work backwards in time, using the Bellman consistency\nequation to compute the value function at each time step.\n\ndef dp_eval_finite(mdp: MDP, policy: Float[Array, \"S A\"]) -> Float[Array, \"H S\"]:\n \"\"\"Evaluate a policy using dynamic programming.\"\"\"\n V_ary = [None] * mdp.H + [jnp.zeros(mdp.S)] # initialize to 0 at end of time horizon\n for h in range(mdp.H - 1, -1, -1):\n V_ary[h] = bellman_operator(mdp, policy[h], V_ary[h + 1])\n return jnp.stack(V_ary[:-1])\n\nThis runs in time O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|) by counting the\nloops.\n\nAttention\n\nDo you see where we compute Q^\\pi_\\hi along the way? Make\nthis step explicit.\n\nTidying policy evaluation\n\nLet’s evaluate the policy from\n\n\nExample 1.2 in the tidying MDP\nthat tidies if and only if the room is\nmessy. We’ll use the Bellman consistency equation to compute the value\nfunction at each time step.\\begin{aligned}\nV_{H-1}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) \\\\\n&= 1 \\\\\nV_{H-1}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) \\\\\n&= 0 \\\\\nV_{H-2}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-1}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1 + 0.3 \\cdot 0 \\\\\n&= 1.7 \\\\\nV_{H-2}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-1}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-1}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-1}^{\\pi}(\\text{messy}) \\\\\n&= 1 \\\\\nV_{H-3}^\\pi(\\text{orderly}) &= r(\\text{orderly}, \\text{ignore}) + \\E_{s' \\sim P(\\text{orderly}, \\text{ignore})} [V_{H-2}^\\pi(s')] \\\\\n&= 1 + 0.7 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0.3 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 1 + 0.7 \\cdot 1.7 + 0.3 \\cdot 1 \\\\\n&= 2.49 \\\\\nV_{H-3}^\\pi(\\text{messy}) &= r(\\text{messy}, \\text{tidy}) + \\E_{s' \\sim P(\\text{messy}, \\text{tidy})} [V_{H-2}^\\pi(s')] \\\\\n&= 0 + 1 \\cdot V_{H-2}^{\\pi}(\\text{orderly}) + 0 \\cdot V_{H-2}^{\\pi}(\\text{messy}) \\\\\n&= 1.7\n\\end{aligned}\n\netc. You may wish to repeat this computation for the\nother policies to get a better sense of this algorithm.\n\nV_messy = dp_eval_finite(tidy_mdp, tidy_policy_messy_only)\nV_messy\n\n","type":"content","url":"/mdps#eval-dp","position":25},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Optimal policies in finite-horizon MDPs","lvl2":"Solving finite-horizon MDPs"},"type":"lvl3","url":"/mdps#opt-dynamic-programming","position":26},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Optimal policies in finite-horizon MDPs","lvl2":"Solving finite-horizon MDPs"},"content":"We’ve just seen how to evaluate a given policy. 
But how can we find\nthe optimal policy for a given environment?\n\nOptimal policies\n\nWe call a policy optimal, and denote it by \\pi^\\star, if it does at\nleast as well as any other policy π (including stochastic and\nhistory-dependent ones) in all situations:\\begin{aligned}\n V_\\hi^{\\pi^\\star}(s) &= \\E_{\\tau \\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\cdots + r_{H-1} \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\cdots + r_{H-1} \\mid \\tau_\\hi] \\quad \\forall \\pi, \\tau_\\hi, \\hi \\in [H]\n\\end{aligned}\n\nwhere we condition on the\ntrajectory up to time \\hi, denoted\n\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi), where s_\\hi = s.\n\nConvince yourself that all optimal policies must have the same value\nfunction. We call this the optimal value function and denote it by\nV_\\hi^\\star(s). The same goes for the action-value function\nQ_\\hi^\\star(s, a).\n\nIt is a stunning fact that every finite-horizon MDP has an optimal\npolicy that is time-dependent and deterministic. In particular, we can\nconstruct such a policy by acting greedily with respect to the optimal\naction-value function:\n\nIt is optimal to be greedy with respect to the optimal value function\\pi_\\hi^\\star(s) = \\arg\\max_a Q_\\hi^\\star(s, a).\n\nProof\n\nLet V^{\\star} and Q^{\\star} denote the optimal value and\naction-value functions. Consider the greedy policy\\hat \\pi_\\hi(s) := \\arg\\max_a Q_\\hi^{\\star}(s, a).\n\nWe aim to show that\n\\hat \\pi is optimal; that is, V^{\\hat \\pi} = V^{\\star}.\n\nFix an arbitrary state s \\in \\mathcal{S} and time \\hi \\in [H].\n\nFirstly, by the definition of V^{\\star}, we already know\nV_\\hi^{\\star}(s) \\ge V_\\hi^{\\hat \\pi}(s). So for equality to hold we just\nneed to show that V_\\hi^{\\star}(s) \\le V_\\hi^{\\hat \\pi}(s). We’ll first\nshow that the Bellman operator \\mathcal{J}^{\\hat \\pi} never decreases\nV_\\hi^{\\star}. Then we’ll apply this result recursively to show that\nV^{\\star} = V^{\\hat \\pi}.\n\nThe Bellman operator never decreases the optimal value function\n\n\\mathcal{J}^{\\hat \\pi} never decreases V_\\hi^{\\star}\n(elementwise):[\\mathcal{J}^{\\hat \\pi} (V_{\\hi+1}^{\\star})](s) \\ge V_\\hi^{\\star}(s).\n\nProof:\\begin{aligned}\n V_\\hi^{\\star}(s) &= \\max_{\\pi \\in \\Pi} V_\\hi^{\\pi}(s) \\\\\n &= \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^\\pi(s') \\right] && \\text{Bellman consistency} \\\\\n &\\le \\max_{\\pi \\in \\Pi} \\mathop{\\mathbb{E}}_{a \\sim \\pi(\\dots)}\\left[r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^{\\star}(s') \\right] && \\text{definition of } V^\\star \\\\\n &= \\max_{a} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} V_{\\hi+1}^{\\star}(s') \\right] && \\text{only depends on } \\pi \\text{ via } a \\\\\n &= [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s). \n\\end{aligned}\n\nNote that the chosen action a \\sim \\pi(\\dots) above\nmight depend on the past history; this isn’t shown in the notation and\ndoesn’t affect our result (make sure you see why).\n\nWe can now apply this result recursively to getV^{\\star}_t(s) \\le V^{\\hat \\pi}_t(s)\n\nas follows. 
(Note that even\nthough \\hat \\pi is deterministic, we’ll use the a \\sim \\hat \\pi(s)\nnotation to make it explicit that we’re sampling a trajectory from it.)\\begin{aligned}\n V_{t}^{\\star}(s) &\\le [\\mathcal{J}^{\\hat \\pi}(V_{\\hi+1}^{\\star})](s) \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue} V_{\\hi+1}^{\\star}(s')} \\right] \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} \\left[ {\\color{blue}[ \\mathcal{J}^{\\hat \\pi} (V_{t+2}^{\\star})] (s')} \\right] \\right] && \\text{above lemma} \\\\\n &= \\mathop{\\mathbb{E}}_{a \\sim \\hat \\pi(s)} \\left[ r(s, a) + \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}{\\color{blue} \\left[ \\mathop{\\mathbb{E}}_{a' \\sim \\hat \\pi} r(s', a') + \\mathop{\\mathbb{E}}_{s''} V_{t+2}^{\\star}(s'') \\right]} \\right] && \\text{definition of } \\mathcal{J}^{\\hat \\pi} \\\\\n &\\le \\cdots && \\text{apply at all timesteps} \\\\\n &= \\mathop{\\mathbb{E}}_{\\tau \\sim \\rho^{\\hat \\pi}} [G_{t} \\mid s_\\hi = s] && \\text{rewrite expectation} \\\\\n &= V_{t}^{\\hat \\pi}(s) && \\text{definition}\n\\end{aligned}\n\nAnd so we have V^{\\star} = V^{\\hat \\pi}, making \\hat \\pi optimal.\n\nNote that this also gives simplified forms of the \n\nBellman consistency equations for the optimal policy:\n\nBellman consistency equations for the optimal policy\\begin{aligned}\n V_\\hi^\\star(s) &= \\max_a Q_\\hi^\\star(s, a) \\\\\n Q_\\hi^\\star(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V_{\\hi+1}^\\star(s')]\n\\end{aligned}\n\nNow that we’ve shown this particular greedy policy is optimal, all we\nneed to do is compute the optimal value function and optimal policy. We\ncan do this by working backwards in time using dynamic programming\n(DP).\n\nDP algorithm to compute an optimal policy in a finite-horizon MDP\n\nBase case. At the end of the episode (time step H-1), we can’t\ntake any more actions, so the Q-function is simply the reward that\nwe obtain:Q^\\star_{H-1}(s, a) = r(s, a)\n\nso the best thing to do\nis just act greedily and get as much reward as we can!\\pi^\\star_{H-1}(s) = \\arg\\max_a Q^\\star_{H-1}(s, a)\n\nThen\nV^\\star_{H-1}(s), the optimal value of state s at the end of the\ntrajectory, is simply whatever action gives the most reward.V^\\star_{H-1} = \\max_a Q^\\star_{H-1}(s, a)\n\nRecursion. Then, we can work backwards in time, starting from the\nend, using our consistency equations! i.e. for each\nt = H-2, \\dots, 0, we set\\begin{aligned}\n Q^\\star_{t}(s, a) &= r(s, a) + \\E_{s' \\sim P(s, a)} [V^\\star_{\\hi+1}(s')] \\\\\n \\pi^\\star_{t}(s) &= \\arg\\max_a Q^\\star_{t}(s, a) \\\\\n V^\\star_{t}(s) &= \\max_a Q^\\star_{t}(s, a)\n\\end{aligned}\n\ndef find_optimal_policy(mdp: MDP):\n Q = [None] * mdp.H\n pi = [None] * mdp.H\n V = [None] * mdp.H + [jnp.zeros(mdp.S)] # initialize to 0 at end of time horizon\n\n for h in range(mdp.H - 1, -1, -1):\n Q[h] = mdp.r + mdp.P @ V[h + 1]\n pi[h] = jnp.eye(mdp.S)[jnp.argmax(Q[h], axis=1)] # one-hot\n V[h] = jnp.max(Q[h], axis=1)\n\n Q = jnp.stack(Q)\n pi = jnp.stack(pi)\n V = jnp.stack(V[:-1])\n\n return pi, V, Q\n\nAt each of the H timesteps, we must compute Q^{\\star} for each of\nthe |\\mathcal{S}| |\\mathcal{A}| state-action pairs. Each computation takes |\\mathcal{S}|\noperations to evaluate the average value over s'. 
This gives a total\ncomputation time of O(H \\cdot |\\mathcal{S}|^2 \\cdot |\\mathcal{A}|).\n\nNote that this algorithm is identical to the policy evaluation algorithm\n\n\ndp_eval_finite, but instead of averaging over the\nactions chosen by a policy, we instead simply take a maximum over the\naction-values. We’ll see this relationship between policy evaluation\nand optimal policy computation show up again in the infinite-horizon\nsetting.\n\nπ_opt, V_opt, Q_opt = find_optimal_policy(tidy_mdp)\nassert jnp.allclose(π_opt, tidy_policy_messy_only)\nassert jnp.allclose(V_opt, V_messy)\nassert jnp.allclose(Q_opt[:-1], v_ary_to_q_ary(tidy_mdp, V_messy)[1:])\n\"Assertions passed (the 'tidy when messy' policy is optimal)\"\n\n","type":"content","url":"/mdps#opt-dynamic-programming","position":27},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl2":"Infinite-horizon MDPs"},"type":"lvl2","url":"/mdps#infinite-horizon-mdps","position":28},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl2":"Infinite-horizon MDPs"},"content":"What happens if a trajectory is allowed to continue forever (i.e.\nH = \\infty)? This is the setting of infinite horizon MDPs.\n\nIn this chapter, we’ll describe the necessary adjustments from the\nfinite-horizon case to make the problem tractable. We’ll show that the\n\n\nBellman operator in the discounted reward setting is a\ncontraction mapping for any policy.\nWe’ll discuss how to evaluate\npolicies (i.e. compute their corresponding value functions). Finally,\nwe’ll present and analyze two iterative algorithms, based on the Bellman\noperator, for computing the optimal policy: value iteration and\npolicy iteration.","type":"content","url":"/mdps#infinite-horizon-mdps","position":29},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Discounted rewards","lvl2":"Infinite-horizon MDPs"},"type":"lvl3","url":"/mdps#discounted-rewards","position":30},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Discounted rewards","lvl2":"Infinite-horizon MDPs"},"content":"First of all, note that maximizing the cumulative reward\nr_\\hi + r_{\\hi+1} + r_{\\hi+2} + \\cdots is no longer a good idea since it\nmight blow up to infinity. Instead of a time horizon H, we now need a\ndiscount factor \\gamma \\in [0, 1) such that rewards become less\nvaluable the further into the future they are:r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots = \\sum_{k=0}^\\infty \\gamma^k r_{\\hi+k}.\n\nWe can think of γ as measuring how much we care about the future:\nif it’s close to 0, we only care about the near-term rewards; it’s\nclose to 1, we put more weight into future rewards.\n\nYou can also analyze γ as the probability of continuing the\ntrajectory at each time step. (This is equivalent to H being\ndistributed by a First Success distribution with success probability\nγ.) This accords with the above interpretation: if γ is\nclose to 0, the trajectory will likely be very short, while if\nγ is close to 1, the trajectory will likely continue for a long\ntime.\n\nAttention\n\nAssuming that r_\\hi \\in [0, 1] for all \\hi \\in \\mathbb{N},\nwhat is the maximum discounted cumulative reward? 
You may find it\nuseful to review geometric series.\n\nThe other components of the MDP remain the same:M = (\\mathcal{S}, \\mathcal{A}, \\mu, P, r, \\gamma).\n\nCode-wise, we can reuse the MDP class from before \n\nDefinition 1.2 and set mdp.H = float('inf').\n\ntidy_mdp_inf = tidy_mdp._replace(H=float(\"inf\"), γ=0.95)\n\n","type":"content","url":"/mdps#discounted-rewards","position":31},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Stationary policies","lvl2":"Infinite-horizon MDPs"},"type":"lvl3","url":"/mdps#stationary-policies","position":32},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Stationary policies","lvl2":"Infinite-horizon MDPs"},"content":"The time-dependent policies from the finite-horizon case become\ndifficult to handle in the infinite-horizon case. In particular, many of\nthe DP approaches we saw required us to start at the end of the\ntrajectory, which is no longer possible. We’ll shift to stationary\npolicies \\pi : \\mathcal{S} \\to \\mathcal{A} (deterministic) or \\Delta(\\mathcal{A}) (stochastic).\n\nAttention\n\nWhich of the policies in \n\nExample 1.2 are stationary?","type":"content","url":"/mdps#stationary-policies","position":33},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Value functions and Bellman consistency","lvl2":"Infinite-horizon MDPs"},"type":"lvl3","url":"/mdps#value-functions-and-bellman-consistency","position":34},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Value functions and Bellman consistency","lvl2":"Infinite-horizon MDPs"},"content":"We also consider stationary value functions V^\\pi : \\mathcal{S} \\to \\mathbb{R} and\nQ^\\pi : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}. We need to insert a factor of γ\ninto the Bellman consistency equation \n\nTheorem 1.1 to account for the discounting:\\begin{aligned}\n V^\\pi(s) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} \\cdots \\mid s_\\hi = s] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma V^\\pi(s')]\\\\\n Q^\\pi(s, a) &= \\E_{\\tau \\sim \\rho^\\pi} [r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s, a_\\hi = a] && \\text{for any } \\hi \\in \\mathbb{N} \\\\\n &= r(s, a) + \\gamma \\E_{\\substack{s' \\sim P(s, a) \\\\ a' \\sim \\pi(s')}} [Q^\\pi(s', a')]\n\\end{aligned}\n\nAttention\n\nHeuristically speaking, why does it no longer matter which\ntime step we condition on when defining the value function?","type":"content","url":"/mdps#value-functions-and-bellman-consistency","position":35},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl2":"Solving infinite-horizon MDPs"},"type":"lvl2","url":"/mdps#solving-infinite-horizon-mdps","position":36},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl2":"Solving infinite-horizon MDPs"},"content":"","type":"content","url":"/mdps#solving-infinite-horizon-mdps","position":37},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"The Bellman operator is a contraction mapping","lvl2":"Solving infinite-horizon MDPs"},"type":"lvl3","url":"/mdps#the-bellman-operator-is-a-contraction-mapping","position":38},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"The Bellman operator is a contraction mapping","lvl2":"Solving infinite-horizon MDPs"},"content":"Recall from \n\nDefinition 1.8 that the Bellman operator \\mathcal{J}^{\\pi}\nfor a policy π takes in a “value function” v : \\mathcal{S} \\to \\mathbb{R} and\nreturns the r.h.s. 
of the Bellman equation for that “value function”. In\nthe infinite-horizon setting, this is[\\mathcal{J}^{\\pi}(v)](s) := \\E_{\\substack{a \\sim \\pi(s) \\\\ s' \\sim P(s, a)}} [r(s, a) + \\gamma v(s')].\n\nThe crucial property of the Bellman operator is that it is a\ncontraction mapping for any policy. Intuitively, if we start with\ntwo “value functions” v, u : \\mathcal{S} \\to \\mathbb{R} and repeatedly apply the\nBellman operator to each of them, they will get closer and closer\ntogether at an exponential rate.\n\nContraction mapping\n\nLet X be some space with a norm \\|\\cdot\\|. We call an operator\nf: X \\to X a contraction mapping if for any x, y \\in X,\\|f(x) - f(y)\\| \\le \\gamma \\|x - y\\|\n\nfor some fixed \\gamma \\in (0, 1).\nIntuitively, this means that if two points are δ far apart,\nthen after applying the mapping, they will be at most γδ far apart.\n\nAttention\n\nShow that for a contraction mapping f with coefficient\nγ, for all t \\in \\mathbb{N},\\|f^{(t)}(x) - f^{(t)}(y)\\| \\le \\gamma^t \\|x - y\\|,\n\ni.e. that any\ntwo points will be pushed closer by at least a factor of γ at\neach iteration.\n\nIt is a powerful fact (known as the Banach fixed-point theorem) that\nevery contraction mapping has a unique fixed point x^\\star such\nthat f(x^\\star) = x^\\star. This means that if we repeatedly apply f\nto any starting point, we will eventually converge to x^\\star:\\|f^{(t)}(x) - x^\\star\\| \\le \\gamma^t \\|x - x^\\star\\|.\n\nLet’s return to the RL setting and apply this result to the Bellman\noperator. How can we measure the distance between two “value functions”\nv, u : \\mathcal{S} \\to \\mathbb{R}? We’ll take the supremum norm as our distance\nmetric:\\| v - u \\|_{\\infty} := \\sup_{s \\in \\mathcal{S}} |v(s) - u(s)|,\n\ni.e.\nwe compare the “value functions” on the state that causes the biggest\ngap between them. 
Then \n\n(1.36) implies that if we repeatedly\napply \\mathcal{J}^\\pi to any starting “value function”, we will eventually\nconverge to V^\\pi:\\|(\\mathcal{J}^\\pi)^{(t)}(v) - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v - V^\\pi\\|_{\\infty}.\n\nWe’ll use this useful fact to prove the convergence of several\nalgorithms later on.\n\nThe Bellman operator is a contraction mapping\\|\\mathcal{J}^{\\pi} (v) - \\mathcal{J}^{\\pi} (u) \\|_{\\infty} \\le \\gamma \\|v - u \\|_{\\infty}.\n\nProof of \n\nTheorem 1.4\n\nFor all states s \\in \\mathcal{S},\\begin{aligned}\n|[\\mathcal{J}^{\\pi} (v)](s) - [\\mathcal{J}^{\\pi} (u)](s)|&= \\Big| \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[ r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} v(s') \\right] \\\\\n&\\qquad - \\mathop{\\mathbb{E}}_{a \\sim \\pi(s)} \\left[r(s, a) + \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} u(s') \\right] \\Big| \\\\\n&= \\gamma \\left|\\mathop{\\mathbb{E}}_{s' \\sim P(s, a)} [v(s') - u(s')] \\right| \\\\\n&\\le \\gamma \\mathop{\\mathbb{E}}_{s' \\sim P(s, a)}|v(s') - u(s')| \\qquad \\text{(Jensen's inequality)} \\\\\n&\\le \\gamma \\max_{s'} |v(s') - u(s')| \\\\\n&= \\gamma \\|v - u \\|_{\\infty}.\n\\end{aligned}","type":"content","url":"/mdps#the-bellman-operator-is-a-contraction-mapping","position":39},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Policy evaluation in infinite-horizon MDPs","lvl2":"Solving infinite-horizon MDPs"},"type":"lvl3","url":"/mdps#policy-evaluation-in-infinite-horizon-mdps","position":40},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Policy evaluation in infinite-horizon MDPs","lvl2":"Solving infinite-horizon MDPs"},"content":"The backwards DP technique we used in \n\nthe finite-horizon case no\nlonger works since there is no “final timestep” to start from. We’ll\nneed another approach to policy evaluation.\n\nThe Bellman consistency conditions yield a system of equations we can\nsolve to evaluate a deterministic policy exactly. For a faster approximate solution,\nwe can iterate the policy’s Bellman operator, since we know that it has\na unique fixed point at the true value function.","type":"content","url":"/mdps#policy-evaluation-in-infinite-horizon-mdps","position":41},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl4":"Matrix inversion for deterministic policies","lvl3":"Policy evaluation in infinite-horizon MDPs","lvl2":"Solving infinite-horizon MDPs"},"type":"lvl4","url":"/mdps#matrix-inversion-for-deterministic-policies","position":42},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl4":"Matrix inversion for deterministic policies","lvl3":"Policy evaluation in infinite-horizon MDPs","lvl2":"Solving infinite-horizon MDPs"},"content":"Note that when the policy π is deterministic, the actions can be\ndetermined from the states, and so we can chop off the action dimension\nfor the rewards and state transitions:\\begin{aligned}\n r^{\\pi} &\\in \\mathbb{R}^{|\\mathcal{S}|} & P^{\\pi} &\\in [0, 1]^{|\\mathcal{S}| \\times |\\mathcal{S}|} & \\mu &\\in [0, 1]^{|\\mathcal{S}|} \\\\\n \\pi &\\in \\mathcal{A}^{|\\mathcal{S}|} & V^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}|} & Q^\\pi &\\in \\mathbb{R}^{|\\mathcal{S}| \\times |\\mathcal{A}|}.\n\\end{aligned}\n\nFor P^\\pi, we’ll treat the rows as the states and the\ncolumns as the next states. Then P^\\pi_{s, s'} is the probability of\ntransitioning from state s to state s' under policy π.\n\nTidying MDP\n\nThe tabular MDP from before has |\\mathcal{S}| = 2 and |\\mathcal{A}| = 2. 
Let’s write\ndown the quantities for the policy π that tidies if and only if the\nroom is messy:r^{\\pi} = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}, \\quad\n P^{\\pi} = \\begin{bmatrix} 0.7 & 0.3 \\\\ 1 & 0 \\end{bmatrix}, \\quad\n \\mu = \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix}\n\nWe’ll see how to\nevaluate this policy in the next section.\n\nThe Bellman consistency equation for a deterministic policy can be\nwritten in tabular notation asV^\\pi = r^\\pi + \\gamma P^\\pi V^\\pi.\n\n(Unfortunately, this notation doesn’t simplify the expression for\nQ^\\pi.) This system of equations can be solved with a matrix\ninversion:V^\\pi = (I - \\gamma P^\\pi)^{-1} r^\\pi.\n\nAttention\n\nNote we’ve assumed that I - \\gamma P^\\pi is invertible. Can you see\nwhy this is the case?\n\n(Recall that a linear operator, i.e. a square matrix, is invertible if\nand only if its null space is trivial; that is, it doesn’t map any\nnonzero vector to zero. In this case, we can see that I - \\gamma P^\\pi\nis invertible because it maps any nonzero vector to a vector with at\nleast one nonzero element.)\n\ndef eval_deterministic_infinite(\n mdp: MDP, policy: Float[Array, \"S A\"]\n) -> Float[Array, \" S\"]:\n pi = jnp.argmax(policy, axis=1) # un-one-hot\n P_π = mdp.P[jnp.arange(mdp.S), pi]\n r_π = mdp.r[jnp.arange(mdp.S), pi]\n return jnp.linalg.solve(jnp.eye(mdp.S) - mdp.γ * P_π, r_π)\n\nTidying policy evaluation\n\nLet’s use the same policy π that tidies if and only if the room is\nmessy. Setting \\gamma = 0.95, we must invertI - \\gamma P^{\\pi} = \\begin{bmatrix} 1 - 0.95 \\times 0.7 & - 0.95 \\times 0.3 \\\\ - 0.95 \\times 1 & 1 - 0.95 \\times 0 \\end{bmatrix} = \\begin{bmatrix} 0.335 & -0.285 \\\\ -0.95 & 1 \\end{bmatrix}.\n\nThe inverse to two decimal points is(I - \\gamma P^{\\pi})^{-1} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix}.\n\nThus the value function isV^{\\pi} = (I - \\gamma P^{\\pi})^{-1} r^{\\pi} = \\begin{bmatrix} 15.56 & 4.44 \\\\ 14.79 & 5.21 \\end{bmatrix} \\begin{bmatrix} 1 \\\\ 0 \\end{bmatrix} = \\begin{bmatrix} 15.56 \\\\ 14.79 \\end{bmatrix}.\n\nLet’s sanity-check this result. Since rewards are at most 1, the\nmaximum cumulative return of a trajectory is at most\n1/(1-\\gamma) = 20. We see that the value function is indeed slightly\nlower than this.\n\neval_deterministic_infinite(tidy_mdp_inf, tidy_policy_messy_only[0])\n\n","type":"content","url":"/mdps#matrix-inversion-for-deterministic-policies","position":43},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl4":"Iterative policy evaluation","lvl3":"Policy evaluation in infinite-horizon MDPs","lvl2":"Solving infinite-horizon MDPs"},"type":"lvl4","url":"/mdps#iterative-pe","position":44},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl4":"Iterative policy evaluation","lvl3":"Policy evaluation in infinite-horizon MDPs","lvl2":"Solving infinite-horizon MDPs"},"content":"The matrix inversion above takes roughly O(|\\mathcal{S}|^3) time.\nIt also only works for deterministic policies.\nCan we trade off the requirement of finding the exact value function for a faster\napproximate algorithm that will also extend to stochastic policies?\n\nLet’s use the Bellman operator to define an iterative algorithm for\ncomputing the value function. We’ll start with an initial guess\nv^{(0)} with elements in [0, 1/(1-\\gamma)] and then iterate the\nBellman operator:v^{(t+1)} = \\mathcal{J}^{\\pi}(v^{(t)}),\n\ni.e. v^{(t)} = (\\mathcal{J}^{\\pi})^{(t)} (v^{(0)}). 
Note that each iteration\ntakes O(|\\mathcal{S}|^2) time for the matrix-vector multiplication.\n\ndef supremum_norm(v):\n return jnp.max(jnp.abs(v)) # same as jnp.linalg.norm(v, jnp.inf)\n\n\ndef loop_until_convergence(op, v, ε=1e-6):\n \"\"\"Repeatedly apply op to v until convergence (in supremum norm).\"\"\"\n while True:\n v_new = op(v)\n if supremum_norm(v_new - v) < ε:\n return v_new\n v = v_new\n\n\ndef iterative_evaluation(mdp: MDP, pi: Float[Array, \"S A\"], ε=1e-6) -> Float[Array, \" S\"]:\n op = partial(bellman_operator, mdp, pi)\n return loop_until_convergence(op, jnp.zeros(mdp.S), ε)\n\nThen, as we showed in \n\n(1.38), by the Banach fixed-point theorem:\\|v^{(t)} - V^\\pi \\|_{\\infty} \\le \\gamma^{t} \\| v^{(0)} - V^\\pi\\|_{\\infty}.\n\niterative_evaluation(tidy_mdp_inf, tidy_policy_messy_only[0])\n\nConvergence of iterative policy evaluation\n\nHow many iterations do we need for an ε-accurate estimate? We\ncan work backwards to solve for t:\\begin{aligned}\n \\gamma^t \\|v^{(0)} - V^\\pi\\|_{\\infty} &\\le \\epsilon \\\\\n t &\\ge \\frac{\\log (\\epsilon / \\|v^{(0)} - V^\\pi\\|_{\\infty})}{\\log \\gamma} \\\\\n &= \\frac{\\log (\\|v^{(0)} - V^\\pi\\|_{\\infty} / \\epsilon)}{\\log (1 / \\gamma)},\n\\end{aligned}\n\nand so the number of iterations required for an\nε-accurate estimate isT = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).\n\nNote that we’ve applied the inequalities\n\\|v^{(0)} - V^\\pi\\|_{\\infty} \\le 1/(1-\\gamma) and\n\\log (1/x) \\ge 1-x.","type":"content","url":"/mdps#iterative-pe","position":45},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Optimal policies in infinite-horizon MDPs","lvl2":"Solving infinite-horizon MDPs"},"type":"lvl3","url":"/mdps#optimal-policies-in-infinite-horizon-mdps","position":46},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl3":"Optimal policies in infinite-horizon MDPs","lvl2":"Solving infinite-horizon MDPs"},"content":"Now let’s move on to solving for an optimal policy in the\ninfinite-horizon case. As in \n\nthe finite-horizon case, an optimal policy \\pi^\\star\nis one that does at least as well as any other policy in all situations.\nThat is, for all policies π, states s \\in \\mathcal{S}, times\n\\hi \\in \\mathbb{N}, and initial trajectories\n\\tau_\\hi = (s_0, a_0, r_0, \\dots, s_\\hi) where s_\\hi = s,\\begin{aligned}\n V^{\\pi^\\star}(s) &= \\E_{\\tau \\sim \\rho^{\\pi^{\\star}}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid s_\\hi = s] \\\\\n &\\ge \\E_{\\tau \\sim \\rho^{\\pi}}[r_\\hi + \\gamma r_{\\hi+1} + \\gamma^2 r_{\\hi+2} + \\cdots \\mid \\tau_\\hi]\n\\end{aligned}\n\nOnce again, all optimal policies share the same optimal value function V^\\star, and the greedy policy with respect to this value function\nis optimal.\n\nAttention\n\nVerify this by modifying the proof \n\nTheorem 1.3 from the finite-horizon case.\n\nSo how can we compute such an optimal policy? We can’t use the backwards\nDP approach from the finite-horizon case \n\nDefinition 1.11 since there’s no “final timestep” to start\nfrom. Instead, we’ll exploit the fact that the Bellman consistency\nequation \n\n(1.32) for the optimal value\nfunction doesn’t depend on any policy:V^\\star(s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^\\star(s'). \\right]\n\nAttention\n\nVerify this by substituting the greedy policy into the\nBellman consistency equation.\n\nAs before, thinking of the r.h.s. 
of \n\n(1.53) as an operator on value functions\ngives the Bellman optimality operator[\\mathcal{J}^{\\star}(v)](s) = \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v(s') \\right]\n\ndef bellman_optimality_operator(mdp: MDP, v: Float[Array, \" S\"]) -> Float[Array, \" S\"]:\n return jnp.max(mdp.r + mdp.γ * mdp.P @ v, axis=1)\n\n\ndef check_optimal(v: Float[Array, \" S\"], mdp: MDP):\n # the operator takes its arguments in the order (mdp, v)\n return jnp.allclose(v, bellman_optimality_operator(mdp, v))\n\n","type":"content","url":"/mdps#optimal-policies-in-infinite-horizon-mdps","position":47},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl4":"Value iteration","lvl3":"Optimal policies in infinite-horizon MDPs","lvl2":"Solving infinite-horizon MDPs"},"type":"lvl4","url":"/mdps#value-iteration","position":48},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl4":"Value iteration","lvl3":"Optimal policies in infinite-horizon MDPs","lvl2":"Solving infinite-horizon MDPs"},"content":"Since the optimal policy is still a policy, our result that the Bellman\noperator is a contracting map still holds, and so we can repeatedly\napply this operator to converge to the optimal value function! This\nalgorithm is known as value iteration.\n\ndef value_iteration(mdp: MDP, ε: float = 1e-6) -> Float[Array, \" S\"]:\n \"\"\"Iterate the Bellman optimality operator until convergence.\"\"\"\n op = partial(bellman_optimality_operator, mdp)\n return loop_until_convergence(op, jnp.zeros(mdp.S), ε)\n\n\n\nvalue_iteration(tidy_mdp_inf)\n\nNote that the runtime analysis for an ε-optimal value function\nis exactly the same as \n\niterative policy evaluation! This is because value iteration is simply\nthe special case of applying iterative policy evaluation to the\noptimal value function.\n\nAs the final step of the algorithm, to return an actual policy\n\\hat \\pi, we can simply act greedily with respect to the final iteration\nv^{(T)} of our above algorithm:\\hat \\pi(s) = \\arg\\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v^{(T)}(s') \\right].\n\nWe must be careful, though: the value function of this greedy policy,\nV^{\\hat \\pi}, is not the same as v^{(T)}, which need not even be a\nwell-defined value function for some policy!\n\nThe bound on the policy’s quality is actually quite loose: if\n\\|v^{(T)} - V^\\star\\|_{\\infty} \\le \\epsilon, then the greedy policy\n\\hat \\pi satisfies\n\\|V^{\\hat \\pi} - V^\\star\\|_{\\infty} \\le \\frac{2\\gamma}{1-\\gamma} \\epsilon,\nwhich might potentially be very large.\n\nGreedy policy value worsening\\|V^{\\hat \\pi} - V^\\star \\|_{\\infty} \\le \\frac{2 \\gamma}{1-\\gamma} \\|v - V^\\star\\|_{\\infty}\n\nwhere \\hat \\pi(s) = \\arg\\max_a q(s, a) is the greedy policy with respect toq(s, a) = r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} v(s').\n\nProof\n\nWe first have\\begin{aligned}\n V^{\\star}(s) - V^{\\hat \\pi}(s) &= Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))\\\\\n &= [Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\star}(s, \\hat \\pi(s))] + [Q^{\\star}(s, \\hat \\pi(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))].\n\\end{aligned}\n\nLet’s bound these two quantities separately.\n\nFor the first quantity, note that by the definition of \\hat \\pi, we haveq(s, \\hat \\pi(s)) \\ge q(s,\\pi^\\star(s)).\n\nLet’s add q(s, \\hat \\pi(s)) - q(s,\\pi^\\star(s)) \\ge 0 to the first term to get\\begin{aligned}\n Q^{\\star}(s,\\pi^\\star(s)) - Q^{\\star}(s, \\hat \\pi(s)) &\\le [Q^{\\star}(s,\\pi^\\star(s))- q(s,\\pi^\\star(s))] + [q(s, \\hat \\pi(s)) - Q^{\\star}(s, \\hat \\pi(s))] \\\\\n &= \\gamma \\E_{s' \\sim 
P(s, \\pi^{\\star}(s))} [ V^{\\star}(s') - v(s') ] + \\gamma \\E_{s' \\sim P(s, \\hat \\pi(s))} [ v(s') - V^{\\star}(s') ] \\\\\n &\\le 2 \\gamma \\|v - V^{\\star}\\|_{\\infty}.\n\\end{aligned}\n\nThe second quantity is bounded by\\begin{aligned}\n Q^{\\star}(s, \\hat \\pi(s)) - Q^{\\hat \\pi}(s, \\hat \\pi(s))\n &=\n \\gamma \\E_{s'\\sim P(s, \\hat \\pi(s))}\\left[ V^\\star(s') - V^{\\hat \\pi}(s') \\right] \\\\\n & \\leq \n \\gamma \\|V^{\\star} - V^{\\hat \\pi}\\|_\\infty\n\\end{aligned}\n\nand thus\\begin{aligned}\n \\|V^\\star - V^{\\hat \\pi}\\|_\\infty &\\le 2 \\gamma \\|v - V^{\\star}\\|_{\\infty} + \\gamma \\|V^{\\star} - V^{\\hat \\pi}\\|_\\infty \\\\\n \\|V^\\star - V^{\\hat \\pi}\\|_\\infty &\\le \\frac{2 \\gamma \\|v - V^{\\star}\\|_{\\infty}}{1-\\gamma}.\n\\end{aligned}\n\nSo in order to compensate and achieve \\|V^{\\hat \\pi} - V^{\\star}\\| \\le \\epsilon, we must have\\|v^{(T)} - V^\\star\\|_{\\infty} \\le \\frac{1-\\gamma}{2 \\gamma} \\epsilon.\n\nThis means, using \n\nRemark 1.2, we need to run value iteration forT = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{\\gamma}{\\epsilon (1-\\gamma)^2}\\right) \\right)\n\niterations to achieve an ε-accurate estimate of the optimal value function.","type":"content","url":"/mdps#value-iteration","position":49},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl4":"Policy iteration","lvl3":"Optimal policies in infinite-horizon MDPs","lvl2":"Solving infinite-horizon MDPs"},"type":"lvl4","url":"/mdps#policy-iteration","position":50},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl4":"Policy iteration","lvl3":"Optimal policies in infinite-horizon MDPs","lvl2":"Solving infinite-horizon MDPs"},"content":"Can we mitigate this “greedy worsening”? What if instead of approximating the optimal value function and then acting greedily by it at the very end, we iteratively improve the policy and value function together? This is the idea behind policy iteration. In each step, we simply set the policy to act greedily with respect to its own value function.\n\ndef policy_iteration(mdp: MDP, ε=1e-6) -> Float[Array, \"S A\"]:\n \"\"\"Iteratively improve the policy and value function.\"\"\"\n def op(pi):\n return v_to_greedy(mdp, eval_deterministic_infinite(mdp, pi))\n π_init = jnp.ones((mdp.S, mdp.A)) / mdp.A # uniform random policy\n return loop_until_convergence(op, π_init, ε)\n\n\n\npolicy_iteration(tidy_mdp_inf)\n\nAlthough PI appears more complex than VI, we’ll use the same contraction property \n\nTheorem 1.4 to show convergence. This will give us the same runtime bound as value iteration and iterative policy evaluation for an ε-optimal value function \n\nRemark 1.2, although in practice, PI often converges much faster.\n\nPolicy Iteration runtime and convergence\n\nWe aim to show that the number of iterations required for an\nε-accurate estimate of the optimal value function isT = O\\left( \\frac{1}{1-\\gamma} \\log\\left(\\frac{1}{\\epsilon (1-\\gamma)}\\right) \\right).\n\nThis bound follows from the contraction property \n\n(1.38):\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star \\|_{\\infty}.\n\nWe’ll prove that the iterates of PI respect the contraction property by\nshowing that the policies improve monotonically:V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s).\n\nThen we’ll use this to show\nV^{\\pi^{t+1}}(s) \\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s). 
Note that\\begin{aligned}\n [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s) &= \\max_a \\left[ r(s, a) + \\gamma \\E_{s' \\sim P(s, a)} V^{\\pi^{t}}(s') \\right] \\\\\n &= r(s, \\pi^{t+1}(s)) + \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} V^{\\pi^{t}}(s')\n\\end{aligned}\n\nSince\n[\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s) \\ge V^{\\pi^{t}}(s), we then have\\begin{aligned}\n V^{\\pi^{t+1}}(s) - V^{\\pi^{t}}(s) &\\ge V^{\\pi^{t+1}}(s) - \\mathcal{J}^{\\star} (V^{\\pi^{t}})(s) \\\\\n &= \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right].\n\\end{aligned}\n\nBut note that the\nexpression being averaged is the same as the expression on the l.h.s.\nwith s replaced by s'. So we can apply the same inequality\nrecursively to get\\begin{aligned}\n V^{\\pi^{t+1}}(s) - V^{\\pi^{t}}(s) &\\ge \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge \\gamma^2 \\E_{\\substack{s' \\sim P(s, \\pi^{t+1}(s)) \\\\ s'' \\sim P(s', \\pi^{t+1}(s'))}} \\left[V^{\\pi^{t+1}}(s'') - V^{\\pi^{t}}(s'') \\right]\\\\\n &\\ge \\cdots\n\\end{aligned}\n\nwhich implies that V^{\\pi^{t+1}}(s) \\ge V^{\\pi^{t}}(s)\nfor all s (since the r.h.s. converges to zero). We can then plug this\nback into\n\n\n(1.69)\nto get the desired result:\\begin{aligned}\n V^{\\pi^{t+1}}(s) - \\mathcal{J}^{\\star} (V^{\\pi^{t}})(s) &= \\gamma \\E_{s' \\sim P(s, \\pi^{t+1}(s))} \\left[V^{\\pi^{t+1}}(s') - V^{\\pi^{t}}(s') \\right] \\\\\n &\\ge 0 \\\\\n V^{\\pi^{t+1}}(s) &\\ge [\\mathcal{J}^{\\star}(V^{\\pi^{t}})](s)\n\\end{aligned}\n\nThis means we can now apply the Bellman convergence result \n\n(1.38) to get\\|V^{\\pi^{t+1}} - V^\\star \\|_{\\infty} \\le \\|\\mathcal{J}^{\\star} (V^{\\pi^{t}}) - V^{\\star}\\|_{\\infty} \\le \\gamma \\|V^{\\pi^{t}} - V^\\star \\|_{\\infty}.","type":"content","url":"/mdps#policy-iteration","position":51},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl2":"Summary"},"type":"lvl2","url":"/mdps#summary","position":52},{"hierarchy":{"lvl1":"1 Markov Decision Processes","lvl2":"Summary"},"content":"Markov decision processes (MDPs) are a framework for sequential\ndecision making under uncertainty. They consist of a state space\n\\mathcal{S}, an action space \\mathcal{A}, an initial state distribution\n\\mu \\in \\Delta(\\mathcal{S}), a transition function P(s' \\mid s, a), and a\nreward function r(s, a). They can be finite-horizon (ends after\nH timesteps) or infinite-horizon (where rewards scale by\n\\gamma \\in (0, 1) at each timestep).\n\nOur goal is to find a policy π that maximizes expected total\nreward. Policies can be deterministic or stochastic,\nstate-dependent or history-dependent, stationary or\ntime-dependent.\n\nA policy induces a distribution over trajectories.\n\nWe can evaluate a policy by computing its value function\nV^\\pi(s), which is the expected total reward starting from state\ns and following policy π. We can also compute the\nstate-action value function Q^\\pi(s, a), which is the expected\ntotal reward starting from state s, taking action a, and then\nfollowing policy π. In the finite-horizon setting, these also\ndepend on the timestep \\hi.\n\nThe Bellman consistency equation is an equation that the value\nfunction must satisfy. It can be used to solve for the value\nfunctions exactly. Thinking of the r.h.s. 
of this equation as an\noperator on value functions gives the Bellman operator.\n\nIn the finite-horizon setting, we can compute the optimal policy\nusing dynamic programming.\n\nIn the infinite-horizon setting, we can compute the optimal policy\nusing value iteration or policy iteration.","type":"content","url":"/mdps#summary","position":53},{"hierarchy":{"lvl1":"6 Policy Gradient Methods"},"type":"lvl1","url":"/pg","position":0},{"hierarchy":{"lvl1":"6 Policy Gradient Methods"},"content":"","type":"content","url":"/pg","position":1},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Introduction"},"type":"lvl2","url":"/pg#introduction","position":2},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Introduction"},"content":"The core task of RL is finding the optimal policy in a given environment.\nThis is essentially an optimization problem:\nout of some space of policies,\nwe want to find the one that achieves the maximum total reward (in expectation).\n\nIt’s typically intractable to compute the optimal policy exactly in some finite number of steps.\nInstead, policy optimization algorithms start from some randomly initialized policy,\nand then improve it step by step.\nWe’ve already seen some examples of these,\nnamely \n\nSection 1.5.3.2 for finite MDPs and \n\nSection 2.6.4 in continuous control.\n\nIn particular, we often use policies that can be described by some finite set of parameters.\nWe will see some examples in \n\nSection 3.1.\nFor such parameterized policies,\nwe can approximate the policy gradient:\nthe gradient of the expected total reward with respect to the parameters.\nThis tells us the direction the parameters should be updated to achieve a higher expected total reward.\nPolicy gradient methods are responsible for groundbreaking applications including AlphaGo, OpenAI Five, and large language models,\nmany of which use policies parameterized as deep neural networks.\n\nWe begin the chapter with a short review of gradient ascent,\na general optimization method.\n\nWe’ll then see how to estimate the policy gradient,\nenabling us to apply (stochastic) gradient ascent in the RL setting.\n\nThen we’ll explore some proximal optimization techniques that ensure the steps taken are “not too large”.\nThis is helpful to stabilize training and widely used in practice.\n\nfrom utils import plt, Array, Callable, jax, jnp, latexify\n\n","type":"content","url":"/pg#introduction","position":3},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Gradient Ascent"},"type":"lvl2","url":"/pg#gradient-ascent","position":4},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Gradient Ascent"},"content":"Note\n\nYou may have previously heard of gradient descent for minimizing functions.\nOptimization problems are usually posed as minimization problems by convention.\nHowever, in RL, we usually talk about maximizing the expected total reward,\nand so we perform gradient ascent instead.\n\nGradient ascent is a general optimization algorithm for any differentiable function.\nA suitable analogy for this algorithm is hiking up a mountain,\nwhere you keep taking steps in the steepest direction upwards.\nHere, your vertical position y is the function being optimized,\nand your horizontal position (x, z) is the input to the function.\nThe slope of the mountain at your current position is given by the gradient,\nwritten \\nabla y(x, z) \\in \\mathbb{R}^2.\n\ndef f(x, y):\n \"\"\"Himmelblau's function\"\"\"\n return (x**2 + y - 11)**2 + (x + y**2 - 7)**2\n\n# Create a grid of 
points\nx = jnp.linspace(-5, 5, 400)\ny = jnp.linspace(-5, 5, 400)\nX, Y = jnp.meshgrid(x, y)\nZ = f(X, Y)\n\n# Create the plot\nfig, ax = plt.subplots(figsize=(6, 6))\n\n# Plot the function using imshow\nimg = ax.imshow(Z, extent=[-5, 5, -5, 5], origin='lower')\n\n# Add color bar\nfig.colorbar(img, ax=ax)\n\n# Gradient computation using JAX\ntx, ty = 1.0, 1.0\ngx, gy = jax.grad(f, argnums=(0, 1))(tx, ty)\n\n# Scatter point\nax.scatter(tx, ty, color='red', s=100)\n\n# Add arrow representing the gradient\nax.arrow(tx, ty, gx * 0.01, gy * 0.01, head_width=0.3, head_length=0.3, fc='blue', ec='blue')\n\n# Add plot title\nax.set_title(\"Gradient ascent example\")\n\nplt.show()\n\nFor differentiable functions, this can be thought of as the vector of partial derivatives,\\nabla y(x, z) = \\begin{pmatrix}\n\\frac{\\partial y}{\\partial x} \\\\\n\\frac{\\partial y}{\\partial z}\n\\end{pmatrix}.\n\nTo calculate the slope (aka “directional derivative”) of the mountain in a given direction (\\Delta x, \\Delta z),\nyou take the dot product of the difference vector with the gradient.\nThis means that the direction with the highest slope is exactly the gradient itself,\nso we can describe the gradient ascent algorithm as follows:\n\nGradient ascent\\begin{pmatrix}\nx^{k+1} \\\\ z^{k+1}\n\\end{pmatrix}\n= \n\\begin{pmatrix}\nx^{k} \\\\ z^{k}\n\\end{pmatrix}\n+\n\\eta \\nabla y(x^{k}, z^{k})\n\nwhere k denotes the iteration of the algorithm and \\eta > 0 is a “step size” hyperparameter that controls the size of the steps we take.\n(Note that we could also vary the step size across iterations, that is, \\eta^0, \\dots, \\eta^K.)\n\nThe case of a two-dimensional input is easy to visualize.\nBut this idea can be straightforwardly extended to higher-dimensional inputs.\n\nFrom now on, we’ll use J to denote the function we’re trying to maximize,\nand θ to denote the parameters being optimized over. (In the above example, \\theta = \\begin{pmatrix} x & z \\end{pmatrix}^\\top).\n\nNotice that our parameters will stop changing once \\nabla J(\\theta) = 0.\nOnce we reach this stationary point, our current parameters are ‘locally optimal’ in some sense;\nit’s impossible to increase the function by moving in any direction.\nIf J is convex, then the only point where this happens is at the global optimum.\nOtherwise, if J is nonconvex, the best we can hope for is a local optimum.\n\nNote\n\nHow does a computer compute the gradient of a function?\n\nOne way is symbolic differentiation,\nwhich is similar to the way you might compute it by hand:\nthe computer applies a list of rules to transform the symbols involved.\nPython’s sympy package supports symbolic differentiation.\nHowever, functions implemented in code may not always have a straightforward symbolic representation.\n\nAnother way is numerical differentiation,\nwhich is based on the limit definition of a (directional) derivative:\\nabla_{\\boldsymbol{u}} J(\\boldsymbol{x}) = \\lim_{\\varepsilon \\to 0}\n\\frac{J(\\boldsymbol{x} + \\varepsilon \\boldsymbol{u}) - J(\\boldsymbol{x})}{\\varepsilon}\n\nThen, we can substitute a small value of \\varepsilon on the r.h.s. to approximate the directional derivative.\nHow small, though? If we need an accurate estimate,\nwe may need such a small value of \\varepsilon that typical computers will run into rounding errors.\nAlso, to compute the full gradient,\nwe would need to compute the r.h.s. 
once for each input dimension.\nThis is an issue if computing J is expensive.\n\nAutomatic differentiation achieves the best of both worlds.\nLike symbolic differentiation,\nwe manually implement the derivative rules for a few basic operations.\nHowever, instead of executing these on the symbols,\nwe execute them on the values when the function gets called,\nlike in numerical differentiation.\nThis allows us to differentiate through programming constructs such as branches or loops,\nand doesn’t involve any arbitrarily small values.\n\n\nBaydin et al. (2018) provides an accessible survey of automatic differentiation.\n\n","type":"content","url":"/pg#gradient-ascent","position":5},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl3":"Stochastic gradient ascent","lvl2":"Gradient Ascent"},"type":"lvl3","url":"/pg#stochastic-gradient-ascent","position":6},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl3":"Stochastic gradient ascent","lvl2":"Gradient Ascent"},"content":"In real applications,\ncomputing the gradient of the target function is not so simple.\nAs an example from supervised learning, J(\\theta) might be the sum of squared prediction errors across an entire training dataset.\nHowever, if our dataset is very large, it might not fit into our computer’s memory!\nIn these cases, we often compute some estimate of the gradient at each step, \\tilde \\nabla J(\\theta), and walk in that direction instead.\nThis is called stochastic gradient ascent.\nIn the SL example above, we might randomly choose a minibatch of samples and use them to estimate the true prediction error. (This approach is known as minibatch SGD.)\n\ndef sgd(\n theta_init: Array,\n estimate_gradient: Callable[[Array], Array],\n η: float,\n n_steps: int,\n):\n \"\"\"Perform `n_steps` steps of SGD.\n\n `estimate_gradient` eats the current parameters and returns an estimate of the objective function's gradient at those parameters.\n \"\"\"\n θ = theta_init\n for step in range(n_steps):\n θ += η * estimate_gradient(θ)\n return θ\n\nWhat makes one gradient estimator better than another?\nIdeally, we want this estimator to be unbiased; that is, on average, it matches a single true gradient step:\\E [\\tilde \\nabla J(\\theta)] = \\nabla J(\\theta).\n\nWe also want the variance of the estimator to be low so that its performance doesn’t change drastically at each step.\n\nWe can actually show that, for many “nice” functions, in a finite number of steps, SGD will find a θ that is “close” to a stationary point.\nIn another perspective, for such functions, the local “landscape” of J around θ becomes flatter and flatter the longer we run SGD.\n\nSGD convergence\n\nMore formally, suppose we run SGD for K steps, using an unbiased gradient estimator.\nLet the step size \\eta^k scale as O(1/\\sqrt{k}).\nThen if J is bounded and β-smooth (see below),\nand the norm of the gradient estimator has a bounded second moment \\sigma^2,\\|\\nabla J(\\theta^K)\\|^2 \\le O \\left( M \\beta \\sigma^2 / K\\right).\n\nWe call a function β-smooth if its gradient is Lipschitz continuous with constant β:\\|\\nabla J(\\theta) - \\nabla J(\\theta')\\| \\le \\beta \\|\\theta - \\theta'\\|.\n\nWe’ll now see a concrete application of gradient ascent in the context of policy optimization.\n\n","type":"content","url":"/pg#stochastic-gradient-ascent","position":7},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Policy (stochastic) gradient ascent"},"type":"lvl2","url":"/pg#policy-stochastic-gradient-ascent","position":8},{"hierarchy":{"lvl1":"6 
Policy Gradient Methods","lvl2":"Policy (stochastic) gradient ascent"},"content":"Remember that in RL, the primary goal is to find the optimal policy that achieves the maximimum total reward, which we can express using the value function we defined in \n\nDefinition 1.6:\\begin{aligned}\n J(\\pi) := \\E_{s_0 \\sim \\mu_0} V^{\\pi} (s_0) = & \\E_{\\tau \\sim \\rho^\\pi} \\sum_{\\hi=0}^{\\hor-1} r(s_\\hi, a_\\hi)\n\\end{aligned}\n\nwhere \\rho^\\pi is the distribution over trajectories induced by π (see \n\nDefinition 1.5).\n\n(Note that we’ll continue to work in the undiscounted, finite-horizon case. Analogous results hold for the discounted, infinite-horizon setup.)\n\nAs shown by the notation, this is exactly the function J that we want to maximize using gradient ascent.\nWhat variables are we optimizing over in this problem?\nWell, the objective function J is a function of the policy π,\nbut in general, π is a function,\nand optimizing over the entire space of arbitrary input-output mappings would be intractable.\nInstead, we need to describe π in terms of some finite set of parameters θ.\n\n","type":"content","url":"/pg#policy-stochastic-gradient-ascent","position":9},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl3":"Example policy parameterizations","lvl2":"Policy (stochastic) gradient ascent"},"type":"lvl3","url":"/pg#parameterizations","position":10},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl3":"Example policy parameterizations","lvl2":"Policy (stochastic) gradient ascent"},"content":"What are some ways we could parameterize our policy?\n\nTabular representation\n\nIf both the state and action spaces are finite, perhaps we could simply learn a preference value \\theta_{s,a} for each state-action pair.\nThen to turn this into a valid distribution, we perform a softmax operation:\nwe exponentiate each of them,\nand then normalize to form a valid distribution:\\pi^\\text{softmax}_\\theta(a | s) = \\frac{\\exp(\\theta_{s,a})}{\\sum_{s,a'} \\exp (\\theta_{s,a'})}.\n\nHowever, this doesn’t make use of any structure in the states or actions,\nso while this is flexible, it is also prone to overfitting.\n\nLinear in features\n\nAnother approach is to map each state-action pair into some feature space \\phi(s, a) \\in \\mathbb{R}^p. Then, to map a feature vector to a probability, we take a linear combination of the features and take a softmax:\\pi^\\text{linear in features}_{\\theta}(a|s) = \\frac{\\exp(\\theta^\\top \\phi(s, a))}{\\sum_{a'} \\exp(\\theta^\\top \\phi(s, a'))}.\n\nAnother interpretation is that θ represents the feature vector of the “desired” state-action pair, as state-action pairs whose features align closely with θ are given higher probability.\n\nNeural policies\n\nMore generally, we could map states and actions to unnormalized scores via some parameterized function f_\\theta : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R}, such as a neural network, and choose actions according to a softmax: \\pi^\\text{general}_\\theta(a|s) = \\frac{\\exp(f_{\\theta}(s,a))}{\\sum_{a'} \\exp(f_{\\theta}(s,a'))}.\n\nDiagonal Gaussian policies for continuous action spaces\n\nConsider a continuous n-dimensional action space \\mathcal{A} = \\mathbb{R}^n. Then for a stochastic policy, we could use a function to predict the mean action and then add some random noise about it. 
For example, we could use a neural network to predict the mean action \\mu_\\theta(s) and then add some noise \\epsilon \\sim \\mathcal{N}(0, \\sigma^2 I) to it:\\pi_\\theta(a|s) = \\mathcal{N}(\\mu_\\theta(s), \\sigma^2 I). **Exercise:** Can you extend the \"linear in features\" policy to continuous action spaces in a similar way? \n\nNow that we have seen some examples of parameterized policies,\nwe will write the total reward in terms of the parameters,\noverloading notation and letting \\rho_\\theta := \\rho^{\\pi_\\theta}:J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} R(\\tau)\n\nwhere R(\\tau) = \\sum_{\\hi=0}^{\\hor-1} r(s_\\hi, a_\\hi) denotes the total reward in the trajectory.\n\nNow how do we maximize this function (the expected total reward) over the parameters?\nOne simple idea would be to directly apply gradient ascent:\\theta^{k+1} = \\theta^k + \\eta \\nabla J(\\theta^k).\n\nIn order to apply this technique, we need to be able to evaluate the gradient \\nabla J(\\theta).\nBut J(\\theta) is very difficult, or even intractable, to compute exactly, since it involves taking an expectation over all possible trajectories \\tau.\nCan we rewrite it in a form that’s more convenient to implement?\n\n","type":"content","url":"/pg#parameterizations","position":11},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl3":"Importance Sampling","lvl2":"Policy (stochastic) gradient ascent"},"type":"lvl3","url":"/pg#importance-sampling","position":12},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl3":"Importance Sampling","lvl2":"Policy (stochastic) gradient ascent"},"content":"There is a general trick called importance sampling for evaluating difficult expectations.\nSuppose we want to estimate \\E_{x \\sim p}[f(x)] where p is hard or expensive to sample from,\nbut easy to evaluate the likelihood p(x) of.\nSuppose that we can easily sample from a different distribution q.\nSince an expectation is just a weighted average, we can sample x from q, compute f(x), and then reweight the results:\nif x is very likely under p but unlikely under q,\nwe should boost its weighting,\nand if it is common under q but uncommon under p,\nwe should lower its weighting.\nThe reweighting factor is exactly the likelihood ratio between the target distribution p and the sampling distribution q:\\E_{x \\sim p}[f(x)] = \\sum_{x \\in \\mathcal{X}} f(x) p(x) = \\sum_{x \\in \\mathcal{X}} f(x) \\frac{p(x)}{q(x)} q(x) = \\E_{x \\sim q} \\left[ \\frac{p(x)}{q(x)} f(x) \\right].\n\nDoesn’t this seem too good to be true? If there were no drawbacks, we could use this to estimate any expectation of any function on any arbitrary distribution! The drawback is that the variance may be very large due to the likelihood ratio term.\nIf there are values of x that are very rare in the sampling distribution q,\nbut common under p,\nthen the likelihood ratio p(x)/q(x) will cause the variance to blow up.","type":"content","url":"/pg#importance-sampling","position":13},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"The REINFORCE policy gradient"},"type":"lvl2","url":"/pg#the-reinforce-policy-gradient","position":14},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"The REINFORCE policy gradient"},"content":"Returning to RL, suppose there is some trajectory distribution \\rho(\\tau) that is easy to sample from, such as a database of existing trajectories.\nWe can then rewrite \\nabla J(\\theta), a.k.a. 
the policy gradient, as follows.\nAll gradients are being taken with respect to θ.\\begin{aligned}\n \\nabla J(\\theta) & = \\nabla \\E_{\\tau \\sim \\rho_\\theta} [ R(\\tau) ] \\\\\n & = \\nabla \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{likelihood ratio trick} \\\\\n & = \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\nabla \\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{switching gradient and expectation}\n\\end{aligned}\n\nNote that for \\rho = \\rho_\\theta, the inside term becomes\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} [ \\nabla \\log \\rho_\\theta(\\tau) \\cdot R(\\tau)].\n\n(The order of operations is \\nabla (\\log \\rho_\\theta)(\\tau).)\n\nRecall that when the state transitions are Markov (i.e. s_{t} only depends on s_{t-1}, a_{t-1}) and the policy is time-homogeneous (i.e. a_\\hi \\sim \\pi_\\theta (s_\\hi)), we can write out the likelihood of a trajectory under the policy \\pi_\\theta autoregressively, as in \n\nDefinition 1.5. Taking the log of the trajectory likelihood turns it into a sum of terms:\\log \\rho_\\theta(\\tau) = \\log \\mu(s_0) + \\sum_{\\hi=0}^{\\hor-1} \\log \\pi_\\theta(a_\\hi \\mid s_\\hi) + \\log P(s_{\\hi+1} \\mid s_\\hi, a_\\hi)\n\nWhen we take the gradient with respect to the parameters θ,\nonly the \\pi_\\theta(a_\\hi | s_\\hi) terms depend on θ.\nThis gives the following expression for the policy gradient, known as the “REINFORCE” policy gradient \n\nWilliams (1992):\\begin{aligned}\n \\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) R(\\tau) \\right]\n\\end{aligned}\n\nThis expression allows us to estimate the gradient by sampling a few sample trajectories from \\pi_\\theta,\ncalculating the likelihoods of the chosen actions,\nand substituting these into the expression inside the brackets of \n\n(18).\nThen we can update the parameters θ in this direction to perform stochastic gradient ascent.\n\nThe rest of this chapter investigates ways to reduce the variance of this estimator by subtracting off certain correlated quantities.\n\nNote\n\nHere is an alternative, intuitive presentation of \n\n(18).\n\nIntuitively speaking,\nwe want to update the policy parameters to maximize the probability of taking optimal actions.\nThat is, suppose we are in state s, and a^\\star is an optimal action to take.\nThen we want to solve \\theta = \\arg\\max_{\\theta'} \\pi_{\\theta'}(a^\\star \\mid s),\nwhich would lead to the gradient ascent expression\\theta \\gets \\theta + \\nabla \\pi_{\\theta}(a^\\star \\mid s).\n\nHowever, we don’t know the optimal action a^\\star in practice.\nSo instead, we must try many actions,\nand increase the probability of the “good” ones\nand decrease the probability of the “bad” ones.\nSuppose A(s, a) is a measure of how good action a is in state s.\nThen we could write\\theta \\gets \\theta + \\sum_a \\pi_{\\theta}(a \\mid s) A(s, a) \\nabla \\pi_{\\theta}(a \\mid s).\n\nBut this has an issue: the size of each step doesn’t just depend on how good it is,\nbut also how often the policy takes it already.\nThis could lead to a positive feedback loop where likely actions become more and more likely,\nwithout respect to the quality of the action.\nSo we divide by the likelihood to cancel out this factor:\\theta \\gets \\theta + \\sum_a \\pi_{\\theta}(a \\mid s) A(s, a) \\frac{\\nabla \\pi_{\\theta}(a \\mid s)}{\\pi_{\\theta}(a \\mid s)}.\n\nBut once we simplify, and sum 
across timesteps, this becomes almost exactly the gradient written above!\\theta \\gets \\theta + \\mathbb{E}_{a \\sim \\pi_{\\theta}(\\cdot \\mid s)} [\\sum_{\\hi=0}^{\\hor-1} A(s_\\hi, a_\\hi) \\nabla \\log \\pi_{\\theta}(a_\\hi \\mid s_\\hi) ].\n\nWe will see later on what A concretely corresponds to.def estimate_gradient_reinforce_pseudocode(env, π, θ):\n τ = sample_trajectory(env, π(θ))\n gradient_hat = 0\n for s, a, r in τ:\n def policy_log_likelihood(θ):\n return log(π(θ)(s, a))\n gradient_hat += jax.grad(policy_log_likelihood)(θ) * τ.total_reward\n return gradient_hat\n\nFor some intuition into how this method works, recall that we update our parameters according to\\begin{aligned}\n \\theta_{t+1} &= \\theta_t + \\eta \\nabla J(\\theta_t) \\\\\n &= \\theta_t + \\eta \\E_{\\tau \\sim \\rho_{\\theta_t}} [\\nabla \\log \\rho_{\\theta_t}(\\tau) \\cdot R(\\tau)].\n\\end{aligned}\n\nConsider the “good” trajectories where R(\\tau) is large. Then θ gets updated so that these trajectories become more likely. To see why, recall that \\rho_{\\theta}(\\tau) is the likelihood of the trajectory τ under the policy \\pi_\\theta, so the gradient points in the direction that makes τ more likely.\n\n","type":"content","url":"/pg#the-reinforce-policy-gradient","position":15},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Baselines and advantages"},"type":"lvl2","url":"/pg#baselines-and-advantages","position":16},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Baselines and advantages"},"content":"A central idea from supervised learning is the bias-variance decomposition,\nwhich shows that the mean squared error of an estimator is the sum of its squared bias and its variance.\nThe REINFORCE gradient estimator \n\n(18) is already unbiased, meaning that its expectation over trajectories is the true policy gradient.\nCan we find ways to reduce its variance as well?\n\nAs a first step,\nconsider that the action taken at step t does not affect the reward from previous timesteps, since they’re already in the past.\nYou can also show rigorously that this is the case,\nand that we only need to consider the present and future rewards to calculate the policy gradient:\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) \\sum_{\\hi' = \\hi}^{\\hor-1} r(s_{\\hi'}, a_{\\hi'}) \\right]\n\nFurthermore, by a conditioning argument, we can replace the inner sum over remaining rewards with the policy’s Q-function,\nevaluated at the current state:\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) Q^{\\pi_\\theta}(s_{\\hi}, a_{\\hi}) \\right]\n\nExercise: Prove that this is equivalent to the previous definitions. 
What modification to the expression must be made for the discounted, infinite-horizon setting?\n\nWe can further reduce variance by subtracting a baseline function b_\\hi : \\mathcal{S} \\to \\mathbb{R} at each timestep \\hi.\nThis modifies the policy gradient as follows:\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{\\hi=0}^{H-1} \\nabla \\log \\pi_\\theta (a_\\hi | s_\\hi) \\left(\n Q^{\\pi_\\theta}(s_\\hi, a_\\hi)\n - b_\\hi(s_\\hi)\n \\right)\n \\right].\n\n(Again, you should try to prove that this equality still holds.)\nFor example, we might want b_\\hi to estimate the average reward-to-go at a given timestep:b_\\hi^\\theta = \\E_{\\tau \\sim \\rho_\\theta} R_\\hi(\\tau).\n\nAs a better baseline, we could instead choose the value function.\nNote that the random variable Q^\\pi_\\hi(s, a) - V^\\pi_\\hi(s),\nwhere the randomness is taken over the actions, is centered around zero.\n(Recall V^\\pi_\\hi(s) = \\E_{a \\sim \\pi} Q^\\pi_\\hi(s, a).)\nThis quantity matches the intuition given in \n\nNote 1:\nit is positive for actions that are better than average (in state s),\nand negative for actions that are worse than average.\nIn fact, this quantity has a particular name: the advantage function.\n\nAdvantage functionA^\\pi_\\hi(s, a) = Q^\\pi_\\hi(s, a) - V^\\pi_\\hi(s)\n\nThis measures how much better the action a does than the policy’s average action in state s.\n(Note that for an optimal policy \\pi^\\star, the advantage of a given state-action pair is always zero or negative.)\n\nWe can now express the policy gradient as follows. Note that the advantage function effectively replaces the Q-function from \n\n(25):\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{\\hi=0}^{\\hor-1} \\nabla \\log \\pi_\\theta(a_\\hi | s_\\hi) A^{\\pi_\\theta}_\\hi (s_\\hi, a_\\hi)\n\\right].\n\nNote that to avoid correlations between the gradient estimator and the value estimator (i.e. 
baseline), we must estimate them with independently sampled trajectories: TODO could use more explanation _why_ we want to avoid correlations Policy gradient with a learned baseline \n\ndef pg_with_learned_baseline_pseudocode(env, π, η, θ_init, K, N):\n θ = θ_init\n for k in range(K):\n trajectories = sample_trajectories(env, π(θ), N)\n V_hat = fit(trajectories) # estimates the value function of π(θ)\n τ = sample_trajectories(env, π(θ), 1)\n g = jnp.zeros_like(θ) # gradient estimator\n\n for h, (s, a) in enumerate(τ):\n def log_likelihood(θ_):\n return jnp.log(π(θ_)(s, a))\n g = g + jax.grad(log_likelihood)(θ) * (return_to_go(τ, h) - V_hat(s))\n \n θ = θ + η * g\n return θ\n\nNote that you could also generalize this by allowing the learning rate η to vary across steps,\nor take multiple trajectories τ and compute the sample average of the gradient estimates.\n\nThe baseline estimation step fit can be done using any appropriate supervised learning algorithm.\nNote that the gradient estimator will be unbiased regardless of the baseline.\n\n","type":"content","url":"/pg#baselines-and-advantages","position":17},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Comparing policy gradient algorithms to policy iteration"},"type":"lvl2","url":"/pg#comparing-policy-gradient-algorithms-to-policy-iteration","position":18},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Comparing policy gradient algorithms to policy iteration"},"content":" TODO maybe restructure this part \n\nWhat advantages does the policy gradient algorithm have over the policy iteration algorithms covered in \n\nSection 1.5.3.2?\n\nPolicy iteration recap\n\nRecall that policy iteration is an algorithm for MDPs with unknown state transitions where we alternate between these two steps:\n\nEstimating the Q-function (or advantage function) of the current policy;\n\nUpdating the policy to be greedy with respect to this approximate Q-function (or advantage function).\n\nTo analyze the difference between them, we’ll make use of the performance difference lemma, which provides an expression for comparing the difference between two value functions.\n\nPerformance difference lemma\n\nSuppose Alice is playing a game (an MDP).\nBob is spectating, and can evaluate how good an action is compared to his own strategy.\n(That is, Bob can compute his advantage function A_\\hi^{\\text{Bob}}(s_\\hi, a_\\hi)).\nThe performance difference lemma says that Bob can now calculate exactly how much better or worse he is than Alice as follows:V_0^{\\text{Alice}}(s) - V_0^{\\text{Bob}}(s) = \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{h=0}^{H-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right]\n\nwhere \\rho_{\\text{Alice}, s} denotes the distribution over trajectories starting in state s when Alice is playing.\n\nTo see why, consider a specific step \\hi in the trajectory. We compute how much better actions from Bob are than the actions from Alice, on average.\nBut this is exactly the average Bob-advantage across actions from Alice, as described in the PDL!\n\nFormally, this corresponds to a nice telescoping simplification when we expand out the definition of the advantage function. Note that\\begin{aligned}\nA^\\pi_\\hi(s_\\hi, a_\\hi) &= Q^\\pi_\\hi(s_\\hi, a_\\hi) - V^\\pi_\\hi(s_\\hi) \\\\\n&= r_\\hi(s_\\hi, a_\\hi) + \\E_{s_{\\hi+1} \\sim P(s_\\hi, a_\\hi)} [V^\\pi_{\\hi+1}(s_{\\hi+1})] - V^\\pi_\\hi(s_\\hi)\n\\end{aligned}\n\nso expanding out the r.h.s. 
expression of \n\n(30) and grouping terms together gives\\begin{aligned}\n\\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right] &= \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} r_\\hi(s_\\hi, a_\\hi) \\right) + \\left( V^{\\text{Bob}}_1(s_1) + \\cdots + V^{\\text{Bob}}_\\hor(s_\\hor) \\right) - \\left( V^{\\text{Bob}_0}(s_0) + \\cdots + V^{\\text{Bob}}_{\\hor-1}(s_{\\hor-1}) \\right) \\right] \\\\\n&= V^{\\text{Alice}}_0(s) - V^{\\text{Bob}}_0(s)\n\\end{aligned}\n\nas desired. (Note that the “inner” expectation from expanding the advantage function has the same distribution as the outer one, so omitting it here is valid.)\n\nThe PDL gives insight into why fitted approaches such as PI don’t work as well in the “full” RL setting.\nTo see why, let’s consider a single iteration of policy iteration, where policy π gets updated to \\tilde \\pi. We’ll assume these policies are deterministic.\nSuppose the new policy \\tilde \\pi chooses some action with a negative advantage with respect to π.\nThat is, when acting according to π, taking the action from \\tilde \\pi would perform worse than expected.\nDefine \\Delta_\\infty to be the most negative advantage, that is, \\Delta_\\infty = \\min_{s \\in \\mathcal{S}} A^{\\pi}_\\hi(s, \\tilde \\pi(s)).\nPlugging this into the \n\nTheorem 1 gives\\begin{aligned}\nV_0^{\\tilde \\pi}(s) - V_0^{\\pi}(s) &= \\E_{\\tau \\sim \\rho_{\\tilde \\pi, s}} \\left[\n\\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\pi}(s_\\hi, a_\\hi)\n\\right] \\\\\n&\\ge H \\Delta_\\infty \\\\\nV_0^{\\tilde \\pi}(s) &\\ge V_0^{\\pi}(s) - H|\\Delta_\\infty|.\n\\end{aligned}\n\nThat is, for some state s, the lower bound on the performance of \\tilde \\pi is lower than the performance of π.\nThis doesn’t state that \\tilde \\pi will necessarily perform worse than π,\nonly suggests that it might be possible.\nIf these worst case states do exist, though,\nPI does not avoid situations where the new policy often visits them;\nIt does not enforce that the trajectory distributions \\rho_\\pi and \\rho_{\\tilde \\pi} be close to each other.\nIn other words, the “training distribution” that our prediction rule is fitted on, \\rho_\\pi, may differ significantly from the “evaluation distribution” \\rho_{\\tilde \\pi}. \nThis is an instance of *distributional shift*.\nTo begin, let's ask, where *do* fitted approaches work well?\nThey are commonly seen in SL,\nwhere a prediction rule is fit using some labelled training set,\nand then assessed on a test set from the same distribution.\nBut policy iteration isn't performed in the same scenario:\nthere is now _distributional shift_ between the different iterations of the policy. 
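\n\nTo make the telescoping argument concrete, here is a minimal sketch (not from the text; the two-state MDP, the random policies, and all names below are made up for illustration) that checks the performance difference lemma by exact computation on a tiny finite-horizon MDP:\n\nimport numpy as np\n\n# made-up tiny MDP: 2 states, 2 actions, horizon 2\nn_states, n_actions, H = 2, 2, 2\nrng = np.random.default_rng(0)\nP = rng.dirichlet(np.ones(n_states), size=(n_states, n_actions)) # P[s, a] is a distribution over next states\nr = rng.standard_normal((n_states, n_actions)) # reward r(s, a)\nalice = rng.dirichlet(np.ones(n_actions), size=(H, n_states)) # Alice's policy pi[h, s]\nbob = rng.dirichlet(np.ones(n_actions), size=(H, n_states)) # Bob's policy\n\ndef q_and_v(pi):\n    # backward induction for Q^pi_h and V^pi_h\n    Q = np.zeros((H, n_states, n_actions))\n    V = np.zeros((H + 1, n_states))\n    for h in reversed(range(H)):\n        Q[h] = r + P @ V[h + 1]\n        V[h] = (pi[h] * Q[h]).sum(axis=1)\n    return Q, V\n\nQ_bob, V_bob = q_and_v(bob)\n_, V_alice = q_and_v(alice)\nA_bob = Q_bob - V_bob[:H][:, :, None] # advantage A^Bob_h(s, a)\n\n# right-hand side of the PDL: expected sum of Bob-advantages along Alice's trajectories,\n# computed exactly by propagating Alice's state distribution forward from s_0 = 0\nd = np.zeros(n_states)\nd[0] = 1.0\nrhs = 0.0\nfor h in range(H):\n    rhs += (d[:, None] * alice[h] * A_bob[h]).sum()\n    d = (d[:, None, None] * alice[h][:, :, None] * P).sum(axis=(0, 1))\n\nlhs = V_alice[0, 0] - V_bob[0, 0]\nprint(lhs, rhs) # the two values should agree up to floating-point error, as the lemma predicts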
\n\nOn the other hand, policy gradient methods do, albeit implicitly,\nencourage \\rho_\\pi and \\rho_{\\tilde \\pi} to be similar.\nSuppose that the mapping from policy parameters to trajectory distributions is relatively smooth.\nThen, by adjusting the parameters only a small distance,\nthe new policy will also have a similar trajectory distribution.\nBut this is not very rigorous, and in practice the parameter-to-distribution mapping may not be so smooth.\nCan we constrain the distance between the resulting distributions more explicitly?\n\nThis brings us to the next three methods:\n\ntrust region policy optimization (TRPO), which explicitly constrains the difference between the distributions before and after each step;\n\nthe natural policy gradient (NPG), a first-order approximation of TRPO;\n\nproximal policy optimization (PPO), a “soft relaxation” of TRPO.\n\n","type":"content","url":"/pg#comparing-policy-gradient-algorithms-to-policy-iteration","position":19},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Trust region policy optimization"},"type":"lvl2","url":"/pg#trust-region-policy-optimization","position":20},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Trust region policy optimization"},"content":"We saw above that policy gradient methods are effective because they implicitly constrain how much the policy changes at each iteration.\nCan we design an algorithm that explicitly constrains the “step size”?\nThat is, we want to improve the policy as much as possible,\nmeasured in terms of the r.h.s. of the \n\nTheorem 1,\nwhile ensuring that its trajectory distribution does not change too much:\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta^{\\text{opt}}} \\E_{s_0, \\dots, s_{H-1} \\sim \\pi^{k}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi^{\\theta^\\text{opt}}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] \\\\\n& \\text{where } \\text{distance}(\\rho_{\\theta^{\\text{opt}}}, \\rho_{\\theta^k}) < \\delta\n\\end{aligned}\n\nNote that we have made a small change to the r.h.s. expression:\nwe use the states sampled from the old policy, and only use the actions from the new policy.\nIt would be computationally infeasible to sample entire trajectories from \\pi_\\theta as we are optimizing over θ.\nOn the other hand, if \\pi_\\theta returns a vector representing a probability distribution over actions,\nthen evaluating the expected advantage with respect to this distribution only requires taking a dot product.\nThis approximation also matches the r.h.s. of the PDL to first order in θ.\n(We will elaborate more on this later.)\n\nHow do we describe the distance between \\rho_{\\theta^{\\text{opt}}} and \\rho_{\\theta^k}?\nWe’ll use the Kullback-Leibler divergence (KLD):\n\nKullback-Leibler divergence\n\nFor two PDFs p, q,\\kl{p}{q} := \\E_{x \\sim p} \\left[ \\log \\frac{p(x)}{q(x)} \\right]\n\nThis can be interpreted in many different ways, many stemming from information theory.\nOne such interpretation is that \\kl{p}{q} describes my average “surprise” if I think data is being generated by q but it’s actually generated by p.\n(The surprise of an event with probability p is - \\log_2 p.)\nNote that \\kl{p}{q} = 0 if and only if p = q. 
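\n\nAs a quick numerical check, here is a minimal sketch (not from the text; the Bernoulli pair and the function name are made up for illustration) that evaluates \\kl{p}{q} for two Bernoulli distributions:\n\nimport numpy as np\n\ndef kl_bernoulli(p1, q1):\n    # sum over the two outcomes x in {0, 1} of p(x) log(p(x) / q(x))\n    return p1 * np.log(p1 / q1) + (1 - p1) * np.log((1 - p1) / (1 - q1))\n\nprint(kl_bernoulli(0.5, 0.5)) # 0.0, since the two distributions are identical\nprint(kl_bernoulli(0.9, 0.5)) # strictly positive\n\n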
Also note that it is generally not symmetric.\n\nBoth the objective function and the KLD constraint involve a weighted average over the space of all trajectories.\nThis is intractable in general, so we need to estimate the expectation.\nAs before, we can do this by taking an empirical average over samples from the trajectory distribution.\nThis gives us the following pseudocode:\n\nTrust region policy optimization (exact)def trpo_pseudocode(env, δ, θ_init, M, K):\n θ = θ_init\n for k in range(K):\n trajectories = sample_trajectories(env, π(θ), M)\n A_hat = fit(trajectories)\n \n def approximate_gain(θ_):\n total_advantage = 0\n for τ in trajectories:\n for s, _a, _r in τ:\n for a in env.action_space:\n total_advantage += π(θ_)(s, a) * A_hat(s, a)\n return total_advantage\n \n def constraint(θ_):\n kl_div = 0\n for τ in trajectories:\n for s, a, _r in τ:\n kl_div += jnp.log(π(θ)(s, a)) - jnp.log(π(θ_)(s, a))\n return kl_div <= δ\n \n θ = optimize(approximate_gain, constraint)\n\n return θ\nApplying importance sampling allows us to estimate the TRPO objective as follows:\n\nTrust region policy optimization (implementation)\n\nInitialize \\theta^0\n\nSample N trajectories from \\rho^k to learn a value estimator \\tilde b_\\hi(s) \\approx V^{\\pi^k}_\\hi(s)\n\nSample M trajectories \\tau_0, \\dots, \\tau_{M-1} \\sim \\rho^k\n\n\\begin{gathered}\n \\theta^{k+1} \\gets \\arg\\max_{\\theta} \\frac{1}{M} \\sum_{m=0}^{M-1} \\sum_{h=0}^{H-1} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} [ R_\\hi(\\tau_m) - \\tilde b_\\hi(s_\\hi) ] \\\\\n \\text{where } \\sum_{m=0}^{M-1} \\sum_{h=0}^{H-1} \\log \\frac{\\pi^k(a_\\hi^m \\mid s_\\hi^m)}{\\pi_\\theta(a_\\hi^m \\mid s_\\hi^m)} \\le \\delta\n\\end{gathered}\n\nThe above isn’t entirely complete:\nwe still need to solve the actual optimization problem at each step.\nUnless we know additional properties of the problem,\nthis might be an intractable optimization.\nDo we need to solve it exactly, though?\nInstead, if we assume that both the objective function and the constraint are somewhat smooth in terms of the policy parameters,\nwe can use their Taylor expansions to give us a simpler optimization problem with a closed-form solution.\nThis brings us to the natural policy gradient algorithm.\n\n","type":"content","url":"/pg#trust-region-policy-optimization","position":21},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Natural policy gradient"},"type":"lvl2","url":"/pg#natural-policy-gradient","position":22},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Natural policy gradient"},"content":"We take a linear (first-order) approximation to the objective function and a quadratic (second-order) approximation to the KL divergence constraint about the current estimate \\theta^k.\nThis results in the optimization problem\\begin{gathered}\n \\max_\\theta \\nabla_\\theta J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) \\\\\n \\text{where } \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) \\le \\delta\n\\end{gathered}\n\nwhere F_{\\theta^k} is the Fisher information matrix defined below.\n\nFisher information matrix\n\nLet p_\\theta denote a parameterized distribution.\nIts Fisher information matrix F_\\theta can be defined equivalently as:\\begin{aligned}\n F_{\\theta} & = \\E_{x \\sim p_\\theta} \\left[ (\\nabla_\\theta \\log p_\\theta(x)) (\\nabla_\\theta \\log p_\\theta(x))^\\top \\right] & 
\\text{covariance matrix of the Fisher score} \\\\\n & = \\E_{x \\sim p_{\\theta}} [- \\nabla_\\theta^2 \\log p_\\theta(x)] & \\text{average Hessian of the negative log-likelihood}\n\\end{aligned}\n\nRecall that the Hessian of a function describes its curvature:\nfor a vector \\delta \\in \\Theta,\nthe quantity \\delta^\\top F_\\theta \\delta describes how rapidly the negative log-likelihood changes if we move by δ.\nThe Fisher information matrix is precisely the Hessian of the KL divergence (with respect to either one of the parameters).\n\nIn particular, when p_\\theta = \\rho_{\\theta} denotes a trajectory distribution, we can further simplify the expression:F_{\\theta} = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{h=0}^{H-1} (\\nabla \\log \\pi_\\theta (a_\\hi \\mid s_\\hi)) (\\nabla \\log \\pi_\\theta(a_\\hi \\mid s_\\hi))^\\top \\right]\n\nNote that we’ve used the Markov property to cancel out the cross terms corresponding to two different time steps.\n\nThis is a convex optimization problem with a closed-form solution.\nTo see why, it helps to visualize the case where θ is two-dimensional:\nthe constraint describes the inside of an ellipse,\nand the objective function is linear,\nso we can find the extreme point on the boundary of the ellipse.\nWe recommend \n\nBoyd & Vandenberghe (2004) for a comprehensive treatment of convex optimization.\n\nMore generally, for a higher-dimensional θ,\nwe can compute the global optima by setting the gradient of the Lagrangian to zero:\\begin{aligned}\n \\mathcal{L}(\\theta, \\alpha) & = \\nabla J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) - \\alpha \\left[ \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) - \\delta \\right] \\\\\n \\nabla \\mathcal{L}(\\theta^{k+1}, \\alpha) & := 0 \\\\\n \\implies \\nabla J(\\pi_{\\theta^k}) & = \\alpha F_{\\theta^k} (\\theta^{k+1} - \\theta^k) \\\\\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k}) \\\\\n \\text{where } \\eta & = \\sqrt{\\frac{2 \\delta}{\\nabla J(\\pi_{\\theta^k})^\\top F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k})}}\n\\end{aligned}\n\nThis gives us the closed-form update.\nNow the only challenge is to estimate the Fisher information matrix,\nsince, as with the KL divergence constraint, it is an expectation over trajectories, and computing it exactly is therefore typically intractable.\n\nNatural policy gradient\n\nHow many trajectory samples do we need to accurately estimate the Fisher information matrix?\nAs a rule of thumb, the sample complexity should scale with the dimension of the parameter space.\nThis makes this approach intractable in the deep learning setting where we might have a very large number of parameters.\n\nAs you can see, the NPG is the “basic” policy gradient algorithm we saw above,\nbut with the gradient transformed by the inverse Fisher information matrix.\nThis matrix can be understood as accounting for the geometry of the parameter space.\nThe typical gradient descent algorithm implicitly measures distances between parameters using the typical Euclidean distance.\nHere, where the parameters map to a distribution, using the natural gradient update is equivalent to optimizing over distribution space rather than parameter space,\nwhere distance between distributions is measured by the \n\nDefinition 3.\n\nNatural gradient on a simple problem\n\nLet’s step away from RL and consider the following optimization problem over Bernoulli distributions \\pi \\in \\Delta(\\{ 0, 1 \\}):\\begin{aligned}\n J(\\pi) & = 
100 \\cdot \\pi(1) + 1 \\cdot \\pi(0)\n\\end{aligned}\n\nWe can think of the space of such distributions as the line between (0, 1) to (1, 0) on the Cartesian plane:\n\nClearly the optimal distribution is the constant one \\pi(1) = 1. Suppose we optimize over the parameterized family \\pi_\\theta(1) = \\frac{\\exp(\\theta)}{1+\\exp(\\theta)}.\nThen our optimization algorithm should set θ to be unboundedly large.\nThen the “vanilla” gradient is\\nabla_\\theta J(\\pi_\\theta) = \\frac{99 \\exp(\\theta)}{(1 + \\exp(\\theta))^2}.\n\nNote that as \\theta \\to \\infty that the increments get closer and closer to 0;\nthe rate of increase becomes exponentially slow.\n\nHowever, if we compute the Fisher information “matrix” (which is just a scalar in this case), we can account for the geometry induced by the parameterization.\\begin{aligned}\n F_\\theta & = \\E_{x \\sim \\pi_\\theta} [ (\\nabla_\\theta \\log \\pi_\\theta(x))^2 ] \\\\\n & = \\frac{\\exp(\\theta)}{(1 + \\exp(\\theta))^2}.\n\\end{aligned}\n\nThis gives the natural gradient update\\begin{aligned}\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla_ \\theta J(\\theta^k) \\\\\n & = \\theta^k + 99 \\eta\n\\end{aligned}\n\nwhich increases at a constant rate, i.e. improves the objective more quickly than “vanilla” gradient ascent.\n\nThough the NPG now gives a closed-form optimization step,\nit requires computing the inverse Fisher information matrix,\nwhich typically scales as O((\\dim \\Theta)^3).\nThis can be expensive if the parameter space is large.\nCan we find an algorithm that works in linear time with respect to the dimension of the parameter space?\n\n","type":"content","url":"/pg#natural-policy-gradient","position":23},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Proximal policy optimization"},"type":"lvl2","url":"/pg#proximal-policy-optimization","position":24},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Proximal policy optimization"},"content":"We can relax the TRPO optimization problem in a different way:\nRather than imposing a hard constraint on the KL distance,\nwe can instead impose a soft constraint by incorporating it into the objective and penalizing parameter values that drastically change the trajectory distribution.\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta} \\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\kl{\\rho_{\\theta}}{\\rho_{\\theta^k}}\n\\end{aligned}\n\nHere λ is a regularization hyperparameter that controls the tradeoff between the two terms.\nThis is the objective of the proximal policy optimization algorithm \n\nSchulman et al. (2017).\n\nLike the original TRPO algorithm \n\nDefinition 4,\nPPO is not gradient-based; rather, at each step, we try to maximize local advantage relative to the current policy.\n\nHow do we solve this optimization?\nLet us begin by simplifying the \\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}} term. 
Expanding gives\\begin{aligned}\n \\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}} & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[\\log \\frac{\\rho_{\\pi^k}(\\tau)}{\\rho_{\\pi_{\\theta}}(\\tau)}\\right] \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{\\pi^k(a_\\hi \\mid s_\\hi)}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] & \\text{state transitions cancel} \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] + c\n\\end{aligned}\n\nwhere c is some constant with respect to θ, and can be ignored.\nThis gives the objective\\ell^k(\\theta)\n=\n\\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right]\n\nOnce again, this takes an expectation over trajectories.\nBut here we cannot directly sample trajectories from \\pi^k,\nsince in the first term, the actions actually come from \\pi_\\theta.\nTo make this term line up with the other expectation,\nwe would need the actions to also come from \\pi^k.\n\nThis should sound familiar:\nwe want to estimate an expectation over one distribution by sampling from another.\nWe can once again use \n\nSection 3.2 to rewrite the inner expectation:\\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)\n=\n\\E_{a_\\hi \\sim \\pi^k(s_\\hi)} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)\n\nNow we can combine the expectations together to get the objective\\ell^k(\\theta) = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\left( \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^k}(s_\\hi, a_\\hi) - \\lambda \\log \\frac{1}{\\pi_\\theta(a_\\hi \\mid s_\\hi)} \\right) \\right]\n\nNow we can estimate this function by a sample average over trajectories from \\pi^k.\nRemember that to complete a single iteration of PPO,\nwe execute\\theta^{k+1} \\gets \\arg\\max_{\\theta} \\ell^k(\\theta).\n\nIf \\ell^k is differentiable, we can optimize it by gradient ascent, completing a single iteration of PPO.def ppo_pseudocode(\n env,\n π: Callable[[Params], Callable[[State, Action], Float]],\n λ: float,\n θ_init: Params,\n n_iters: int,\n n_fit_trajectories: int,\n n_sample_trajectories: int,\n):\n θ = θ_init\n for k in range(n_iters):\n fit_trajectories = sample_trajectories(env, π(θ), n_fit_trajectories)\n A_hat = fit(fit_trajectories)\n\n sampled_trajectories = sample_trajectories(env, π(θ), n_sample_trajectories)\n \n def objective(θ_opt):\n total_objective = 0\n for τ in sampled_trajectories:\n for s, a, _r in τ:\n total_objective += π(θ_opt)(s, a) / π(θ)(s, a) * A_hat(s, a) + λ * jnp.log(π(θ_opt)(s, a))\n return total_objective / n_sample_trajectories\n \n θ = optimize(objective, θ)\n\n return θ","type":"content","url":"/pg#proximal-policy-optimization","position":25},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Summary"},"type":"lvl2","url":"/pg#summary","position":26},{"hierarchy":{"lvl1":"6 Policy Gradient Methods","lvl2":"Summary"},"content":"Policy gradient methods are a powerful family of algorithms that directly optimize the expected total reward by iteratively updating the policy parameters.\nPrecisely,\nwe estimate the gradient of the expected total reward (with respect to the parameters),\nand update the 
parameters in that direction.\nBut estimating the gradient is a tricky task!\nWe saw many ways to reduce the variance of the gradient estimator,\nculminating in the advantage-based expression \n\n(29).\n\nBut updating the parameters doesn’t entirely solve the problem:\nSometimes, a small step in the parameters might lead to a big step in the policy.\nTo avoid changing the policy too much at each step,\nwe must account for the curvature in the parameter space.\nWe first did this explicitly with \n\nDefinition 4,\nand then saw ways to relax the constraint in \n\nDefinition 6 and \n\nSection 9.\n\nThese are still popular methods to this day,\nespecially because they efficiently integrate with deep neural networks for representing complex functions.","type":"content","url":"/pg#summary","position":27},{"hierarchy":{"lvl1":"8 Tree Search Methods"},"type":"lvl1","url":"/planning","position":0},{"hierarchy":{"lvl1":"8 Tree Search Methods"},"content":"","type":"content","url":"/planning","position":1},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"Introduction"},"type":"lvl2","url":"/planning#introduction","position":2},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"Introduction"},"content":"Have you ever lost a strategy game against a skilled opponent?\nIt probably seemed like they were ahead of you at every turn.\nThey might have been planning ahead and anticipating your actions,\nthen planning around them in order to win.\nIf this opponent was a computer,\nthey might have been using one of the strategies that we are about to explore.","type":"content","url":"/planning#introduction","position":3},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"Deterministic, zero sum, fully observable two-player games"},"type":"lvl2","url":"/planning#deterministic-zero-sum-fully-observable-two-player-games","position":4},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"Deterministic, zero sum, fully observable two-player games"},"content":"In this chapter, we will focus on games that are:\n\ndeterministic,\n\nzero sum (one player wins and the other loses),\n\nfully observable, that is, the state of the game is perfectly known by both players,\n\nfor two players that alternate turns,\n\nWe can represent such a game as a complete game tree.\nEach possible state is a node in the tree,\nand since we only consider deterministic games,\nwe can represent actions as edges leading from the current state to the next.\nEach path through the tree, from root to leaf, represents a single game.\n\n\n\nThe first two layers of the complete game tree of tic-tac-toe.\nFrom Wikimedia.\n\nIf you could store the complete game tree on a computer,\nyou would be able to win every potentially winnable game\nby searching all paths from your current state and taking a winning move.\nWe will see an explicit algorithm for this in \n\nthe next section.\nHowever, as games become more complex,\nit becomes computationally impossible to search every possible path.\n\nFor instance,\na chess player has roughly 30 actions to choose from at each turn,\nand each game takes roughly 40 moves per player,\nso trying to solve chess exactly using minimax\nwould take somewhere on the order of 30^{80} \\approx 10^{118} operations.\nThat’s 10 billion billion billion billion billion billion billion billion billion billion billion billion billion operations.\nAs of the time of writing,\nthe fastest processor can achieve almost 10 GHz (10 billion operations per second),\nso to fully solve chess using minimax is many, many orders of magnitude 
out of reach.\n\nIt is thus intractable, in any realistic setting, to solve the complete game tree exactly.\nLuckily, only a small fraction of those games ever occur in reality;\nLater in this chapter,\nwe will explore ways to prune away parts of the tree that we know we can safely ignore.\nWe can also approximate the value of a state without fully evaluating it.\nUsing these approximations, we can no longer guarantee winning the game,\nbut we can come up with strategies that will do well against most opponents.","type":"content","url":"/planning#deterministic-zero-sum-fully-observable-two-player-games","position":5},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl3":"Notation","lvl2":"Deterministic, zero sum, fully observable two-player games"},"type":"lvl3","url":"/planning#notation","position":6},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl3":"Notation","lvl2":"Deterministic, zero sum, fully observable two-player games"},"content":"Let us now describe these games formally.\nWe’ll call the first player Max and the second player Min.\nMax seeks to maximize the final game score,\nwhile Min seeks to minimize the final game score.\n\nWe’ll use \\mathcal{S} to denote the set of all possible game states.\n\nThe game begins in some initial state s_0 \\in \\mathcal{S}.\n\nMax moves on even turn numbers h = 2n,\nand Min moves on odd turn numbers h = 2n+1,\nwhere n is a natural number.\n\nThe space of possible actions, \\mathcal{A}_h(s),\ndepends on the state itself, as well as whose turn it is.\n(For example, in tic-tac-toe, Max can only play Xs while Min can only play Os.)\n\nThe game ends after H total moves (which might be even or odd). We call the final state a terminal state.\n\nP denotes the state transitions, that is,\nP(s, a) denotes the resulting state when taking action a \\in \\mathcal{A}(s) in state s. We’ll assume that this function is time-homogeneous (a.k.a. stationary) and doesn’t change across timesteps.\n\nr(s) denotes the game score of the terminal state s.\nNote that this is some positive or negative value seen by both players:\nA positive value indicates Max winning, a negative value indicates Min winning, and a value of 0 indicates a tie.\n\nWe also call the sequence of states and actions a trajectory.\n\nAttention\n\nAbove, we suppose that the game ends after H total moves.\nBut most real games have a variable length.\nHow would you describe this?\n\nTic-tac-toe\n\nLet us frame tic-tac-toe in this setting.\n\nEach of the 9 squares is either empty, marked X, or marked O.\nSo there are |\\mathcal{S}| = 3^9 potential states.\nNot all of these may be reachable!\n\nThe initial state s_0 is the empty board.\n\nThe set of possible actions for Max in state s, \\mathcal{A}_{2n}(s), is the set of tuples (\\text{``X''}, i) where i refers to an empty square in s.\nSimilarly, \\mathcal{A}_{2n+1}(s) is the set of tuples (\\text{``O''}, i) where i refers to an empty square in s.\n\nWe can take H = 9 as the longest possible game length.\n\nP(s, a) for a nonterminal state s is simply the board with the symbol and square specified by a marked into s. Otherwise, if s is a terminal state, i.e. 
it already has three symbols in a row, the state no longer changes.\n\nr(s) at a terminal state is +1 if there are three Xs in a row, -1 if there are three Os in a row, and 0 otherwise.\n\nOur notation may remind you of \n\nMarkov decision processes.\nGiven that these games also involve a sequence of states and actions,\ncan we formulate them as finite-horizon MDPs?\nThe two settings are not exactly analogous,\nsince in MDPs we only consider a single policy,\nwhile these games involve two distinct players with opposite objectives.\nSince we want to analyze the behavior of both players at the same time,\ndescribing such a game as an MDP is more trouble than it’s worth.","type":"content","url":"/planning#notation","position":7},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"Min-max search *"},"type":"lvl2","url":"/planning#min-max-search","position":8},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"Min-max search *"},"content":"Important\n\nThe course (Fall 2024) does not cover min-max search.\nThis content is here to provide background on optimally solving these deterministic, zero-sum, two-player games.\n\nIn the introduction,\nwe claimed that we could win any potentially winnable game by looking ahead and predicting the opponent’s actions.\nThis would mean that each nonterminal state already has some predetermined game score,\nthat is, in each state,\nit is already “obvious” which player is going to win.\n\nLet V_\\hi^\\star(s) denote the game score under optimal play from both players starting in state s at time \\hi.\n\nMin-max search algorithmV_\\hi^{\\star}(s) = \\begin{cases}\nr(s) & \\hi = \\hor \\\\\n\\max_{a \\in \\mathcal{A}_\\hi(s)} V_{\\hi+1}^{\\star}(P(s, a)) & \\hi \\text{ is even and } \\hi < H \\\\\n\\min_{a \\in \\mathcal{A}_\\hi(s)} V_{\\hi+1}^{\\star}(P(s, a)) & \\hi \\text{ is odd and } \\hi < H \\\\\n\\end{cases}\n\nWe can compute this by starting at the terminal states,\nwhen the game’s outcome is known,\nand working backwards,\nassuming that Max chooses the action that leads to the highest score\nand Min chooses the action that leads to the lowest score.\n\nThis translates directly into a recursive depth-first search algorithm for searching the complete game tree.def minimax_search(s, player) -> Tuple[\"Action\", \"Value\"]:\n \"\"\"Return the value of the state (for Max) and the best action for Max to take.\"\"\"\n if env.is_terminal(s):\n return None, env.winner(s)\n\n if player is max:\n a_max, v_max = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), min)\n if v > v_max:\n a_max, v_max = a, v\n return a_max, v_max\n else:\n a_min, v_min = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), max)\n if v < v_min:\n a_min, v_min = a, v\n return a_min, v_min\n\nMin-max search for a simple game\n\nConsider a simple game with just two steps: Max chooses one of three possible actions (A, B, C),\nand then Min chooses one of three possible actions (D, E, F).\nThe combination leads to a certain integer outcome,\nshown in the table below:\n\n\n\nD\n\nE\n\nF\n\nA\n\n4\n\n-2\n\n5\n\nB\n\n-3\n\n3\n\n1\n\nC\n\n0\n\n3\n\n-1\n\nWe can visualize this as the following complete game tree,\nwhere each box contains the value V_\\hi^\\star(s) of that node.\nThe min-max values of the terminal states are already known:\n\nWe begin min-max search at the root,\nexploring each of Max’s actions.\nSuppose Max chooses action A.\nThen Min will choose action E to minimize the game score,\nmaking the value of this game node \\min(4, -2, 5) = 
-2.\n\nSimilarly, if Max chooses action B,\nthen Min will choose action D,\nand if Max chooses action C,\nthen Min will choose action F.\nWe can fill in the values of these nodes accordingly:\n\nThus, Max’s best move is to take action C,\nresulting in a game score of \\max(-2, -3, -1) = -1.","type":"content","url":"/planning#min-max-search","position":9},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl3":"Complexity of min-max search","lvl2":"Min-max search *"},"type":"lvl3","url":"/planning#complexity-of-min-max-search","position":10},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl3":"Complexity of min-max search","lvl2":"Min-max search *"},"content":"At each of the \\hor timesteps,\nthis algorithm iterates through the entire action space at that state,\nand therefore has a time complexity of n_A^{\\hor}\n(where n_A is the largest number of actions possibly available at once).\nThis makes the min-max algorithm impractical for even moderately sized games.\n\nBut do we need to compute the exact value of every possible state?\nInstead, is there some way we could “ignore” certain actions and their subtrees\nif we already know of better options?\nThe alpha-beta search makes use of this intuition.","type":"content","url":"/planning#complexity-of-min-max-search","position":11},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"Alpha-beta search"},"type":"lvl2","url":"/planning#alpha-beta-search","position":12},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"Alpha-beta search"},"content":"The intuition behind alpha-beta search is as follows:\nSuppose Max is in state s,\nand is considering whether to take action a or a'.\nIf at any point they find out that action a' is definitely worse than (or equal to) action a,\nthey don’t need to evaluate action a' any further.\n\nConcretely, we run min-max search as above,\nexcept now we keep track of two additional parameters \\alpha(s) and \\beta(s) while evaluating each state:\n\nStarting in state s, Max can achieve a game score of at least \\alpha(s) assuming Min plays optimally. That is, V^\\star_\\hi(s) \\ge \\alpha(s) at all points.\n\nAnalogously, starting in state s, Min can ensure a game score of at most \\beta(s) assuming Max plays optimally. 
That is, V^\\star_\\hi(s) \\le \\beta(s) at all points.\n\nSuppose we are evaluating V^\\star_\\hi(s),\nwhere it is Max’s turn (\\hi is even).\nWe update \\alpha(s) to be the highest minimax value achievable from s so far.\nThat is, the value of s is at least \\alpha(s).\nSuppose Max chooses action a, which leads to state s', in which it is Min’s turn.\nIf any of Min’s actions in s' achieve a value V^\\star_{\\hi+1}(s') \\le \\alpha(s),\nwe know that Max would not choose action a,\nsince they know that it is worse than whichever action gave the value \\alpha(s).\nSimilarly, to evaluate a state on Min’s turn,\nwe update \\beta(s) to be the lowest value achievable from s so far.\nThat is, the value of s is at most \\beta(s).\nSuppose Min chooses action a,\nwhich leads to state s' for Max.\nIf Max has any actions that do better than \\beta(s),\nthey would take it,\nmaking action a a suboptimal choice for Min.\n\nAlpha-beta search for a simple game\n\nLet us use the same simple game from \n\nExample 8.2.\nWe list the values of \\alpha(s), \\beta(s) in each node throughout the algorithm.\nThese values are initialized to -\\infty, +\\infty respectively.\nWe shade any squares that have not been visited by the algorithm,\nand we assume that actions are evaluated from left to right.\n\nSuppose Max takes action A. Let s' be the resulting game state.\nThe values of \\alpha(s') and \\beta(s')\nare initialized at the same values as the root state,\nsince we want to prune a subtree if there exists a better action at any step higher in the tree.\n\nThen we iterate through Min’s possible actions,\nupdating the value of \\beta(s') as we go.\n\n\n\n\nOnce the value of state s' is fully evaluated,\nwe know that Max can achieve a value of at least -2 starting from the root,\nand so we update \\alpha(s), where s is the root state:\n\nThen Max imagines taking action B. 
Again, let s' denote the resulting game state.\nWe initialize \\alpha(s') and \\beta(s') from the root:\n\nNow suppose Min takes action D, resulting in a value of -3.\nWe see that V^\\star_\\hi(s') = \\min(-3, x, y),\nwhere x and y are the values of the remaining two actions.\nBut since \\min(-3, x, y) \\le -3,\nwe know that the value of s' is at most -3.\nBut Max can achieve a better value of \\alpha(s') = -2 by taking action A,\nand so Max will never take action B,\nand we can prune the search here.\nWe will use dotted lines to indicate states that have been ruled out from the search:\n\nFinally, suppose Max takes action C.\nFor Min’s actions D and E,\nthere is still a chance that action C might outperform action A,\nso we continue expanding:\n\n\n\n\nFinally, we see that Min taking action F achieves the minimum value at this state.\nThis shows that optimal play is for Max to take action C,\nand Min to take action F.def alpha_beta_search(s, player, alpha, beta) -> Tuple[\"Action\", \"Value\"]:\n \"\"\"Return the value of the state (for Max) and the best action for Max to take.\"\"\"\n if env.is_terminal(s):\n return None, env.winner(s)\n\n if player is max:\n a_max, v_max = None, None\n for a in actions:\n _, v = alpha_beta_search(env.step(s, a), min, alpha, beta)\n if v > v_max:\n a_max, v_max = a, v\n alpha = max(alpha, v)\n if v_max >= beta:\n # we know Min will not choose the action that leads to this state\n return a_max, v_max\n return a_max, v_max\n\n else:\n a_min, v_min = None, None\n for a in actions:\n _, v = alpha_beta_search(env.step(s, a), max, alpha, beta)\n if v < v_min:\n a_min, v_min = a, v\n beta = min(beta, v)\n if v_min <= alpha:\n # we know Max will not choose the action that leads to this state\n return a_min, v_min\n return a_min, v_min\n\nHow do we choose what order to explore the branches?\nAs you can tell, this significantly affects the efficiency of the pruning algorithm.\nIf Max explores the possible actions in order from worst to best,\nthey will not be able to prune any branches at all!\nAdditionally, to verify that an action is suboptimal,\nwe must run the search recursively from that action,\nwhich ultimately requires traversing the tree all the way to a leaf node.\nThe longer the game might possibly last,\nthe more computation we have to run.\n\nIn practice, we can often use background information about the game to develop a heuristic for evaluating possible actions.\nIf a technique is based on background information or intuition,\nespecially if it isn’t rigorously justified,\nwe call it a heuristic.\n\nCan we develop heuristic methods for tree exploration that work for all sorts of games? 
Here's where we can incorporate the _reinforcement learning_ ","type":"content","url":"/planning#alpha-beta-search","position":13},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"Monte Carlo Tree Search"},"type":"lvl2","url":"/planning#monte-carlo-tree-search","position":14},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"Monte Carlo Tree Search"},"content":"The task of evaluating actions in a complex environment might seem familiar.\nWe’ve encountered this problem before in both the \n\nmulti-armed bandits setting and the \n\nMarkov decision process setting.\nNow we’ll see how to combine concepts from these to form a more general and efficient tree search heuristic called Monte Carlo Tree Search (MCTS).\n\nWhen a problem is intractable to solve exactly,\nwe often turn to approximate algorithms that sacrifice some accuracy in exchange for computational efficiency.\nMCTS also improves on alpha-beta search in this sense.\nAs the name suggests,\nMCTS uses Monte Carlo simulation, that is, collecting random samples and computing the sample statistics,\nin order to approximate the value of each action.\n\nAs before, we imagine a complete game tree in which each path represents an entire game.\nThe goal of MCTS is to assign values to only the game states that are relevant to the current game;\nwe gradually expand the tree at each move.\nFor comparison, in alpha-beta search,\nthe entire tree only needs to be solved once,\nand from then on,\nchoosing an action is as simple as taking a maximum over the previously computed values.\n\nThe crux of MCTS is approximating the win probability of a state by a sample probability.\nIn practice, MCTS is used for games with binary outcomes where r(s) \\in \\{ +1, -1 \\},\nand so this is equivalent to approximating the final game score.\nTo approximate the win probability from state s,\nMCTS samples random games starting in s and computes the sample proportion of those that the player wins.\n\nNote that, for a given state s,\nchoosing the best action a can be framed as a \n\nmulti-armed bandits problem,\nwhere each action corresponds to an arm,\nand the reward distribution of arm k is the distribution of the game score over random games after choosing that arm.\nThe most commonly used bandit algorithm in practice for MCTS is the \n\nUpper Confidence Bound (UCB) algorithm.\n\nSummary of UCB\n\nLet us quickly review the UCB bandit algorithm.\nFor each arm k, we track the sample mean\\hat \\mu^k_t = \\frac{1}{N_t^k} \\sum_{\\tau=0}^{t-1} \\ind{a_\\tau = k} r_\\tau\n\nof all rewards from that arm up to time t.\nThen we construct a confidence intervalC_t^k = [\\hat \\mu^k_t - B_t^k, \\hat \\mu^k_t + B_t^k],\n\nwhere B_t^k = \\sqrt{\\frac{\\ln(2 t / \\delta)}{2 N_t^k}} is given by Hoeffding’s inequality,\nso that with probability at least 1 - δ (where δ is some fixed parameter we choose),\nthe true mean \\mu^k lies within C_t^k.\nNote that B_t^k scales like \\sqrt{1/N^k_t},\ni.e. 
the more we have visited that arm,\nthe more confident we get about it,\nand the narrower the confidence interval.\n\nTo select an arm, we pick the arm with the highest upper confidence bound.\n\nThis means that, for each edge (corresponding to a state-action pair (s, a)) in the game tree,\nwe keep track of the statistics required to compute its UCB:\n\nHow many times it has been “visited” (N_t^{s, a})\n\nHow many of those visits resulted in victory (\\sum_{\\tau=0}^{t-1} \\ind{(s_\\tau, a_\\tau) = (s, a)} r_\\tau).\nLet us call this latter value W^{s, a}_t (for number of “wins”).\n\nWhat does t refer to in the above expressions?\nRecall t refers to the number of time steps elapsed in the bandit environment.\nAs mentioned above,\neach state s corresponds to its own bandit environment,\nand so t refers to N^s, that is,\nhow many actions have been taken from state s.\nThis term, N^s, gets incremented as the algorithm runs;\nfor simplicity, we won’t introduce another index to track how it changes.\n\nMonte Carlo tree search algorithm\n\nInputs:\n\nT, the number of iterations per move\n\n\\pi_{\\text{rollout}}, the rollout policy for randomly sampling games\n\nc, a positive value that encourages exploration\n\nTo choose a single move starting at state s_{\\text{start}},\nMCTS first tries to estimate the UCB values for each of the possible actions \\mathcal{A}(s_\\text{start}),\nand then chooses the best one.\nTo estimate the UCB values,\nit repeats the following four steps T times:\n\nSelection: We start at s = s_{\\text{start}}. Let τ be an empty list that we will use to track states and actions.\n\nUntil s has at least one action that hasn’t been taken:\n\nChoose a \\gets \\argmax_k \\text{UCB}^{s, k}, where\n\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}\n\nAppend (s, a) to τ\n\nSet s \\gets P(s, a)\n\nExpansion: Let s_\\text{new} denote the final state in τ (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from s_\\text{new}. Call it a_{\\text{new}}. Add it to τ.\n\nSimulation: Simulate a complete game episode by starting with the action a_{\\text{new}}\nand then playing according to \\pi_\\text{rollout}.\nThis results in the outcome r \\in \\{ +1, -1 \\}.\n\nBackup: For each (s, a) \\in \\tau:\n\nSet N^{s, a} \\gets N^{s, a} + 1\n\nW^{s, a} \\gets W^{s, a} + r\n\nSet N^s \\gets N^s + 1\n\nAfter T repeats of the above,\nwe return the action with the highest UCB value \n\n(8.4).\nThen play continues.\n\nBetween turns, we can keep the subtree whose statistics we have visited so far.\nHowever, the rest of the tree for the actions we did not end up taking gets discarded.\n\nThe application which brought the MCTS algorithm to fame was DeepMind’s AlphaGo \n\nSilver et al. 
(2016).\nSince then, it has been used in numerous applications ranging from games to automated theorem proving.\n\nHow accurate is this Monte Carlo estimation?\nIt depends heavily on the rollout policy \\pi_\\text{rollout}.\nIf the distribution \\pi_\\text{rollout} induces over games is very different from the distribution seen during real gameplay,\nwe might end up with a poor value approximation.","type":"content","url":"/planning#monte-carlo-tree-search","position":15},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl3":"Incorporating value functions and policies","lvl2":"Monte Carlo Tree Search"},"type":"lvl3","url":"/planning#incorporating-value-functions-and-policies","position":16},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl3":"Incorporating value functions and policies","lvl2":"Monte Carlo Tree Search"},"content":"To remedy this,\nwe might make use of a value function v : \\mathcal{S} \\to \\mathbb{R} that more efficiently approximates the value of a state.\nThen, we can replace the simulation step of \n\nMCTS with evaluating r = v(s_\\text{next}), where s_\\text{next} = P(s_\\text{new}, a_\\text{new}).\n\nWe might also make use of a “guiding” policy \\pi_\\text{guide} : \\mathcal{S} \\to \\triangle(\\mathcal{A}) that provides “intuition” as to which actions are more valuable in a given state.\nWe can scale the exploration term of \n\n(8.4) according to the policy’s outputs.\n\nPutting these together,\nwe can describe an updated version of MCTS that makes use of these value functions and policy:\n\nMonte Carlo tree search with policy and value functions\n\nInputs:\n\nT, the number of iterations per move\n\nv, a value function that evaluates how good a state is\n\n\\pi_\\text{guide}, a guiding policy that encourages certain actions\n\nc, a positive value that encourages exploration\n\nTo select a move in state s_\\text{start}, we repeat the following four steps T times:\n\nSelection: We start at s = s_{\\text{start}}. Let τ be an empty list that we will use to track states and actions.\n\nUntil s has at least one action that hasn’t been taken:\n\nChoose a \\gets \\argmax_k \\text{UCB}^{s, k}, where\n\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\cdot \\pi_\\text{guide}(a \\mid s) \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}\n\nAppend (s, a) to τ\n\nSet s \\gets P(s, a)\n\nExpansion: Let s_\\text{new} denote the final state in τ (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from s_\\text{new}. Call it a_{\\text{new}}. Add it to τ.\n\nSimulation: Let s_\\text{next} = P(s_\\text{new}, a_\\text{new}). Evaluate r = v(s_\\text{next}). This approximates the value of the game after taking the action a_\\text{new}.\n\nBackup: For each (s, a) \\in \\tau:\n\nN^{s, a} \\gets N^{s, a} + 1\n\nW^{s, a} \\gets W^{s, a} + r\n\nN^s \\gets N^s + 1\n\nWe finally return the action with the highest UCB value \n\n(8.5).\nThen play continues. 
As before, we can reuse the tree across timesteps.\n\nHow do we actually compute a useful \\pi_\\text{guide} and v?\nIf we have some existing dataset of trajectories,\nwe could use \n\nsupervised learning (that is, imitation learning)\nto generate a policy \\pi_\\text{guide} via behavioral cloning\nand learn v by regressing the game outcomes onto states.\nThen, plugging these into \n\nthe above algorithm\nresults in a stronger policy by using tree search to “think ahead”.\n\nBut we don’t have to stop at just one improvement step;\nwe could iterate this process via self-play.","type":"content","url":"/planning#incorporating-value-functions-and-policies","position":17},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl3":"Self-play","lvl2":"Monte Carlo Tree Search"},"type":"lvl3","url":"/planning#self-play","position":18},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl3":"Self-play","lvl2":"Monte Carlo Tree Search"},"content":"Recall the \n\npolicy iteration algorithm from the \n\nMDPs chapter.\nPolicy iteration alternates between policy evaluation (taking π and computing V^\\pi)\nand policy improvement (setting π to be greedy with respect to V^\\pi).\nAbove, we saw how MCTS can be thought of as a “policy improvement” operation:\nfor a given policy \\pi^0,\nwe can use it to guide MCTS,\nresulting in an algorithm that is itself a policy \\pi^0_\\text{MCTS} that maps from states to actions.\nNow, we can use \n\nbehavioral cloning\nto obtain a new policy \\pi^1 that imitates \\pi^0_\\text{MCTS}.\nWe can now use \\pi^1 to guide MCTS,\nand repeat.\n\nMCTS with self-play\n\nInput:\n\nA parameterized policy class \\pi_\\theta : \\mathcal{S} \\to \\triangle(\\mathcal{A})\n\nA parameterized value function class v_\\lambda : \\mathcal{S} \\to \\mathbb{R}\n\nA number of trajectories M to generate\n\nThe initial parameters \\theta^0, \\lambda^0\n\nFor t = 0, \\dots, T-1:\n\nPolicy improvement: Let \\pi^t_\\text{MCTS} denote the policy obtained by \n\nAlgorithm 8.2 with \\pi_{\\theta^t} and v_{\\lambda^t}. We use \\pi^t_\\text{MCTS} to play against itself M times. This generates M trajectories \\tau_0, \\dots, \\tau_{M-1}.\n\nPolicy evaluation: Use behavioral cloning to find a set of policy parameters \\theta^{t+1} that mimic the behavior of \\pi^t_\\text{MCTS} and a set of value function parameters \\lambda^{t+1} that approximate its value function. That is,\\begin{align*}\n \\theta^{t+1} &\\gets \\argmin_\\theta \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} - \\log \\pi_\\theta(a^m_\\hi \\mid s^m_\\hi) \\\\\n \\lambda^{t+1} &\\gets \\argmin_\\lambda \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} (v_\\lambda(s^m_\\hi) - R(\\tau_m))^2\n \\end{align*}\n\nNote that in implementation,\nthe policy and value are typically both returned by a single deep neural network,\nthat is, with a single set of parameters,\nand the two loss functions are added together.\n\nThis algorithm was brought to fame by AlphaGo Zero \n\nSilver et al. 
(2017).","type":"content","url":"/planning#self-play","position":19},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"Summary"},"type":"lvl2","url":"/planning#summary","position":20},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"Summary"},"content":"In this chapter,\nwe explored tree search-based algorithms for deterministic, zero sum, fully observable two-player games.\nWe began with \n\nmin-max search,\nan algorithm for exactly solving the game value of every possible state.\nHowever, this is impossible to execute in practice,\nand so we must resort to various ways to reduce the number of states and actions that we must explore.\n\n\nAlpha-beta search does this by pruning away states that we already know to be suboptimal,\nand \n\nMonte Carlo Tree Search approximates the value of states instead of evaluating them exactly.","type":"content","url":"/planning#summary","position":21},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"References"},"type":"lvl2","url":"/planning#references","position":22},{"hierarchy":{"lvl1":"8 Tree Search Methods","lvl2":"References"},"content":"Chapter 5 of \n\nRussell & Norvig (2021) provides an excellent overview of search methods in games.\nThe original AlphaGo paper \n\nSilver et al. (2016) was a groundbreaking application of these technologies.\n\n\nSilver et al. (2017) removed the imitation learning phase,\nlearning from scratch.\nAlphaZero \n\nSilver et al. (2018) then extended to other games beyond Go,\nnamely shogi and chess,\nalso learning from scratch.\nIn MuZero \n\nSchrittwieser et al. (2020),\nthis was further extended by learning a model of the game dynamics.","type":"content","url":"/planning#references","position":23},{"hierarchy":{"lvl1":"4 Supervised learning"},"type":"lvl1","url":"/supervised-learning","position":0},{"hierarchy":{"lvl1":"4 Supervised learning"},"content":"","type":"content","url":"/supervised-learning","position":1},{"hierarchy":{"lvl1":"4 Supervised learning","lvl2":"Introduction"},"type":"lvl2","url":"/supervised-learning#introduction","position":2},{"hierarchy":{"lvl1":"4 Supervised learning","lvl2":"Introduction"},"content":"This section will cover the details of implementing the fit function above:\nThat is, how to use a dataset of labelled samples (x_1, y_1), \\dots, (x_N, y_N) to find a function f that minimizes the empirical risk.\nThis requires two ingredients:\n\nA function class \\mathcal{F} to search over\n\nA fitting method for minimizing the empirical risk over this class\n\nThe two main function classes we will cover are linear models and neural networks.\nBoth of these function classes are parameterized by some parameters θ,\nand the fitting method will search over these parameters to minimize the empirical risk:\n\nParameterized empirical risk minimization\n\nGiven a dataset of samples (x_1, y_1), \\dots, (x_N, y_N) and a class of functions \\mathcal{F} parameterized by θ,\nwe to find a parameter (vector) \\hat \\theta that minimizes the empirical risk:\\hat \\theta = \\arg\\min_{\\theta} \\frac{1}{N} \\sum_{i=1}^N (y_i - f_\\theta(x_i))^2\n\nThe most common fitting method for parameterized models is gradient descent.\n\nGradient descent\n\nLetting L(\\theta) \\in \\mathbb{R} denote the empirical risk in terms of the parameters,\nthe gradient descent algorithm updates the parameters according to the rule\\theta^{t+1} = \\theta^t - \\eta \\nabla_\\theta L(\\theta^t)\n\nwhere \\eta > 0 is the learning rate.\n\nfrom jaxtyping import Float, Array\nfrom collections.abc import Callable\n\nParams = 
Float[Array, \" D\"]\n\n\ndef gradient_descent(\n loss: Callable[[Params], float],\n θ_init: Params,\n η: float,\n epochs: int,\n):\n \"\"\"\n Run gradient descent to minimize the given loss function\n (expressed in terms of the parameters).\n \"\"\"\n θ = θ_init\n for _ in range(epochs):\n θ = θ - η * grad(loss)(θ)\n return θ\n\n","type":"content","url":"/supervised-learning#introduction","position":3},{"hierarchy":{"lvl1":"4 Supervised learning","lvl2":"Linear regression"},"type":"lvl2","url":"/supervised-learning#linear-regression","position":4},{"hierarchy":{"lvl1":"4 Supervised learning","lvl2":"Linear regression"},"content":"In linear regression, we assume that the function f is linear in the parameters:\\mathcal{F} = \\{ x \\mapsto \\theta^\\top x \\mid \\theta \\in \\mathbb{R}^D \\}\n\nThis function class is extremely simple and only contains linear functions.\nTo expand its expressivity, we can transform the input x using some feature function ϕ,\ni.e. \\widetilde x = \\phi(x), and then fit a linear model in the transformed space instead.\n\ndef fit_linear(X: Float[Array, \"N D\"], y: Float[Array, \" N\"], φ=lambda x: x):\n \"\"\"Fit a linear model to the given dataset using ordinary least squares.\"\"\"\n X = vmap(φ)(X)\n θ = np.linalg.lstsq(X, y, rcond=None)[0]\n return lambda x: np.dot(φ(x), θ)\n\n","type":"content","url":"/supervised-learning#linear-regression","position":5},{"hierarchy":{"lvl1":"4 Supervised learning","lvl2":"Neural networks"},"type":"lvl2","url":"/supervised-learning#neural-networks","position":6},{"hierarchy":{"lvl1":"4 Supervised learning","lvl2":"Neural networks"},"content":"In neural networks, we assume that the function f is a composition of linear functions (represented by matrices W_i) and non-linear activation functions (denoted by σ):\\mathcal{F} = \\{ x \\mapsto \\sigma(W_L \\sigma(W_{L-1} \\dots \\sigma(W_1 x + b_1) \\dots + b_{L-1}) + b_L) \\}\n\nwhere W_i \\in \\mathbb{R}^{D_{i+1} \\times D_i} and b_i \\in \\mathbb{R}^{D_{i+1}} are the parameters of the i-th layer, and σ is the activation function.\n\nThis function class is much more expressive and contains many more parameters.\nThis makes it more susceptible to overfitting on smaller datasets,\nbut also allows it to represent more complex functions.\nIn practice, however, neural networks exhibit interesting phenomena during training,\nand are often able to generalize well even with many parameters.\n\nAnother reason for their popularity is the efficient backpropagation algorithm for computing the gradient of the empirical risk with respect to the parameters.\nEssentially, the hierarchical structure of the neural network,\ni.e. 
computing the output of the network as a composition of functions,\nallows us to use the chain rule to compute the gradient of the output with respect to the parameters of each layer.\n\nNielsen (2015) provides a comprehensive introduction to neural networks and backpropagation.","type":"content","url":"/supervised-learning#neural-networks","position":7}]} \ No newline at end of file diff --git a/myst.xref.json b/myst.xref.json index 1612e5b..a31e453 100644 --- a/myst.xref.json +++ b/myst.xref.json @@ -1 +1 @@ -{"version":"1","myst":"1.3.7","references":[{"kind":"page","data":"/index.json","url":"/"},{"identifier":"prerequisites","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"reinforcement-learning-in-a-nutshell","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"core-tasks-of-reinforcement-learning","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"course-overview","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"notation","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"programming","kind":"heading","data":"/index.json","url":"/"},{"kind":"page","data":"/mdps.json","url":"/mdps"},{"identifier":"introduction","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"markov","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"finite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"definition","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"finite_horizon_mdp","html_id":"finite-horizon-mdp","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_mdp","html_id":"tidy-mdp","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"policy","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_policy","html_id":"tidy-policy","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"trajectories","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"trajectory","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_traj","html_id":"tidy-traj","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"autoregressive_trajectories","html_id":"autoregressive-trajectories","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"value-functions","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"value","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"action_value","html_id":"action-value","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"relating-the-value-function-and-action-value-function","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"greedy-policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"the-one-step-bellman-consistency-equation","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"bellman_consistency","html_id":"bellman-consistency","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_consistency_action","html_id":"bellman-consistency-action","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_det","html_id":"bellman-det","kin
d":"proof:remark","data":"/mdps.json","url":"/mdps"},{"identifier":"the-one-step-bellman-operator","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"bellman_operator","html_id":"bellman-operator","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"finite_horizon_mdps","html_id":"finite-horizon-mdps-1","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"eval_dp","html_id":"eval-dp","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_eval_finite","html_id":"tidy-eval-finite","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"opt_dynamic_programming","html_id":"opt-dynamic-programming","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"optimal_policy_finite","html_id":"optimal-policy-finite","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"optimal_greedy","html_id":"optimal-greedy","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_consistency_optimal","html_id":"bellman-consistency-optimal","kind":"proof:corollary","data":"/mdps.json","url":"/mdps"},{"identifier":"pi_star_dp","html_id":"pi-star-dp","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"infinite_horizon_mdps","html_id":"infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"discounted-rewards","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"stationary-policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"value-functions-and-bellman-consistency","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"bellman_consistency_infinite","html_id":"bellman-consistency-infinite","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"solving-infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"the-bellman-operator-is-a-contraction-mapping","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"contraction","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"contraction_convergence","html_id":"contraction-convergence","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_convergence","html_id":"bellman-convergence","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_contraction","html_id":"bellman-contraction","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"policy-evaluation-in-infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"matrix-inversion-for-deterministic-policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"tidy_tabular","html_id":"tidy-tabular","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"matrix_inversion_pe","html_id":"matrix-inversion-pe","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_eval_infinite","html_id":"tidy-eval-infinite","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"iterative_pe","html_id":"iterative-pe","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"iterations_vi","html_id":"iterations-vi","kind":"proof:remark","data":"/mdps.json","url":"/mdps"},{"identifier":"optimal-policies-in-infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"optimal_policy_infin
ite","html_id":"optimal-policy-infinite","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_optimality","html_id":"bellman-optimality","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_optimality_operator","html_id":"bellman-optimality-operator","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"value_iteration","html_id":"value-iteration","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"greedy_worsen","html_id":"greedy-worsen","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"policy_iteration","html_id":"policy-iteration","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"pi_iter_analysis","html_id":"pi-iter-analysis","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"pi_iter_proof","html_id":"pi-iter-proof","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"summary","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"kind":"page","data":"/control.json","url":"/control"},{"identifier":"introduction","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"control_examples","html_id":"control-examples","kind":"figure","data":"/control.json","url":"/control"},{"identifier":"robot_hand","html_id":"robot-hand","kind":"figure","data":"/control.json","url":"/control"},{"identifier":"cart_pole","html_id":"cart-pole","kind":"proof:example","data":"/control.json","url":"/control"},{"identifier":"optimal-control","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"optimal_control","html_id":"optimal-control","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"a-first-attempt-discretization","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"lqr_definition","html_id":"lqr-definition","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"value_lqr","html_id":"value-lqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"optimal_lqr","html_id":"optimal-lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"optimal_value_lqr","html_id":"optimal-value-lqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"optimal_value_lqr_quadratic","html_id":"optimal-value-lqr-quadratic","kind":"proof:theorem","data":"/control.json","url":"/control"},{"identifier":"optimal_policy_lqr_linear","html_id":"optimal-policy-lqr-linear","kind":"proof:theorem","data":"/control.json","url":"/control"},{"identifier":"lemma_pi_linear","html_id":"lemma-pi-linear","kind":"proof:lemma","data":"/control.json","url":"/control"},{"identifier":"k_pi","html_id":"k-pi","kind":"equation","data":"/control.json","url":"/control"},{"identifier":"riccati","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"lemma_schur","html_id":"lemma-schur","kind":"proof:lemma","data":"/control.json","url":"/control"},{"identifier":"expected-state-at-time-hi","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"expected_state","html_id":"expected-state","kind":"equation","data":"/control.json","url":"/control"},{"identifier":"extensions","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"time_dep_lqr","html_id":"time-dep-lqr","kind":"heading","data":"/cont
rol.json","url":"/control"},{"identifier":"time_dependent_lqr","html_id":"time-dependent-lqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"riccati_time_dependent","html_id":"riccati-time-dependent","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"more-general-quadratic-cost-functions","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"general_quadratic_cost","html_id":"general-quadratic-cost","kind":"equation","data":"/control.json","url":"/control"},{"identifier":"tracking-a-predefined-trajectory","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"approx_nonlinear","html_id":"approx-nonlinear","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"nonlinear_control","html_id":"nonlinear-control","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"local-linearization","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"finite-differencing","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"local-convexification","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"local_linearization","html_id":"local-linearization","kind":"figure","data":"/control.json","url":"/control"},{"identifier":"iterative_lqr","html_id":"iterative-lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"ilqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"summary","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"kind":"page","data":"/bandits.json","url":"/bandits"},{"identifier":"introduction","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"advertising","kind":"proof:example","data":"/bandits.json","url":"/bandits"},{"identifier":"clinical_trials","html_id":"clinical-trials","kind":"proof:example","data":"/bandits.json","url":"/bandits"},{"identifier":"multi-armed","kind":"proof:remark","data":"/bandits.json","url":"/bandits"},{"identifier":"regret","kind":"proof:definition","data":"/bandits.json","url":"/bandits"},{"identifier":"pure-exploration-random-guessing","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"pure_exploration","html_id":"pure-exploration","kind":"block:notebook-code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_exploration-code","html_id":"pure-exploration-code","kind":"code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_exploration-output","html_id":"pure-exploration-output","kind":"output","data":"/bandits.json","url":"/bandits"},{"identifier":"pure-greedy","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"pure_greedy","html_id":"pure-greedy","kind":"block:notebook-code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_greedy-code","html_id":"pure-greedy-code","kind":"code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_greedy-output","html_id":"pure-greedy-output","kind":"output","data":"/bandits.json","url":"/bandits"},{"identifier":"etc","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"etc-regret-analysis","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"exploration-phase","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"exploitation-phase","kind":"hea
ding","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"hoeffding","kind":"proof:theorem","data":"/bandits.json","url":"/bandits"},{"identifier":"hoeffding-etc","kind":"equation","data":"/bandits.json","url":"/bandits"},{"identifier":"union_bound","html_id":"union-bound","kind":"proof:theorem","data":"/bandits.json","url":"/bandits"},{"identifier":"epsilon-greedy","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"ucb","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"ucb-regret-analysis","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"lower-bound-on-regret-intuition","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"thompson_sampling","html_id":"thompson-sampling","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"bayesian_bernoulli","html_id":"bayesian-bernoulli","kind":"proof:example","data":"/bandits.json","url":"/bandits"},{"identifier":"contextual-bandits","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"contextual_bandit","html_id":"contextual-bandit","kind":"proof:definition","data":"/bandits.json","url":"/bandits"},{"identifier":"lin_ucb","html_id":"lin-ucb","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"ols_bandit","html_id":"ols-bandit","kind":"equation","data":"/bandits.json","url":"/bandits"},{"identifier":"chebyshev","kind":"proof:theorem","data":"/bandits.json","url":"/bandits"},{"identifier":"summary","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"kind":"page","data":"/supervised-learning.json","url":"/supervised-learning"},{"identifier":"introduction","kind":"heading","data":"/supervised-learning.json","url":"/supervised-learning","implicit":true},{"identifier":"parameterized_empirical_risk_minimization","html_id":"parameterized-empirical-risk-minimization","kind":"proof:definition","data":"/supervised-learning.json","url":"/supervised-learning"},{"identifier":"gd_def","html_id":"gd-def","kind":"proof:definition","data":"/supervised-learning.json","url":"/supervised-learning"},{"identifier":"linear-regression","kind":"heading","data":"/supervised-learning.json","url":"/supervised-learning","implicit":true},{"identifier":"neural-networks","kind":"heading","data":"/supervised-learning.json","url":"/supervised-learning","implicit":true},{"kind":"page","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"introduction","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"identifier":"erm","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"conditional_expectation_minimizes_mse","html_id":"conditional-expectation-minimizes-mse","kind":"proof:theorem","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"empirical_risk_minimization","html_id":"empirical-risk-minimization","kind":"proof:definition","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"fitted-value-iteration","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"identifier":"fitted_q_iteration","html_id":"fitted-q-iteration","kind":"proof:definition","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"fitted_evaluation","html_id":"fitted-evaluation","kind":"proof:definition","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"summary","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"kind":"page","dat
a":"/pg.json","url":"/pg"},{"identifier":"introduction","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"gradient-ascent","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"stochastic-gradient-ascent","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"policy-stochastic-gradient-ascent","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"objective_fn","html_id":"objective-fn","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"parameterizations","kind":"heading","data":"/pg.json","url":"/pg"},{"identifier":"tabular-representation","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"linear-in-features","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"neural-policies","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"continuous-action-spaces","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"importance_sampling","html_id":"importance-sampling","kind":"heading","data":"/pg.json","url":"/pg"},{"identifier":"the-reinforce-policy-gradient","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"trajectory_likelihood","html_id":"trajectory-likelihood","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"reinforce_pg","html_id":"reinforce-pg","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"pg_with_q","html_id":"pg-with-q","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"baselines-and-advantages","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"eq:pg_baseline","html_id":"eq-pg-baseline","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"pg_advantage","html_id":"pg-advantage","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"pg_baseline","html_id":"pg-baseline","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"comparing-policy-gradient-algorithms-to-policy-iteration","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"pdl","kind":"proof:theorem","data":"/pg.json","url":"/pg"},{"identifier":"pdl_eq","html_id":"pdl-eq","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"trust-region-policy-optimization","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"kld","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"trpo","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"natural-policy-gradient","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"npg_optimization","html_id":"npg-optimization","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"fisher_matrix","html_id":"fisher-matrix","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"fisher_trajectory","html_id":"fisher-trajectory","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"npg","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"natural_simple","html_id":"natural-simple","kind":"proof:example","data":"/pg.json","url":"/pg"},{"identifier":"proximal-policy-optimization","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"summary","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"kind":"page","data":"/imitation-learning.json","url":"/imitation-learning"},{"identifier":"introduction","kind":"heading","data":"/imitation-learning.json","url
":"/imitation-learning","implicit":true},{"identifier":"behavioral-cloning","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"behavioral_cloning","html_id":"behavioral-cloning","kind":"proof:definition","data":"/imitation-learning.json","url":"/imitation-learning"},{"identifier":"distribution-shift","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"dataset-aggregation-dagger","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"kind":"page","data":"/planning.json","url":"/planning"},{"identifier":"introduction","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"deterministic-zero-sum-fully-observable-two-player-games","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"notation","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"min-max-search","kind":"heading","data":"/planning.json","url":"/planning"},{"identifier":"min-max-value","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"min-max-example","kind":"proof:example","data":"/planning.json","url":"/planning"},{"identifier":"complexity-of-min-max-search","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"alpha-beta-search","kind":"heading","data":"/planning.json","url":"/planning"},{"identifier":"alpha-beta-example","kind":"proof:example","data":"/planning.json","url":"/planning"},{"identifier":"monte-carlo-tree-search","kind":"heading","data":"/planning.json","url":"/planning"},{"identifier":"mcts-algorithm","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"ucb-tree","kind":"equation","data":"/planning.json","url":"/planning"},{"identifier":"incorporating-value-functions-and-policies","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"mcts-policy-value","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"ucb-tree-policy","kind":"equation","data":"/planning.json","url":"/planning"},{"identifier":"self-play","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"mcts-self-play","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"summary","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"references","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"kind":"page","data":"/exploration.json","url":"/exploration"},{"identifier":"introduction","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"per_episode_regret","html_id":"per-episode-regret","kind":"proof:definition","data":"/exploration.json","url":"/exploration"},{"identifier":"sparse-reward","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"sparse_reward_mdp","html_id":"sparse-reward-mdp","kind":"proof:example","data":"/exploration.json","url":"/exploration"},{"identifier":"exploration-in-deterministic-mdps","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"explore_then_exploit","html_id":"explore-then-exploit","kind":"proof:definition","data":"/exploration.json","url":"/exploration"},{"identifier":"explore_then_exploit_performance","html_id":"explore-then-exploit-p
erformance","kind":"proof:theorem","data":"/exploration.json","url":"/exploration"},{"identifier":"mdp_mab","html_id":"mdp-mab","kind":"heading","data":"/exploration.json","url":"/exploration"},{"identifier":"mdp_as_mab","html_id":"mdp-as-mab","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"ineffective_mdp","html_id":"ineffective-mdp","kind":"proof:example","data":"/exploration.json","url":"/exploration"},{"identifier":"ucb-vi","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"modelling-the-transitions","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"reward-bonus","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"eq:ucb_vi_bonus","html_id":"eq-ucb-vi-bonus","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"ucb_vi_bonus","html_id":"ucb-vi-bonus","kind":"proof:remark","data":"/exploration.json","url":"/exploration"},{"identifier":"err","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"definition","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"ucb-vi-alg","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"performance-of-ucb-vi","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"ucb_vi_regret","html_id":"ucb-vi-regret","kind":"proof:theorem","data":"/exploration.json","url":"/exploration"},{"identifier":"linear-mdps","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"linear_mdp","html_id":"linear-mdp","kind":"proof:definition","data":"/exploration.json","url":"/exploration"},{"identifier":"planning-in-a-linear-mdp","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"lin_ucb_vi","html_id":"lin-ucb-vi","kind":"heading","data":"/exploration.json","url":"/exploration"},{"identifier":"performance","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"lin_ucb_vi_regret","html_id":"lin-ucb-vi-regret","kind":"proof:theorem","data":"/exploration.json","url":"/exploration"},{"identifier":"summary","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"kind":"page","data":"/background.json","url":"/background"},{"identifier":"o-notation","kind":"heading","data":"/background.json","url":"/background","implicit":true},{"identifier":"python","kind":"heading","data":"/background.json","url":"/background","implicit":true}]} \ No newline at end of file 
+{"version":"1","myst":"1.3.17","references":[{"kind":"page","data":"/index.json","url":"/"},{"identifier":"prerequisites","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"reinforcement-learning-in-a-nutshell","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"core-tasks-of-reinforcement-learning","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"course-overview","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"notation","kind":"heading","data":"/index.json","url":"/","implicit":true},{"identifier":"programming","kind":"heading","data":"/index.json","url":"/"},{"kind":"page","data":"/mdps.json","url":"/mdps"},{"identifier":"introduction","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"markov","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"finite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"definition","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"finite_horizon_mdp","html_id":"finite-horizon-mdp","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_mdp","html_id":"tidy-mdp","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"policy","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_policy","html_id":"tidy-policy","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"trajectories","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"trajectory","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_traj","html_id":"tidy-traj","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"autoregressive_trajectories","html_id":"autoregressive-trajectories","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"value-functions","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"value","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"action_value","html_id":"action-value","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"relating-the-value-function-and-action-value-function","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"greedy-policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"the-one-step-bellman-consistency-equation","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"bellman_consistency","html_id":"bellman-consistency","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_consistency_action","html_id":"bellman-consistency-action","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_det","html_id":"bellman-det","kind":"proof:remark","data":"/mdps.json","url":"/mdps"},{"identifier":"the-one-step-bellman-operator","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"bellman_operator","html_id":"bellman-operator","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"finite_horizon_mdps","html_id":"finite-horizon-mdps-1","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"eval_dp","html_id":"eval-dp","kind":"heading","data":"/mdps.json","url":"/mdps"},{"ident
ifier":"tidy_eval_finite","html_id":"tidy-eval-finite","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"opt_dynamic_programming","html_id":"opt-dynamic-programming","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"optimal_policy_finite","html_id":"optimal-policy-finite","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"optimal_greedy","html_id":"optimal-greedy","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_consistency_optimal","html_id":"bellman-consistency-optimal","kind":"proof:corollary","data":"/mdps.json","url":"/mdps"},{"identifier":"pi_star_dp","html_id":"pi-star-dp","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"infinite_horizon_mdps","html_id":"infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"discounted-rewards","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"stationary-policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"value-functions-and-bellman-consistency","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"bellman_consistency_infinite","html_id":"bellman-consistency-infinite","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"solving-infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"the-bellman-operator-is-a-contraction-mapping","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"contraction","kind":"proof:definition","data":"/mdps.json","url":"/mdps"},{"identifier":"contraction_convergence","html_id":"contraction-convergence","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_convergence","html_id":"bellman-convergence","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_contraction","html_id":"bellman-contraction","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"policy-evaluation-in-infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"matrix-inversion-for-deterministic-policies","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"tidy_tabular","html_id":"tidy-tabular","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"matrix_inversion_pe","html_id":"matrix-inversion-pe","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"tidy_eval_infinite","html_id":"tidy-eval-infinite","kind":"proof:example","data":"/mdps.json","url":"/mdps"},{"identifier":"iterative_pe","html_id":"iterative-pe","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"iterations_vi","html_id":"iterations-vi","kind":"proof:remark","data":"/mdps.json","url":"/mdps"},{"identifier":"optimal-policies-in-infinite-horizon-mdps","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"identifier":"optimal_policy_infinite","html_id":"optimal-policy-infinite","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_optimality","html_id":"bellman-optimality","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"bellman_optimality_operator","html_id":"bellman-optimality-operator","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"value_iteration","html_id":"value-iteration","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"greedy_worsen","html_id":"greedy-worse
n","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"policy_iteration","html_id":"policy-iteration","kind":"heading","data":"/mdps.json","url":"/mdps"},{"identifier":"pi_iter_analysis","html_id":"pi-iter-analysis","kind":"proof:theorem","data":"/mdps.json","url":"/mdps"},{"identifier":"pi_iter_proof","html_id":"pi-iter-proof","kind":"equation","data":"/mdps.json","url":"/mdps"},{"identifier":"summary","kind":"heading","data":"/mdps.json","url":"/mdps","implicit":true},{"kind":"page","data":"/control.json","url":"/control"},{"identifier":"introduction","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"control_examples","html_id":"control-examples","kind":"figure","data":"/control.json","url":"/control"},{"identifier":"robot_hand","html_id":"robot-hand","kind":"figure","data":"/control.json","url":"/control"},{"identifier":"cart_pole","html_id":"cart-pole","kind":"proof:example","data":"/control.json","url":"/control"},{"identifier":"optimal-control","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"optimal_control","html_id":"optimal-control","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"a-first-attempt-discretization","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"lqr_definition","html_id":"lqr-definition","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"value_lqr","html_id":"value-lqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"optimal_lqr","html_id":"optimal-lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"optimal_value_lqr","html_id":"optimal-value-lqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"optimal_value_lqr_quadratic","html_id":"optimal-value-lqr-quadratic","kind":"proof:theorem","data":"/control.json","url":"/control"},{"identifier":"optimal_policy_lqr_linear","html_id":"optimal-policy-lqr-linear","kind":"proof:theorem","data":"/control.json","url":"/control"},{"identifier":"lemma_pi_linear","html_id":"lemma-pi-linear","kind":"proof:lemma","data":"/control.json","url":"/control"},{"identifier":"k_pi","html_id":"k-pi","kind":"equation","data":"/control.json","url":"/control"},{"identifier":"riccati","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"lemma_schur","html_id":"lemma-schur","kind":"proof:lemma","data":"/control.json","url":"/control"},{"identifier":"expected-state-at-time-hi","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"expected_state","html_id":"expected-state","kind":"equation","data":"/control.json","url":"/control"},{"identifier":"extensions","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"time_dep_lqr","html_id":"time-dep-lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"time_dependent_lqr","html_id":"time-dependent-lqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"riccati_time_dependent","html_id":"riccati-time-dependent","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"more-general-quadratic-cost-functions","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"general_quadratic_cost","html_id":"general-quadratic-cost","kind":"equ
ation","data":"/control.json","url":"/control"},{"identifier":"tracking-a-predefined-trajectory","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"approx_nonlinear","html_id":"approx-nonlinear","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"nonlinear_control","html_id":"nonlinear-control","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"local-linearization","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"finite-differencing","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"local-convexification","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"identifier":"local_linearization","html_id":"local-linearization","kind":"figure","data":"/control.json","url":"/control"},{"identifier":"iterative_lqr","html_id":"iterative-lqr","kind":"heading","data":"/control.json","url":"/control"},{"identifier":"ilqr","kind":"proof:definition","data":"/control.json","url":"/control"},{"identifier":"summary","kind":"heading","data":"/control.json","url":"/control","implicit":true},{"kind":"page","data":"/bandits.json","url":"/bandits"},{"identifier":"introduction","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"advertising","kind":"proof:example","data":"/bandits.json","url":"/bandits"},{"identifier":"clinical_trials","html_id":"clinical-trials","kind":"proof:example","data":"/bandits.json","url":"/bandits"},{"identifier":"multi-armed","kind":"proof:remark","data":"/bandits.json","url":"/bandits"},{"identifier":"regret","kind":"proof:definition","data":"/bandits.json","url":"/bandits"},{"identifier":"pure-exploration-random-guessing","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"pure_exploration","html_id":"pure-exploration","kind":"block:notebook-code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_exploration-code","html_id":"pure-exploration-code","kind":"code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_exploration-output","html_id":"pure-exploration-output","kind":"output","data":"/bandits.json","url":"/bandits"},{"identifier":"pure-greedy","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"pure_greedy","html_id":"pure-greedy","kind":"block:notebook-code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_greedy-code","html_id":"pure-greedy-code","kind":"code","data":"/bandits.json","url":"/bandits"},{"identifier":"pure_greedy-output","html_id":"pure-greedy-output","kind":"output","data":"/bandits.json","url":"/bandits"},{"identifier":"etc","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"etc-regret-analysis","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"exploration-phase","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"exploitation-phase","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"hoeffding","kind":"proof:theorem","data":"/bandits.json","url":"/bandits"},{"identifier":"hoeffding-etc","kind":"equation","data":"/bandits.json","url":"/bandits"},{"identifier":"union_bound","html_id":"union-bound","kind":"proof:theorem","data":"/bandits.json","url":"/bandits"},{"identifier":"epsilon-greedy","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"ucb","kind":"heading","data":"/bandit
s.json","url":"/bandits"},{"identifier":"ucb-regret-analysis","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"lower-bound-on-regret-intuition","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"thompson_sampling","html_id":"thompson-sampling","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"bayesian_bernoulli","html_id":"bayesian-bernoulli","kind":"proof:example","data":"/bandits.json","url":"/bandits"},{"identifier":"contextual-bandits","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"identifier":"contextual_bandit","html_id":"contextual-bandit","kind":"proof:definition","data":"/bandits.json","url":"/bandits"},{"identifier":"lin_ucb","html_id":"lin-ucb","kind":"heading","data":"/bandits.json","url":"/bandits"},{"identifier":"ols_bandit","html_id":"ols-bandit","kind":"equation","data":"/bandits.json","url":"/bandits"},{"identifier":"chebyshev","kind":"proof:theorem","data":"/bandits.json","url":"/bandits"},{"identifier":"summary","kind":"heading","data":"/bandits.json","url":"/bandits","implicit":true},{"kind":"page","data":"/supervised-learning.json","url":"/supervised-learning"},{"identifier":"introduction","kind":"heading","data":"/supervised-learning.json","url":"/supervised-learning","implicit":true},{"identifier":"parameterized_empirical_risk_minimization","html_id":"parameterized-empirical-risk-minimization","kind":"proof:definition","data":"/supervised-learning.json","url":"/supervised-learning"},{"identifier":"gd_def","html_id":"gd-def","kind":"proof:definition","data":"/supervised-learning.json","url":"/supervised-learning"},{"identifier":"linear-regression","kind":"heading","data":"/supervised-learning.json","url":"/supervised-learning","implicit":true},{"identifier":"neural-networks","kind":"heading","data":"/supervised-learning.json","url":"/supervised-learning","implicit":true},{"kind":"page","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"introduction","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"identifier":"erm","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"conditional_expectation_minimizes_mse","html_id":"conditional-expectation-minimizes-mse","kind":"proof:theorem","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"empirical_risk_minimization","html_id":"empirical-risk-minimization","kind":"proof:definition","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"fitted-value-iteration","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"identifier":"fitted_q_iteration","html_id":"fitted-q-iteration","kind":"proof:definition","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"fitted-pi-eval","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"fitted_evaluation","html_id":"fitted-evaluation","kind":"proof:definition","data":"/fitted-dp.json","url":"/fitted-dp"},{"identifier":"fitted-policy-iteration","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"identifier":"summary","kind":"heading","data":"/fitted-dp.json","url":"/fitted-dp","implicit":true},{"kind":"page","data":"/pg.json","url":"/pg"},{"identifier":"introduction","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"gradient-ascent","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"stochastic-gradient-ascent","kind":"heading","data":"/pg.json","url":"/pg
","implicit":true},{"identifier":"policy-stochastic-gradient-ascent","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"objective_fn","html_id":"objective-fn","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"parameterizations","kind":"heading","data":"/pg.json","url":"/pg"},{"identifier":"importance_sampling","html_id":"importance-sampling","kind":"heading","data":"/pg.json","url":"/pg"},{"identifier":"the-reinforce-policy-gradient","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"reinforce_pg","html_id":"reinforce-pg","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"intuitive-remark","kind":"admonition:note","data":"/pg.json","url":"/pg"},{"identifier":"baselines-and-advantages","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"pg_with_q","html_id":"pg-with-q","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"eq:pg_baseline","html_id":"eq-pg-baseline","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"advantage","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"pg_advantage","html_id":"pg-advantage","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"comparing-policy-gradient-algorithms-to-policy-iteration","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"pdl","kind":"proof:theorem","data":"/pg.json","url":"/pg"},{"identifier":"pdl_eq","html_id":"pdl-eq","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"trust-region-policy-optimization","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"kld","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"trpo","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"natural-policy-gradient","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"npg_optimization","html_id":"npg-optimization","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"fisher_matrix","html_id":"fisher-matrix","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"fisher_trajectory","html_id":"fisher-trajectory","kind":"equation","data":"/pg.json","url":"/pg"},{"identifier":"npg","kind":"proof:definition","data":"/pg.json","url":"/pg"},{"identifier":"natural_simple","html_id":"natural-simple","kind":"proof:example","data":"/pg.json","url":"/pg"},{"identifier":"proximal-policy-optimization","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"identifier":"summary","kind":"heading","data":"/pg.json","url":"/pg","implicit":true},{"kind":"page","data":"/imitation-learning.json","url":"/imitation-learning"},{"identifier":"introduction","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"behavioral-cloning","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"behavioral_cloning","html_id":"behavioral-cloning","kind":"proof:definition","data":"/imitation-learning.json","url":"/imitation-learning"},{"identifier":"performance-of-behavioral-cloning","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"eq:pdl-rhs","html_id":"eq-pdl-rhs","kind":"equation","data":"/imitation-learning.json","url":"/imitation-learning"},{"identifier":"distribution-shift","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"dataset-aggrega
tion-dagger","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"identifier":"summary","kind":"heading","data":"/imitation-learning.json","url":"/imitation-learning","implicit":true},{"kind":"page","data":"/planning.json","url":"/planning"},{"identifier":"introduction","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"deterministic-zero-sum-fully-observable-two-player-games","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"notation","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"tic-tac-toe","kind":"proof:example","data":"/planning.json","url":"/planning"},{"identifier":"min-max-search","kind":"heading","data":"/planning.json","url":"/planning"},{"identifier":"min-max-value","kind":"proof:definition","data":"/planning.json","url":"/planning"},{"identifier":"min-max-example","kind":"proof:example","data":"/planning.json","url":"/planning"},{"identifier":"complexity-of-min-max-search","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"alpha-beta-search","kind":"heading","data":"/planning.json","url":"/planning"},{"identifier":"alpha-beta-example","kind":"proof:example","data":"/planning.json","url":"/planning"},{"identifier":"monte-carlo-tree-search","kind":"heading","data":"/planning.json","url":"/planning"},{"identifier":"mcts-algorithm","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"ucb-tree","kind":"equation","data":"/planning.json","url":"/planning"},{"identifier":"incorporating-value-functions-and-policies","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"mcts-policy-value","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"ucb-tree-policy","kind":"equation","data":"/planning.json","url":"/planning"},{"identifier":"self-play","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"mcts-self-play","kind":"proof:algorithm","data":"/planning.json","url":"/planning"},{"identifier":"summary","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"identifier":"references","kind":"heading","data":"/planning.json","url":"/planning","implicit":true},{"kind":"page","data":"/exploration.json","url":"/exploration"},{"identifier":"introduction","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"per_episode_regret","html_id":"per-episode-regret","kind":"proof:definition","data":"/exploration.json","url":"/exploration"},{"identifier":"sparse-reward","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"sparse_reward_mdp","html_id":"sparse-reward-mdp","kind":"proof:example","data":"/exploration.json","url":"/exploration"},{"identifier":"exploration-in-deterministic-mdps","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"explore_then_exploit","html_id":"explore-then-exploit","kind":"proof:definition","data":"/exploration.json","url":"/exploration"},{"identifier":"explore_then_exploit_performance","html_id":"explore-then-exploit-performance","kind":"proof:theorem","data":"/exploration.json","url":"/exploration"},{"identifier":"mdp_mab","html_id":"mdp-mab","kind":"heading","data":"/exploration.json","url":"/exploration"},{"identifier":"mdp_as_mab","html_id":"mdp-as-mab","kind":"equation","data":"/ex
ploration.json","url":"/exploration"},{"identifier":"ineffective_mdp","html_id":"ineffective-mdp","kind":"proof:example","data":"/exploration.json","url":"/exploration"},{"identifier":"ucb-vi","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"modelling-the-transitions","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"reward-bonus","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"eq:ucb_vi_bonus","html_id":"eq-ucb-vi-bonus","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"ucb_vi_bonus","html_id":"ucb-vi-bonus","kind":"proof:remark","data":"/exploration.json","url":"/exploration"},{"identifier":"err","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"definition","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"ucb-vi-alg","kind":"equation","data":"/exploration.json","url":"/exploration"},{"identifier":"performance-of-ucb-vi","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"ucb_vi_regret","html_id":"ucb-vi-regret","kind":"proof:theorem","data":"/exploration.json","url":"/exploration"},{"identifier":"linear-mdps","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"linear_mdp","html_id":"linear-mdp","kind":"proof:definition","data":"/exploration.json","url":"/exploration"},{"identifier":"planning-in-a-linear-mdp","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"lin_ucb_vi","html_id":"lin-ucb-vi","kind":"heading","data":"/exploration.json","url":"/exploration"},{"identifier":"performance","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"identifier":"lin_ucb_vi_regret","html_id":"lin-ucb-vi-regret","kind":"proof:theorem","data":"/exploration.json","url":"/exploration"},{"identifier":"summary","kind":"heading","data":"/exploration.json","url":"/exploration","implicit":true},{"kind":"page","data":"/background.json","url":"/background"},{"identifier":"o-notation","kind":"heading","data":"/background.json","url":"/background","implicit":true},{"identifier":"python","kind":"heading","data":"/background.json","url":"/background","implicit":true}]} \ No newline at end of file diff --git a/objects.inv b/objects.inv index b813ea1..ebf8e4d 100644 Binary files a/objects.inv and b/objects.inv differ diff --git a/pg.html b/pg.html index 928e4d7..1c6d7ea 100644 --- a/pg.html +++ b/pg.html @@ -1,4 +1,4 @@ -6 Policy Gradient Methods - CS/STAT 184: Introduction to Reinforcement Learning


6 Policy Gradient Methods

1Introduction

The core task of RL is finding the optimal policy in a given environment. This is essentially an optimization problem: out of some space of policies, we want to find the one that achieves the maximum total reward (in expectation).

It’s typically intractable to compute the optimal policy exactly in some finite number of steps. Instead, policy optimization algorithms start from some randomly initialized policy, and then improve it step by step. We’ve already seen some examples of these, namely Section 1.5.3.2 for finite MDPs and Section 2.6.4 in continuous control.

In particular, we often use policies that can be described by some finite set of parameters. We will see some examples in Section 3.1. For such parameterized policies, we can approximate the policy gradient: the gradient of the expected total reward with respect to the parameters. This tells us the direction the parameters should be updated to achieve a higher expected total reward. Policy gradient methods are responsible for groundbreaking applications including AlphaGo, OpenAI Five, and large language models, many of which use policies parameterized as deep neural networks.

  1. We begin the chapter with a short review of gradient ascent, a general optimization method.
  2. We’ll then see how to estimate the policy gradient, enabling us to apply (stochastic) gradient ascent in the RL setting.
  3. Then we’ll explore some proximal optimization techniques that ensure the steps taken are “not too large”. This helps stabilize training and is widely used in practice.

from utils import plt, Array, Callable, jax, jnp, latexify

2Gradient Ascent

Gradient ascent is a general optimization algorithm for any differentiable function. A suitable analogy for this algorithm is hiking up a mountain, where you keep taking steps in the steepest direction upwards. Here, your vertical position yy is the function being optimized, and your horizontal position (x,z)(x, z) is the input to the function. The slope of the mountain at your current position is given by the gradient, written y(x,z)R2\nabla y(x, z) \in \mathbb{R}^2.

<Figure size 600x600 with 2 Axes> (a plot of Himmelblau’s function, with the gradient at a sample point drawn as an arrow)

For differentiable functions, this can be thought of as the vector of partial derivatives,

\nabla y(x, z) = \begin{pmatrix} \frac{\partial y}{\partial x} \\ \frac{\partial y}{\partial z} \end{pmatrix}.

To calculate the slope (aka “directional derivative”) of the mountain in a given direction (Δx,Δz)(\Delta x, \Delta z), you take the dot product of the difference vector with the gradient. This means that the direction with the highest slope is exactly the gradient itself, so we can describe the gradient ascent algorithm as follows:

\begin{pmatrix} x^{k+1} \\ z^{k+1} \end{pmatrix} = \begin{pmatrix} x^{k} \\ z^{k} \end{pmatrix} + \eta \nabla y(x^{k}, z^{k})


where kk denotes the iteration of the algorithm and η>0\eta > 0 is a “step size” hyperparameter that controls the size of the steps we take. (Note that we could also vary the step size across iterations, that is, η0,,ηK\eta^0, \dots, \eta^K.)

The case of a two-dimensional input is easy to visualize. But this idea can be straightforwardly extended to higher-dimensional inputs.

From now on, we’ll use JJ to denote the function we’re trying to maximize, and θ to denote the parameters being optimized over. (In the above example, θ=(xz)\theta = \begin{pmatrix} x & z \end{pmatrix}^\top).
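As a minimal illustration (the concave quadratic objective, step size, and number of steps are arbitrary choices for this sketch, not from the text), one run of gradient ascent in JAX might look like:

import jax
import jax.numpy as jnp

def J(θ):
    # An illustrative concave objective with maximizer at (1, -2).
    return -jnp.sum((θ - jnp.array([1.0, -2.0])) ** 2)

η = 0.1
θ = jnp.zeros(2)
for _ in range(100):
    θ = θ + η * jax.grad(J)(θ)  # θ^{k+1} = θ^k + η ∇J(θ^k)
# θ is now close to the maximizer [1, -2].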

Notice that our parameters will stop changing once J(θ)=0.\nabla J(\theta) = 0. @@ -98,8 +101,8 @@ the computer applies a list of rules to transform the symbols involved. Python’s sympy package supports symbolic differentiation. However, functions implemented in code may not always have a straightforward symbolic representation.
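For instance, a minimal sketch of symbolic differentiation with sympy (the polynomial here is just an example expression):

import sympy as sp

x, z = sp.symbols("x z")
y = (x**2 + z - 11)**2 + (x + z**2 - 7)**2  # a symbolic expression

# sympy applies differentiation rules directly to the symbols.
dy_dx = sp.diff(y, x)  # symbolic expression for ∂y/∂x
dy_dz = sp.diff(y, z)  # symbolic expression for ∂y/∂z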

Another way is numerical differentiation, which is based on the limit definition of a (directional) derivative:

\nabla_{\boldsymbol{u}} J(\boldsymbol{x}) = \lim_{\varepsilon \to 0} \frac{J(\boldsymbol{x} + \varepsilon \boldsymbol{u}) - J(\boldsymbol{x})}{\varepsilon}

Then, we can substitute a small value of ε\varepsilon on the r.h.s. to approximate the directional derivative. How small, though? If we need an accurate estimate, we may need such a small value of ε\varepsilon that typical computers will run into rounding errors. Also, to compute the full gradient, @@ -111,14 +114,15 @@ we execute them on the values when the function gets called, like in numerical differentiation. This allows us to differentiate through programming constructs such as branches or loops, -and doesn’t involve any arbitrarily small values.

Baydin et al. (2018) provides an accessible survey of automatic differentiation.
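To make the contrast concrete, here is a small sketch (the objective, direction, and ε are arbitrary choices for illustration) comparing a finite-difference estimate of a directional derivative with the exact value from automatic differentiation:

import jax
import jax.numpy as jnp

def J(θ):
    return -jnp.sum((θ - 1.0) ** 2)  # an arbitrary differentiable objective

def directional_derivative_fd(J, θ, u, ε=1e-4):
    # Finite-difference approximation along the direction u.
    return (J(θ + ε * u) - J(θ)) / ε

θ = jnp.array([0.0, 2.0])
u = jnp.array([1.0, 0.0])
print(directional_derivative_fd(J, θ, u))  # ≈ 2.0, up to rounding error
print(jax.grad(J)(θ) @ u)                  # exact directional derivative: 2.0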

2.1Stochastic gradient ascent

In real applications, computing the gradient of the target function is not so simple. As an example from supervised learning, J(θ)J(\theta) might be the sum of squared prediction errors across an entire training dataset. However, if our dataset is very large, it might not fit into our computer’s memory! In these cases, we often compute some estimate of the gradient at each step, ~J(θ)\tilde \nabla J(\theta), and walk in that direction instead. This is called stochastic gradient ascent. -In the SL example above, we might randomly choose a minibatch of samples and use them to estimate the true prediction error. (This approach is known as minibatch SGD.)


def sgd(
    theta_init: Array,
     estimate_gradient: Callable[[Array], Array],
     η: float,
     n_steps: int,
@@ -127,11 +131,11 @@
 
     `estimate_gradient` eats the current parameters and returns an estimate of the objective function's gradient at those parameters.
     """
    θ = theta_init
+    θ = theta_init
     for step in range(n_steps):
         θ += η * estimate_gradient(θ)
    return θ


What makes one gradient estimator better than another? +Ideally, we want this estimator to be unbiased; that is, on average, it matches a single true gradient step:

E[~J(θ)]=J(θ).\E [\tilde \nabla J(\theta)] = \nabla J(\theta).

We also want the variance of the estimator to be low so that its performance doesn’t change drastically at each step.

We can actually show that, for many “nice” functions, in a finite number of steps, SGD will find a θ that is “close” to a stationary point. In another perspective, for such functions, the local “landscape” of JJ around θ becomes flatter and flatter the longer we run SGD.

When the function JJ is β-smooth and the norm of the gradient estimator has a bounded second moment σ2,\sigma^2, one can show a bound on the gradient norm after KK steps of SGD of the form

J(θK)2O(Mβσ2/K).\|\nabla J(\theta^K)\|^2 \le O \left( M \beta \sigma^2 / K\right).

We call a function β-smooth if its gradient is Lipschitz continuous with constant β:

J(θ)J(θ)βθθ.\|\nabla J(\theta) - \nabla J(\theta')\| \le \beta \|\theta - \theta'\|.

We’ll now see a concrete application of gradient ascent in the context of policy optimization.

3Policy (stochastic) gradient ascent

Remember that in RL, the primary goal is to find the optimal policy that achieves the maximimum total reward, which we can express using the value function we defined in Definition 1.6:

J(π):=Es0μ0Vπ(s0)=Eτρπh=0H1r(sh,ah)\begin{aligned} + J(\pi) := \E_{s_0 \sim \mu_0} V^{\pi} (s_0) = & \E_{\tau \sim \rho^\pi} \sum_{\hi=0}^{\hor-1} r(s_\hi, a_\hi) +\end{aligned}

where ρπ\rho^\pi is the distribution over trajectories induced by π (see Definition 1.5).

(Note that we’ll continue to work in the undiscounted, finite-horizon case. Analogous results hold for the discounted, infinite-horizon setup.)

As shown by the notation, this is exactly the function JJ that we want to maximize using gradient ascent. +What variables are we optimizing over in this problem? +Well, the objective function JJ is a function of the policy π, +but in general, π is a function, +and optimizing over the entire space of arbitrary input-output mappings would be intractable. +Instead, we need to describe π in terms of some finite set of parameters θ.

3.1Example policy parameterizations

What are some ways we could parameterize our policy?
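One simple option when the state and action spaces are finite is a tabular softmax policy: learn a preference value θs,a\theta_{s,a} for each state-action pair and normalize over actions. A minimal sketch (array shapes and names are illustrative only):

import jax
import jax.numpy as jnp

def tabular_softmax_policy(θ: jnp.ndarray, s: int) -> jnp.ndarray:
    """θ has shape (num_states, num_actions); returns the distribution π_θ(· | s)."""
    return jax.nn.softmax(θ[s])

θ = jnp.zeros((3, 2))                # 3 states, 2 actions, zero preferences
print(tabular_softmax_policy(θ, 0))  # uniform: [0.5, 0.5]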

Now that we have seen some examples of parameterized policies, +we will write the total reward in terms of the parameters, +overloading notation and letting ρθ:=ρπθ\rho_\theta := \rho^{\pi_\theta}:

J(θ)=EτρθR(τ)J(\theta) = \E_{\tau \sim \rho_\theta} R(\tau)

where R(τ)=h=0H1r(sh,ah)R(\tau) = \sum_{\hi=0}^{\hor-1} r(s_\hi, a_\hi) denotes the total reward in the trajectory.

Now how do we maximize this function (the expected total reward) over the parameters? +One simple idea would be to directly apply gradient ascent:

θk+1=θk+ηJ(θk).\theta^{k+1} = \theta^k + \eta \nabla J(\theta^k).

In order to apply this technique, we need to be able to evaluate the gradient J(θ).\nabla J(\theta). But J(θ)J(\theta) is very difficult, or even intractable, to compute exactly, since it involves taking an expectation over all possible trajectories τ.\tau. -Can we rewrite it in a form that’s more convenient to implement?


3.2Importance Sampling

There is a general trick called importance sampling for evaluating difficult expectations. Suppose we want to estimate Exp[f(x)]\E_{x \sim p}[f(x)] where pp is hard or expensive to sample from, but easy to evaluate the likelihood p(x)p(x) of. Suppose that we can easily sample from a different distribution qq. Since an expectation is just a weighted average, we can sample xx from qq, compute f(x)f(x), and then reweight the results: if xx is very likely under pp but unlikely under qq, we should boost its weighting, and if it is common under qq but uncommon under pp, we should lower its weighting. The reweighting factor is exactly the likelihood ratio between the target distribution pp and the sampling distribution qq:


Exp[f(x)]=xXf(x)p(x)=xXf(x)p(x)q(x)q(x)=Exq[p(x)q(x)f(x)].\E_{x \sim p}[f(x)] = \sum_{x \in \mathcal{X}} f(x) p(x) = \sum_{x \in \mathcal{X}} f(x) \frac{p(x)}{q(x)} q(x) = \E_{x \sim q} \left[ \frac{p(x)}{q(x)} f(x) \right].

Doesn’t this seem too good to be true? If there were no drawbacks, we could use this to estimate any expectation of any function on any arbitrary distribution! The drawback is that the variance may be very large due to the likelihood ratio term. If there are values of xx that are very rare in the sampling distribution qq, but common under pp, -then the likelihood ratio p(x)/q(x)p(x)/q(x) will cause the variance to blow up.
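As a toy numerical illustration (the distributions and test function here are arbitrary choices for this sketch, not from the text), we can estimate an expectation under pp using only samples from qq and the likelihood ratio:

import jax
import jax.numpy as jnp

def normal_pdf(x, mean, std):
    return jnp.exp(-0.5 * ((x - mean) / std) ** 2) / (std * jnp.sqrt(2 * jnp.pi))

f = lambda x: x ** 2
key = jax.random.PRNGKey(0)
x = 2.0 * jax.random.normal(key, (100_000,))  # samples from q = N(0, 2)

# Reweight by the likelihood ratio p(x) / q(x), where p = N(3, 1).
weights = normal_pdf(x, 3.0, 1.0) / normal_pdf(x, 0.0, 2.0)
estimate = jnp.mean(weights * f(x))
print(estimate)  # ≈ E_{x ~ N(3, 1)}[x²] = 10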


4The REINFORCE policy gradient

Returning to RL, suppose there is some trajectory distribution ρ(τ)\rho(\tau) that is easy to sample from, such as a database of existing trajectories. We can then rewrite J(θ)\nabla J(\theta), a.k.a. the policy gradient, as follows. -All gradients are being taken with respect to θ.

\begin{aligned}
\nabla J(\theta) & = \nabla \E_{\tau \sim \rho_\theta} [ R(\tau) ] \\
& = \nabla \E_{\tau \sim \rho} \left[ \frac{\rho_\theta(\tau)}{\rho(\tau)} R(\tau) \right] & & \text{likelihood ratio trick} \\
& = \E_{\tau \sim \rho} \left[ \frac{\nabla \rho_\theta(\tau)}{\rho(\tau)} R(\tau) \right] & & \text{switching gradient and expectation}
\end{aligned}

Note that for ρ=ρθ\rho = \rho_\theta, the inside term becomes

\nabla J(\theta) = \E_{\tau \sim \rho_\theta} [ \nabla \log \rho_\theta(\tau) \cdot R(\tau)].

(The order of operations is (logρθ)(τ)\nabla (\log \rho_\theta)(\tau).)

Recall that when the state transitions are Markov (i.e. sts_{t} only depends on st1,at1s_{t-1}, a_{t-1}) and the policy is time-homogeneous (i.e. ahπθ(sh)a_\hi \sim \pi_\theta (s_\hi)), we can write out the likelihood of a trajectory under the policy πθ\pi_\theta autoregressively, as in Definition 1.5. Taking the log of the trajectory likelihood turns it into a sum of terms:

\log \rho_\theta(\tau) = \log \mu(s_0) + \sum_{\hi=0}^{\hor-1} \log \pi_\theta(a_\hi \mid s_\hi) + \log P(s_{\hi+1} \mid s_\hi, a_\hi)

When we take the gradient with respect to the parameters θ, only the πθ(ahsh)\pi_\theta(a_\hi | s_\hi) terms depend on θ. This gives the following expression for the policy gradient, known as the “REINFORCE” policy gradient Williams (1992):

\begin{aligned} \nabla J(\theta) = \E_{\tau \sim \rho_\theta} \left[ \sum_{\hi=0}^{\hor-1} \nabla_\theta \log \pi_{\theta}(a_\hi | s_\hi) R(\tau) \right] \end{aligned}

This expression allows us to estimate the gradient by sampling a few sample trajectories from πθ,\pi_\theta, calculating the likelihoods of the chosen actions, and substituting these into the expression inside the brackets of (18). Then we can update the parameters θ in this direction to perform stochastic gradient ascent.

The rest of this chapter investigates ways to reduce the variance of this estimator by subtracting off certain correlated quantities.

def estimate_gradient_reinforce_pseudocode(env, π, θ):
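    # Monte Carlo estimate of the REINFORCE policy gradient from one sampled trajectory.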
     τ = sample_trajectory(env, π(θ))
     gradient_hat = 0
     for s, a, r in τ:
         def policy_log_likelihood(θ):
             return log(π(θ)(s, a))
         gradient_hat += jax.grad(policy_log_likelihood)(θ) * τ.total_reward
    return gradient_hat



For some intuition into how this method works, recall that we update our parameters according to

θt+1=θt+ηJ(θt)=θt+ηEτρθt[logρθt(τ)R(τ)].\begin{aligned} + \theta_{t+1} &= \theta_t + \eta \nabla J(\theta_t) \\ + &= \theta_t + \eta \E_{\tau \sim \rho_{\theta_t}} [\nabla \log \rho_{\theta_t}(\tau) \cdot R(\tau)]. +\end{aligned}

Consider the “good” trajectories where R(τ)R(\tau) is large. Then θ gets updated so that these trajectories become more likely. To see why, recall that ρθ(τ)\rho_{\theta}(\tau) is the likelihood of the trajectory τ under the policy πθ,\pi_\theta, so the gradient points in the direction that makes τ more likely.

5Baselines and advantages

A central idea from supervised learning is the bias-variance decomposition, which shows that the mean squared error of an estimator is the sum of its squared bias and its variance. The REINFORCE gradient estimator (18) is already unbiased, meaning that its expectation over trajectories is the true policy gradient. Can we find ways to reduce its variance as well?


As a first step, +consider that the action taken at step tt does not affect the reward from previous timesteps, since they’re already in the past. +You can also show rigorously that this is the case, +and that we only need to consider the present and future rewards to calculate the policy gradient:

J(θ)=Eτρθ[h=0H1θlogπθ(ahsh)h=hH1r(sh,ah)]\nabla J(\theta) = \E_{\tau \sim \rho_\theta} \left[ \sum_{\hi=0}^{\hor-1} \nabla_\theta \log \pi_{\theta}(a_\hi | s_\hi) \sum_{\hi' = \hi}^{\hor-1} r(s_{\hi'}, a_{\hi'}) \right]

Furthermore, by a conditioning argument, we can replace the inner sum over remaining rewards with the policy’s Q-function, +evaluated at the current state:

J(θ)=Eτρθ[h=0H1θlogπθ(ahsh)Qπθ(sh,ah)]\nabla J(\theta) = \E_{\tau \sim \rho_\theta} \left[ \sum_{\hi=0}^{\hor-1} \nabla_\theta \log \pi_{\theta}(a_\hi | s_\hi) Q^{\pi_\theta}(s_{\hi}, a_{\hi}) \right]

Exercise: Prove that this is equivalent to the previous definitions. What modification to the expression must be made for the discounted, infinite-horizon setting?
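For implementation purposes, the inner sums of future rewards (the “reward-to-go” terms) can be computed in a single backward pass; a minimal sketch (the helper name is ours, not from the text):

import jax.numpy as jnp

def reward_to_go(rewards: jnp.ndarray) -> jnp.ndarray:
    """rewards has shape (H,); entry h is the sum of rewards from step h onward."""
    return jnp.cumsum(rewards[::-1])[::-1]

print(reward_to_go(jnp.array([1.0, 2.0, 3.0])))  # [6. 5. 3.]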

We can further reduce variance by subtracting a baseline function bh:SRb_\hi : \mathcal{S} \to \mathbb{R} at each timestep h\hi. +This modifies the policy gradient as follows:

\nabla J(\theta) = \E_{\tau \sim \rho_\theta} \left[ \sum_{\hi=0}^{H-1} \nabla \log \pi_\theta (a_\hi | s_\hi) \left( Q^{\pi_\theta}(s_\hi, a_\hi) - b_\hi(s_\hi) \right) \right].

(Again, you should try to prove that this equality still holds.) For example, we might want bhb_\hi to estimate the average reward-to-go at a given timestep:

b_\hi^\theta = \E_{\tau \sim \rho_\theta} R_\hi(\tau).

As a better baseline, we could instead choose the value function. Note that the random variable Qhπ(s,a)Vhπ(s),Q^\pi_\hi(s, a) - V^\pi_\hi(s), where the randomness is taken over the actions, is centered around zero. (Recall Vhπ(s)=EaπQhπ(s,a).V^\pi_\hi(s) = \E_{a \sim \pi} Q^\pi_\hi(s, a).) In fact, this quantity has a particular name: the advantage function. This measures how much better this action does than the average for that policy. (Note that for an optimal policy π,\pi^\star, the advantage of a given state-action pair is always zero or negative.)

We can now express the policy gradient as follows. Note that the advantage function effectively replaces the QQ-function from (6.22):

\nabla J(\theta) = \E_{\tau \sim \rho_\theta} \left[ \sum_{\hi=0}^{\hor-1} \nabla \log \pi_\theta(a_\hi | s_\hi) A^{\pi_\theta}_\hi (s_\hi, a_\hi) \right].

Note that to avoid correlations between the gradient estimator and the value estimator (i.e. baseline), we must estimate them with independently sampled trajectories:

6.6Comparing policy gradient algorithms to policy iteration

What advantages does the policy gradient algorithm have over Section 1.5.3.2?

To analyze the difference between them, we’ll make use of the performance difference lemma, which provides an expression for comparing the difference between two value functions.

as desired. (Note that the “inner” expectation from expanding the advantage function has the same distribution as the outer one, so omitting it here is valid.)

The PDL gives insight into why fitted approaches such as PI don’t work as well in the “full” RL setting. To see why, let’s consider a single iteration of policy iteration, where policy π gets updated to π~\tilde \pi. We’ll assume these policies are deterministic. Suppose the new policy π~\tilde \pi chooses some action with a negative advantage with respect to π. That is, when acting according to π, taking the action from π~\tilde \pi would perform worse than expected. Define Δ\Delta_\infty to be the most negative advantage, that is, Δ=minsSAhπ(s,π~(s))\Delta_\infty = \min_{s \in \mathcal{S}} A^{\pi}_\hi(s, \tilde \pi(s)). Plugging this into Theorem 1 gives


V0π~(s)V0π(s)=Eτρπ~,s[h=0H1Ahπ(sh,ah)]HΔV0π~(s)V0π(s)HΔ.\begin{aligned} V_0^{\tilde \pi}(s) - V_0^{\pi}(s) &= \E_{\tau \sim \rho_{\tilde \pi, s}} \left[ \sum_{\hi=0}^{\hor-1} A_\hi^{\pi}(s_\hi, a_\hi) \right] \\ &\ge H \Delta_\infty \\ V_0^{\tilde \pi}(s) &\ge V_0^{\pi}(s) - H|\Delta_\infty|. -\end{aligned}

That is, for some state ss, the lower bound on the performance of π~\tilde \pi is lower than the performance of π. +\end{aligned}

That is, for some state ss, the lower bound on the performance of π~\tilde \pi is lower than the performance of π. This doesn’t state that π~\tilde \pi will necessarily perform worse than π, only suggests that it might be possible. If these worst case states do exist, though, @@ -286,27 +287,27 @@ Then, by adjusting the parameters only a small distance, the new policy will also have a similar trajectory distribution. But this is not very rigorous, and in practice the parameter-to-distribution mapping may not be so smooth. -Can we constrain the distance between the resulting distributions more explicitly?


This brings us to the next three methods:

  • trust region policy optimization (TRPO), which explicitly constrains the difference between the distributions before and after each step;
  • the natural policy gradient (NPG), a first-order approximation of TRPO;
  • proximal policy optimization (PPO), a “soft relaxation” of TRPO.

7Trust region policy optimization

We saw above that policy gradient methods are effective because they implicitly constrain how much the policy changes at each iteration. Can we design an algorithm that explicitly constrains the “step size”? That is, we want to improve the policy as much as possible, -measured in terms of the r.h.s. of the Theorem 6.1, -while ensuring that its trajectory distribution does not change too much:

θk+1argmaxθoptEs0,,sH1πk[h=0H1Eahπθopt(sh)Aπk(sh,ah)]where distance(ρθopt,ρθk)<δ\begin{aligned} +measured in terms of the r.h.s. of the Theorem 1, +while ensuring that its trajectory distribution does not change too much:

θk+1argmaxθoptEs0,,sH1πk[h=0H1Eahπθopt(sh)Aπk(sh,ah)]where distance(ρθopt,ρθk)<δ\begin{aligned} \theta^{k+1} &\gets \arg\max_{\theta^{\text{opt}}} \E_{s_0, \dots, s_{H-1} \sim \pi^{k}} \left[ \sum_{\hi=0}^{\hor-1} \E_{a_\hi \sim \pi^{\theta^\text{opt}}(s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi) \right] \\ & \text{where } \text{distance}(\rho_{\theta^{\text{opt}}}, \rho_{\theta^k}) < \delta -\end{aligned}


Note that we have made a small change to the r.h.s. expression: we use the states sampled from the old policy, and only use the actions from the new policy. It would be computationally infeasible to sample entire trajectories from πθ\pi_\theta as we are optimizing over θ. On the other hand, if πθ\pi_\theta returns a vector representing a probability distribution over actions, then evaluating the expected advantage with respect to this distribution only requires taking a dot product. This approximation also matches the r.h.s. of the PDL to first order in θ. (We will elaborate more on this later.)

How do we describe the distance between ρθopt\rho_{\theta^{\text{opt}}} and ρθk\rho_{\theta^k}? -We’ll use the Kullback-Leibler divergence (KLD):
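As a quick computational reference (a minimal sketch for discrete distributions given as probability vectors; names are illustrative), the KL divergence can be computed as:

import jax.numpy as jnp

def kl_divergence(p: jnp.ndarray, q: jnp.ndarray) -> jnp.ndarray:
    """KL(p ∥ q) for discrete distributions represented as probability vectors."""
    return jnp.sum(p * (jnp.log(p) - jnp.log(q)))

p = jnp.array([0.9, 0.1])
q = jnp.array([0.5, 0.5])
print(kl_divergence(p, q))  # ≈ 0.368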

Though the NPG now gives a closed-form optimization step, it requires computing the inverse Fisher information matrix, which typically scales as O((dimΘ)3)O((\dim \Theta)^3). This can be expensive if the parameter space is large. Can we find an algorithm that works in linear time with respect to the dimension of the parameter space?


9Proximal policy optimization

We can relax the TRPO optimization problem in a different way: Rather than imposing a hard constraint on the KL distance, -we can instead impose a soft constraint by incorporating it into the objective and penalizing parameter values that drastically change the trajectory distribution.

θk+1argmaxθEs0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λKL(ρθρθk)\begin{aligned} +we can instead impose a soft constraint by incorporating it into the objective and penalizing parameter values that drastically change the trajectory distribution.

θk+1argmaxθEs0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λKL(ρθρθk)\begin{aligned} \theta^{k+1} &\gets \arg\max_{\theta} \E_{s_0, \dots, s_{H-1} \sim \rho_{\pi^{k}}} \left[ \sum_{\hi=0}^{\hor-1} \E_{a_\hi \sim \pi_{\theta}(s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi) \right] - \lambda \kl{\rho_{\theta}}{\rho_{\theta^k}} -\end{aligned}


Here λ is a regularization hyperparameter that controls the tradeoff between the two terms. This is the objective of the proximal policy optimization algorithm Schulman et al. (2017).

Like the original TRPO algorithm Definition 4, +PPO is not gradient-based; rather, at each step, we try to maximize local advantage relative to the current policy.

How do we solve this optimization? +Let us begin by simplifying the KL(ρπkρπθ)\kl{\rho_{\pi^k}}{\rho_{\pi_{\theta}}} term. Expanding gives

KL(ρπkρπθ)=Eτρπk[logρπk(τ)ρπθ(τ)]=Eτρπk[h=0H1logπk(ahsh)πθ(ahsh)]state transitions cancel=Eτρπk[h=0H1log1πθ(ahsh)]+c\begin{aligned} \kl{\rho_{\pi^k}}{\rho_{\pi_{\theta}}} & = \E_{\tau \sim \rho_{\pi^k}} \left[\log \frac{\rho_{\pi^k}(\tau)}{\rho_{\pi_{\theta}}(\tau)}\right] \\ & = \E_{\tau \sim \rho_{\pi^k}} \left[ \sum_{h=0}^{H-1} \log \frac{\pi^k(a_\hi \mid s_\hi)}{\pi_{\theta}(a_\hi \mid s_\hi)}\right] & \text{state transitions cancel} \\ & = \E_{\tau \sim \rho_{\pi^k}} \left[ \sum_{h=0}^{H-1} \log \frac{1}{\pi_{\theta}(a_\hi \mid s_\hi)}\right] + c -\end{aligned}


where cc is some constant with respect to θ, and can be ignored. This gives the objective

k(θ)=Es0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λEτρπk[h=0H1log1πθ(ahsh)]\ell^k(\theta) = -\E_{s_0, \dots, s_{H-1} \sim \rho_{\pi^{k}}} \left[ \sum_{\hi=0}^{\hor-1} \E_{a_\hi \sim \pi_{\theta}(s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi) \right] - \lambda \E_{\tau \sim \rho_{\pi^k}} \left[ \sum_{h=0}^{H-1} \log \frac{1}{\pi_{\theta}(a_\hi \mid s_\hi)}\right]

Once again, this takes an expectation over trajectories. +\E_{s_0, \dots, s_{H-1} \sim \rho_{\pi^{k}}} \left[ \sum_{\hi=0}^{\hor-1} \E_{a_\hi \sim \pi_{\theta}(s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi) \right] - \lambda \E_{\tau \sim \rho_{\pi^k}} \left[ \sum_{h=0}^{H-1} \log \frac{1}{\pi_{\theta}(a_\hi \mid s_\hi)}\right]

Once again, this takes an expectation over trajectories. But here we cannot directly sample trajectories from πk\pi^k, since in the first term, the actions actually come from πθ\pi_\theta. To make this term line up with the other expectation, we would need the actions to also come from πk\pi^k.

This should sound familiar: we want to estimate an expectation over one distribution by sampling from another. We can once again use Section 3.2 to rewrite the inner expectation:

Eahπθ(sh)Aπk(sh,ah)=Eahπk(sh)πθ(ahsh)πk(ahsh)Aπk(sh,ah)\E_{a_\hi \sim \pi_{\theta}(s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi) +We can once again use Section 3.2 to rewrite the inner expectation:

Eahπθ(sh)Aπk(sh,ah)=Eahπk(sh)πθ(ahsh)πk(ahsh)Aπk(sh,ah)\E_{a_\hi \sim \pi_{\theta}(s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi) = -\E_{a_\hi \sim \pi^k(s_\hi)} \frac{\pi_\theta(a_\hi \mid s_\hi)}{\pi^k(a_\hi \mid s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi)

Now we can combine the expectations together to get the objective

k(θ)=Eτρπk[h=0H1(πθ(ahsh)πk(ahsh)Aπk(sh,ah)λlog1πθ(ahsh))]\ell^k(\theta) = \E_{\tau \sim \rho_{\pi^k}} \left[ \sum_{h=0}^{H-1} \left( \frac{\pi_\theta(a_\hi \mid s_\hi)}{\pi^k(a_\hi \mid s_\hi)} A^{\pi^k}(s_\hi, a_\hi) - \lambda \log \frac{1}{\pi_\theta(a_\hi \mid s_\hi)} \right) \right]

Now we can estimate this function by a sample average over trajectories from πk\pi^k. +\E_{a_\hi \sim \pi^k(s_\hi)} \frac{\pi_\theta(a_\hi \mid s_\hi)}{\pi^k(a_\hi \mid s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi)

Now we can combine the expectations together to get the objective

k(θ)=Eτρπk[h=0H1(πθ(ahsh)πk(ahsh)Aπk(sh,ah)λlog1πθ(ahsh))]\ell^k(\theta) = \E_{\tau \sim \rho_{\pi^k}} \left[ \sum_{h=0}^{H-1} \left( \frac{\pi_\theta(a_\hi \mid s_\hi)}{\pi^k(a_\hi \mid s_\hi)} A^{\pi^k}(s_\hi, a_\hi) - \lambda \log \frac{1}{\pi_\theta(a_\hi \mid s_\hi)} \right) \right]

Now we can estimate this function by a sample average over trajectories from πk\pi^k. Remember that to complete a single iteration of PPO, -we execute

+we execute

θk+1argmaxθk(θ).\theta^{k+1} \gets \arg\max_{\theta} \ell^k(\theta).

If k\ell^k is differentiable, we can optimize it by gradient ascent, completing a single iteration of PPO.

def ppo_pseudocode(
     env,
     π: Callable[[Params], Callable[[State, Action], Float]],
     λ: float,
@@ -437,9 +440,21 @@
         
         θ = optimize(objective, θ)
 
-    return θ

6.10Summary

Policy gradient methods are a powerful family of algorithms that directly optimize the total reward by iteratively updating the policy parameters.

TODO

  • Vanilla policy gradient
  • Baselines and advantages
  • Trust region policy optimization
  • Natural policy gradient
  • Proximal policy optimization
References
  1. Boyd, S., & Vandenberghe, L. (2004). Convex Optimization. Cambridge University Press.
\ No newline at end of file diff --git a/pg.json b/pg.json index 7f97b99..35b0208 100644 --- a/pg.json +++ b/pg.json @@ -1 +1 @@
","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"y6rjUL0LgE"},{"type":"inlineMath","value":"\\eta > 0","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"html":"η>0\\eta > 0η>0","key":"rTg8mNkLqN"},{"type":"text","value":" is a “step size” hyperparameter that controls the size of the steps we take.\n(Note that we could also vary the step size across iterations, that is, ","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"i8s1Bx0PzN"},{"type":"inlineMath","value":"\\eta^0, \\dots, \\eta^K","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"html":"η0,,ηK\\eta^0, \\dots, \\eta^Kη0,,ηK","key":"HYOQTiIlNP"},{"type":"text","value":".)","position":{"start":{"line":123,"column":1},"end":{"line":123,"column":1}},"key":"KVRkrgkxIR"}],"key":"hzUTiuTjmW"},{"type":"paragraph","position":{"start":{"line":126,"column":1},"end":{"line":127,"column":1}},"children":[{"type":"text","value":"The case of a two-dimensional input is easy to visualize.\nBut this idea can be straightforwardly extended to higher-dimensional inputs.","position":{"start":{"line":126,"column":1},"end":{"line":126,"column":1}},"key":"oYVBJveW9L"}],"key":"BYcRNfRZap"},{"type":"paragraph","position":{"start":{"line":129,"column":1},"end":{"line":130,"column":1}},"children":[{"type":"text","value":"From now on, we’ll use ","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"Q2ADU6u0JE"},{"type":"inlineMath","value":"J","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"html":"JJJ","key":"NvWfjMgpet"},{"type":"text","value":" to denote the function we’re trying to maximize,\nand ","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"q2sEBpyeea"},{"type":"text","value":"θ","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"nVbc9JXmwz"},{"type":"text","value":" to denote the parameters being optimized over. 
(In the above example, ","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"DLWVKql14N"},{"type":"inlineMath","value":"\\theta = \\begin{pmatrix} x & z \\end{pmatrix}^\\top","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"html":"θ=(xz)\\theta = \\begin{pmatrix} x & z \\end{pmatrix}^\\topθ=(xz)","key":"nALBHRMvIZ"},{"type":"text","value":").","position":{"start":{"line":129,"column":1},"end":{"line":129,"column":1}},"key":"sbVo3mOpMU"}],"key":"ZhUMDHQXel"},{"type":"paragraph","position":{"start":{"line":132,"column":1},"end":{"line":136,"column":1}},"children":[{"type":"text","value":"Notice that our parameters will stop changing once ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"sBsdMsNX2h"},{"type":"inlineMath","value":"\\nabla J(\\theta) = 0.","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"J(θ)=0.\\nabla J(\\theta) = 0.J(θ)=0.","key":"WKdQs4a5lx"},{"type":"text","value":"\nOnce we reach this ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"z24ulSBCdz"},{"type":"strong","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"stationary point,","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"yo8CqfE0tJ"}],"key":"PwDoEPwEza"},{"type":"text","value":" our current parameters are ‘locally optimal’ in some sense;\nit’s impossible to increase the function by moving in any direction.\nIf ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"LpLoP3pg6H"},{"type":"inlineMath","value":"J","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"JJJ","key":"xWAleRXTws"},{"type":"text","value":" is ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"wB8rnvJz5s"},{"type":"emphasis","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"convex","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"BLVfnzDLhh"}],"key":"biOMeo7YOt"},{"type":"text","value":", then the only point where this happens is at the ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"toETehZtve"},{"type":"emphasis","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"global optimum.","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"dhnAzg3beg"}],"key":"JFnfDdvHZt"},{"type":"text","value":"\nOtherwise, if ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"a69xNgX3bH"},{"type":"inlineMath","value":"J","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"html":"JJJ","key":"NELKrLfsSG"},{"type":"text","value":" is nonconvex, the best we can hope for is a ","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"IltAOB2pEv"},{"type":"emphasis","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"local 
optimum.","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"l8LsxyqWFE"}],"key":"e9a2KrbpMT"}],"key":"w1KMg0g0UC"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"WAQ5lVSx6K"}],"key":"wmgMLfGEIR"},{"type":"paragraph","position":{"start":{"line":139,"column":1},"end":{"line":139,"column":1}},"children":[{"type":"text","value":"How does a computer compute the gradient of a function?","position":{"start":{"line":139,"column":1},"end":{"line":139,"column":1}},"key":"sJAMUgPRTV"}],"key":"Jr9WtoFj2c"},{"type":"paragraph","position":{"start":{"line":141,"column":1},"end":{"line":145,"column":1}},"children":[{"type":"text","value":"One way is ","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"vauNfvBuYV"},{"type":"emphasis","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"children":[{"type":"text","value":"symbolic differentiation,","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"b1j619m1EW"}],"key":"a6DMIKs89c"},{"type":"text","value":"\nwhich is similar to the way you might compute it by hand:\nthe computer applies a list of rules to transform the ","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"RxlsOO3qU1"},{"type":"emphasis","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"children":[{"type":"text","value":"symbols","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"ntDIgS7g0Q"}],"key":"JaJ4kigtnI"},{"type":"text","value":" involved.\nPython’s ","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"HDXlt3zO2o"},{"type":"inlineCode","value":"sympy","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"PSARloLQVh"},{"type":"text","value":" package supports symbolic differentiation.\nHowever, functions implemented in code may not always have a straightforward symbolic representation.","position":{"start":{"line":141,"column":1},"end":{"line":141,"column":1}},"key":"Dizf0pTYGH"}],"key":"eksExAXGHN"},{"type":"paragraph","position":{"start":{"line":147,"column":1},"end":{"line":148,"column":1}},"children":[{"type":"text","value":"Another way is ","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"bBU2emTmVC"},{"type":"emphasis","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"children":[{"type":"text","value":"numerical differentiation,","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"mLix3YvwaI"}],"key":"woyEr7fX6b"},{"type":"text","value":"\nwhich is based on the limit definition of a (directional) derivative:","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"RhFLouQqhJ"}],"key":"Byj69QeMlM"},{"type":"math","value":"\\nabla_{\\boldsymbol{u}} J(\\boldsymbol{x}) = \\lim_{\\varepsilon \\to 0}\n\\frac{J(\\boldsymbol{x} + \\varepsilon \\boldsymbol{u}) - J(\\boldsymbol{x})}{\\varepsilon}","position":{"start":{"line":150,"column":1},"end":{"line":153,"column":1}},"html":"uJ(x)=limε0J(x+εu)J(x)ε\\nabla_{\\boldsymbol{u}} J(\\boldsymbol{x}) = \\lim_{\\varepsilon \\to 0}\n\\frac{J(\\boldsymbol{x} + \\varepsilon \\boldsymbol{u}) - 
J(\\boldsymbol{x})}{\\varepsilon}uJ(x)=ε0limεJ(x+εu)J(x)","enumerator":"6.3","key":"v7Y3DNbeOl"},{"type":"paragraph","position":{"start":{"line":155,"column":1},"end":{"line":160,"column":1}},"children":[{"type":"text","value":"Then, we can substitute a small value of ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"XojZYYgZA6"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"ε\\varepsilonε","key":"LS3luvf6SB"},{"type":"text","value":" on the r.h.s. to approximate the directional derivative.\nHow small, though? If we need an accurate estimate,\nwe may need such a small value of ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"F05vJwOCHD"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"ε\\varepsilonε","key":"kHN4nK5sDh"},{"type":"text","value":" that typical computers will run into rounding errors.\nAlso, to compute the full gradient,\nwe would need to compute the r.h.s. once for each input dimension.\nThis is an issue if computing ","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"rpvJ2TeNca"},{"type":"inlineMath","value":"J","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"html":"JJJ","key":"H2dxuMRsDC"},{"type":"text","value":" is expensive.","position":{"start":{"line":155,"column":1},"end":{"line":155,"column":1}},"key":"GAXV2CRKt5"}],"key":"Nwc4buVDUd"},{"type":"paragraph","position":{"start":{"line":162,"column":1},"end":{"line":169,"column":1}},"children":[{"type":"strong","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"children":[{"type":"text","value":"Automatic differentiation","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"key":"OkkgdhVffS"}],"key":"W9Em3SyPXX"},{"type":"text","value":" achieves the best of both worlds.\nLike symbolic differentiation,\nwe manually implement the derivative rules for a few basic operations.\nHowever, instead of executing these on the ","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"key":"PE5xfOH3aZ"},{"type":"emphasis","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"children":[{"type":"text","value":"symbols","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"key":"jWvmiUVKrH"}],"key":"NTY6qSvYKT"},{"type":"text","value":",\nwe execute them on the ","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"key":"ytfxJE0z1o"},{"type":"emphasis","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"children":[{"type":"text","value":"values","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"key":"RxN40tgCHf"}],"key":"bSjzLnrwV2"},{"type":"text","value":" when the function gets called,\nlike in numerical differentiation.\nThis allows us to differentiate through programming constructs such as branches or loops,\nand doesn’t involve any arbitrarily small values.","position":{"start":{"line":162,"column":1},"end":{"line":162,"column":1}},"key":"tNvG9TClbN"}],"key":"FeWXAgKdwa"}],"key":"c6auZoFMYA"}],"key":"ICh8m1PKbb"},{"type":"block","position":{"start":{"line":172,"column":1},"end":{"line":172,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"children":[{"type":"text","value":"Stochastic 
gradient ascent","position":{"start":{"line":174,"column":1},"end":{"line":174,"column":1}},"key":"C2DocgyfJr"}],"identifier":"stochastic-gradient-ascent","label":"Stochastic gradient ascent","html_id":"stochastic-gradient-ascent","implicit":true,"enumerator":"6.2.1","key":"H1ZQWTnogb"},{"type":"paragraph","position":{"start":{"line":176,"column":1},"end":{"line":182,"column":1}},"children":[{"type":"text","value":"In real applications,\ncomputing the gradient of the target function is not so simple.\nAs an example from supervised learning, ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"QEKbGTZ90F"},{"type":"inlineMath","value":"J(\\theta)","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"html":"J(θ)J(\\theta)J(θ)","key":"f59xmXWh0f"},{"type":"text","value":" might be the sum of squared prediction errors across an entire training dataset.\nHowever, if our dataset is very large, it might not fit into our computer’s memory!\nIn these cases, we often compute some ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"M8cgemzsXU"},{"type":"emphasis","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"estimate","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"ZAesgBEMOs"}],"key":"LFjwVILabc"},{"type":"text","value":" of the gradient at each step, ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"hSIy5moO9P"},{"type":"inlineMath","value":"\\tilde \\nabla J(\\theta)","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"html":"~J(θ)\\tilde \\nabla J(\\theta)~J(θ)","key":"FziwOaiUK6"},{"type":"text","value":", and walk in that direction instead.\nThis is called ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"HoOYe4QzqS"},{"type":"strong","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"stochastic","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"DW9lCsGpQk"}],"key":"afKFeHNPSX"},{"type":"text","value":" gradient ascent.\nIn the SL example above, we might randomly choose a ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"elvv57vb1I"},{"type":"emphasis","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"minibatch","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"N1soQqLhIi"}],"key":"b7WrtclcnA"},{"type":"text","value":" of samples and use them to estimate the true prediction error. 
(This approach is known as ","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"PijtCLnskw"},{"type":"strong","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"children":[{"type":"text","value":"minibatch","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"ROgJxktYFo"}],"key":"KyV0FzX2hi"},{"type":"text","value":" SGD","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"v2WtCOj6ey"}],"key":"vCgCCUeRb3"},{"type":"text","value":".)","position":{"start":{"line":176,"column":1},"end":{"line":176,"column":1}},"key":"hGfS67AShk"}],"key":"NSawrUOCpx"}],"key":"ffWzfQiDg2"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def sgd(\n θ_init: Array,\n estimate_gradient: Callable[[Array], Array],\n η: float,\n n_steps: int,\n):\n \"\"\"Perform `n_steps` steps of SGD.\n\n `estimate_gradient` eats the current parameters and returns an estimate of the objective function's gradient at those parameters.\n \"\"\"\n θ = θ_init\n for step in range(n_steps):\n θ += η * estimate_gradient(θ)\n return θ","key":"iHZWvbD6uc"},{"type":"output","id":"QLP7QRmVGLJr60aFiETa4","data":[],"key":"Peq0hnExae"}],"data":{},"key":"xNK67nXSsi"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":201,"column":1},"end":{"line":202,"column":1}},"children":[{"type":"text","value":"What makes one gradient estimator better than another?\nIdeally, we want this estimator to be ","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"UJVvGHeT8D"},{"type":"strong","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"children":[{"type":"text","value":"unbiased;","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"QN6I1Ci3cK"}],"key":"te6F6GbhiX"},{"type":"text","value":" that is, on average, it matches a single true gradient step:","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"Dlakv5ZVUB"}],"key":"BWPC6Su8yJ"},{"type":"math","value":"\\E [\\tilde \\nabla J(\\theta)] = \\nabla J(\\theta).","position":{"start":{"line":204,"column":1},"end":{"line":206,"column":1}},"html":"E[~J(θ)]=J(θ).\\E [\\tilde \\nabla J(\\theta)] = \\nabla J(\\theta).E[~J(θ)]=J(θ).","enumerator":"6.4","key":"X1GxVjYEod"},{"type":"paragraph","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"children":[{"type":"text","value":"We also want the ","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"nXFXZ7FpZ5"},{"type":"emphasis","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"children":[{"type":"text","value":"variance","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"oYHApY27KT"}],"key":"ZJxDSRzYa2"},{"type":"text","value":" of the estimator to be low so that its performance doesn’t change drastically at each step.","position":{"start":{"line":208,"column":1},"end":{"line":208,"column":1}},"key":"a6bcHK1kXr"}],"key":"qsWZB81Rhg"},{"type":"paragraph","position":{"start":{"line":210,"column":1},"end":{"line":211,"column":1}},"children":[{"type":"text","value":"We can actually show that, for many “nice” functions, in a finite number of steps, SGD will find a 
","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"uDk9va3iRK"},{"type":"text","value":"θ","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"tiTXkAwJhd"},{"type":"text","value":" that is “close” to a stationary point.\nIn another perspective, for such functions, the local “landscape” of ","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"qwIX5pYaMZ"},{"type":"inlineMath","value":"J","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"html":"JJJ","key":"cR6UlVYDRB"},{"type":"text","value":" around ","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"gvLptkIpwd"},{"type":"text","value":"θ","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"gKYWVP0FGp"},{"type":"text","value":" becomes flatter and flatter the longer we run SGD.","position":{"start":{"line":210,"column":1},"end":{"line":210,"column":1}},"key":"xfRi0mm6J8"}],"key":"PCLThRrNYN"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"SGD convergence","position":{"start":{"line":213,"column":1},"end":{"line":213,"column":1}},"key":"ThJzawORqE"}],"key":"S9igq42j13"},{"type":"paragraph","position":{"start":{"line":214,"column":1},"end":{"line":217,"column":1}},"children":[{"type":"text","value":"More formally, suppose we run SGD for ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"U2OYwS2R7O"},{"type":"inlineMath","value":"K","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"KKK","key":"nyP42lCSfq"},{"type":"text","value":" steps, using an unbiased gradient estimator.\nLet the step size ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"hFsEI0zBhy"},{"type":"inlineMath","value":"\\eta^k","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"ηk\\eta^kηk","key":"XM2nNCGs2T"},{"type":"text","value":" scale as ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"BZbb7i1ilp"},{"type":"inlineMath","value":"O(1/\\sqrt{k}).","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"O(1/k).O(1/\\sqrt{k}).O(1/k).","key":"EgSgBwvnRg"},{"type":"text","value":"\nThen if ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"VUIvrFT27r"},{"type":"inlineMath","value":"J","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"JJJ","key":"VJo89leF7Z"},{"type":"text","value":" is bounded and ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"bfQYkpIPRY"},{"type":"text","value":"β","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"jHLke2SGjw"},{"type":"text","value":"-smooth (see below),\nand the ","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"bE81MyliTl"},{"type":"emphasis","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"children":[{"type":"text","value":"norm","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"WkRnaNuBHr"}],"key":"ZeHH6ttHzW"},{"type":"text","value":" of the gradient estimator has a bounded second moment 
","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"key":"m8Saxklx2y"},{"type":"inlineMath","value":"\\sigma^2,","position":{"start":{"line":214,"column":1},"end":{"line":214,"column":1}},"html":"σ2,\\sigma^2,σ2,","key":"tKCQrmZh5p"}],"key":"l3FDPIt2yT"},{"type":"math","value":"\\|\\nabla J(\\theta^K)\\|^2 \\le O \\left( M \\beta \\sigma^2 / K\\right).","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"html":"J(θK)2O(Mβσ2/K).\\|\\nabla J(\\theta^K)\\|^2 \\le O \\left( M \\beta \\sigma^2 / K\\right).∥∇J(θK)2O(σ2/K).","enumerator":"6.5","key":"lWsCbWfT5e"},{"type":"paragraph","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"children":[{"type":"text","value":"We call a function ","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"Nl8ZDLPrDL"},{"type":"text","value":"β","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"taW4b93zgg"},{"type":"text","value":"-smooth if its gradient is Lipschitz continuous with constant ","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"vxon3K8TX8"},{"type":"text","value":"β","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"ykDP4BBxxQ"},{"type":"text","value":":","position":{"start":{"line":221,"column":1},"end":{"line":221,"column":1}},"key":"DEQjZIiZu0"}],"key":"rbhVHntpty"},{"type":"math","value":"\\|\\nabla J(\\theta) - \\nabla J(\\theta')\\| \\le \\beta \\|\\theta - \\theta'\\|.","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"html":"J(θ)J(θ)βθθ.\\|\\nabla J(\\theta) - \\nabla J(\\theta')\\| \\le \\beta \\|\\theta - \\theta'\\|.∥∇J(θ)J(θ)βθθ∥.","enumerator":"6.6","key":"ul7NK6Xvv6"}],"key":"U4Ofu2oZhz"},{"type":"paragraph","position":{"start":{"line":226,"column":1},"end":{"line":226,"column":1}},"children":[{"type":"text","value":"We’ll now see a concrete application of gradient ascent in the context of policy optimization.","position":{"start":{"line":226,"column":1},"end":{"line":226,"column":1}},"key":"YbswJ169EP"}],"key":"ujyVqDSTNN"}],"key":"bk9aAxId2Q"},{"type":"block","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"children":[{"type":"text","value":"Policy (stochastic) gradient ascent","position":{"start":{"line":230,"column":1},"end":{"line":230,"column":1}},"key":"KO7cnqyAsE"}],"identifier":"policy-stochastic-gradient-ascent","label":"Policy (stochastic) gradient ascent","html_id":"policy-stochastic-gradient-ascent","implicit":true,"enumerator":"6.3","key":"Px4e1Ateps"},{"type":"paragraph","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"text","value":"Remember that in RL, the primary goal is to find the ","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"key":"jF5dtyghve"},{"type":"emphasis","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"key":"gSSI0YGBi5"}],"key":"jJ0QphFhRT"},{"type":"text","value":" that achieves the maximimum total reward, which we can express using the value function we defined in 
","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"key":"Uvz8RyXBbt"},{"type":"crossReference","kind":"proof:definition","identifier":"value","label":"value","children":[{"type":"text","value":"Definition ","key":"wU7JUmGgSH"},{"type":"text","value":"1.6","key":"NjfXtJJDRQ"}],"template":"Definition %s","enumerator":"1.6","resolved":true,"html_id":"value","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"tZnwRHEo25"},{"type":"text","value":":","position":{"start":{"line":232,"column":1},"end":{"line":232,"column":1}},"key":"WI6SvY0pGq"}],"key":"bAwXeCJKNq"},{"type":"math","value":"\\begin{aligned}\n J(\\pi) := \\E_{s_0 \\sim \\mu_0} V^{\\pi} (s_0) = & \\E \\sum_{\\hi=0}^{\\hor-1} r_\\hi \\\\\n \\text{where} \\quad & s_0 \\sim \\mu_0 \\\\\n & s_{t+1} \\sim P(s_\\hi, a_\\hi), \\\\\n & a_\\hi = \\pi(s_\\hi) \\\\\n & r_\\hi = r(s_\\hi, a_\\hi).\n\\end{aligned}","label":"objective_fn","identifier":"objective_fn","html":"J(π):=Es0μ0Vπ(s0)=Eh=0H1rhwheres0μ0st+1P(sh,ah),ah=π(sh)rh=r(sh,ah).\\begin{aligned}\n J(\\pi) := \\E_{s_0 \\sim \\mu_0} V^{\\pi} (s_0) = & \\E \\sum_{\\hi=0}^{\\hor-1} r_\\hi \\\\\n \\text{where} \\quad & s_0 \\sim \\mu_0 \\\\\n & s_{t+1} \\sim P(s_\\hi, a_\\hi), \\\\\n & a_\\hi = \\pi(s_\\hi) \\\\\n & r_\\hi = r(s_\\hi, a_\\hi).\n\\end{aligned}J(π):=Es0μ0Vπ(s0)=whereEh=0H1rhs0μ0st+1P(sh,ah),ah=π(sh)rh=r(sh,ah).","enumerator":"6.7","html_id":"objective-fn","key":"lX401GWI87"},{"type":"paragraph","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"children":[{"type":"text","value":"(Note that we’ll continue to work in the ","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"JrwlKKO35x"},{"type":"emphasis","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"children":[{"type":"text","value":"undiscounted, finite-horizon case.","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"F4em7d1bbA"}],"key":"WgbUWVdJzD"},{"type":"text","value":" Analogous results hold for the ","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"v4o5GNzo8l"},{"type":"emphasis","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"children":[{"type":"text","value":"discounted, infinite-horizon case.","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"Wm8QuBnf3b"}],"key":"aIDun9vKBf"},{"type":"text","value":")","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"eDf3ZpOVDX"}],"key":"eJgbJpxtmQ"},{"type":"paragraph","position":{"start":{"line":248,"column":1},"end":{"line":251,"column":1}},"children":[{"type":"text","value":"As shown by the notation, this is exactly the function ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"cLfR3xouZx"},{"type":"inlineMath","value":"J","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"html":"JJJ","key":"vUwn2bFfOR"},{"type":"text","value":" that we want to maximize using gradient ascent.\nWhat does ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"wWYV8cVpyd"},{"type":"text","value":"θ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"vWtIcy61US"},{"type":"text","value":" correspond to, though?\nIn general, 
","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"VCeYEMk4jQ"},{"type":"text","value":"π","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"BT6i4uMHCJ"},{"type":"text","value":" is a function, and optimizing over the space of arbitrary input-output mappings would be intractable.\nInstead, we need to describe ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"kkZtUqPAwB"},{"type":"text","value":"π","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"uo989In5Ny"},{"type":"text","value":" in terms of some finite set of ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"shdUHvO6Z7"},{"type":"emphasis","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"children":[{"type":"text","value":"parameters","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"kxFeRgET1y"}],"key":"Hs0Yu1F3gF"},{"type":"text","value":" ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"nL3dIEbqOm"},{"type":"text","value":"θ","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"ZQ8OVFJqaZ"},{"type":"text","value":".","position":{"start":{"line":248,"column":1},"end":{"line":248,"column":1}},"key":"SGB064jxhY"}],"key":"f7pqNoon5K"}],"key":"mFdHie90xa"},{"type":"block","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"text","value":"Example policy parameterizations","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"WYXmimdfi1"}],"label":"parameterizations","identifier":"parameterizations","html_id":"parameterizations","enumerator":"6.3.1","key":"OXyteNSZ1X"},{"type":"paragraph","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"What are some ways we could parameterize our policy?","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"pLEhc398VS"}],"key":"W2R5HrgTFA"}],"key":"k4J4blViwc"},{"type":"block","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"children":[{"type":"heading","depth":4,"position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"children":[{"type":"text","value":"Tabular representation","position":{"start":{"line":262,"column":1},"end":{"line":262,"column":1}},"key":"EzbE382tsJ"}],"identifier":"tabular-representation","label":"Tabular representation","html_id":"tabular-representation","implicit":true,"enumerator":"6.3.1.1","key":"el3C6N3QA0"},{"type":"paragraph","position":{"start":{"line":264,"column":1},"end":{"line":267,"column":1}},"children":[{"type":"text","value":"If both the state and action spaces are finite, perhaps we could simply learn a preference value ","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"key":"DXrVY0EbUU"},{"type":"inlineMath","value":"\\theta_{s,a}","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"html":"θs,a\\theta_{s,a}θs,a","key":"dulnHG3JQg"},{"type":"text","value":" for each state-action pair.\nThen to turn this into a valid distribution, we perform a 
","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"key":"hPlRPNbp0L"},{"type":"strong","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"children":[{"type":"text","value":"softmax","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"key":"lOxFCn1qtt"}],"key":"WS0tSsb2Gx"},{"type":"text","value":" operation:\nwe exponentiate each of them,\nand then normalize to form a valid distribution:","position":{"start":{"line":264,"column":1},"end":{"line":264,"column":1}},"key":"eacoA79BSg"}],"key":"myA9l8icgs"},{"type":"math","value":"\\pi^\\text{softmax}_\\theta(a | s) = \\frac{\\exp(\\theta_{s,a})}{\\sum_{s,a'} \\exp (\\theta_{s,a'})}.","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"html":"πθsoftmax(as)=exp(θs,a)s,aexp(θs,a).\\pi^\\text{softmax}_\\theta(a | s) = \\frac{\\exp(\\theta_{s,a})}{\\sum_{s,a'} \\exp (\\theta_{s,a'})}.πθsoftmax(as)=s,aexp(θs,a)exp(θs,a).","enumerator":"6.8","key":"zDBUCQvVrp"},{"type":"paragraph","position":{"start":{"line":271,"column":1},"end":{"line":272,"column":1}},"children":[{"type":"text","value":"However, this doesn’t make use of any structure in the states or actions,\nso while this is flexible, it is also prone to overfitting.","position":{"start":{"line":271,"column":1},"end":{"line":271,"column":1}},"key":"iHZND4z6E6"}],"key":"E760LE2ESB"},{"type":"heading","depth":4,"position":{"start":{"line":274,"column":1},"end":{"line":274,"column":1}},"children":[{"type":"text","value":"Linear in features","position":{"start":{"line":274,"column":1},"end":{"line":274,"column":1}},"key":"Sv8JtjqzjY"}],"identifier":"linear-in-features","label":"Linear in features","html_id":"linear-in-features","implicit":true,"enumerator":"6.3.1.2","key":"wz5TjksTwn"},{"type":"paragraph","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"children":[{"type":"text","value":"Another approach is to map each state-action pair into some ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"CMndVQ7Gx9"},{"type":"strong","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"children":[{"type":"text","value":"feature space","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"pwmwzaVXML"}],"key":"hmUB8I6n0T"},{"type":"text","value":" ","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"VJpwhtCHns"},{"type":"inlineMath","value":"\\phi(s, a) \\in \\mathbb{R}^p","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"html":"ϕ(s,a)Rp\\phi(s, a) \\in \\mathbb{R}^pϕ(s,a)Rp","key":"rtG51oe5YM"},{"type":"text","value":". 
Then, to map a feature vector to a probability, we take a linear combination of the features and take a softmax:","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"Ev5g5mNAv5"}],"key":"smPTNb5psg"},{"type":"math","value":"\\pi^\\text{linear in features}_{\\theta}(a|s) = \\frac{\\exp(\\theta^\\top \\phi(s, a))}{\\sum_{a'} \\exp(\\theta^\\top \\phi(s, a'))}.","position":{"start":{"line":278,"column":1},"end":{"line":278,"column":1}},"html":"πθlinear in features(as)=exp(θϕ(s,a))aexp(θϕ(s,a)).\\pi^\\text{linear in features}_{\\theta}(a|s) = \\frac{\\exp(\\theta^\\top \\phi(s, a))}{\\sum_{a'} \\exp(\\theta^\\top \\phi(s, a'))}.πθlinear in features(as)=aexp(θϕ(s,a))exp(θϕ(s,a)).","enumerator":"6.9","key":"X1xOFaFALD"},{"type":"paragraph","position":{"start":{"line":280,"column":1},"end":{"line":280,"column":1}},"children":[{"type":"text","value":"Another interpretation is that ","position":{"start":{"line":280,"column":1},"end":{"line":280,"column":1}},"key":"tgZnfiFhNR"},{"type":"text","value":"θ","position":{"start":{"line":280,"column":1},"end":{"line":280,"column":1}},"key":"b2PCtrys4P"},{"type":"text","value":" represents the feature vector of the “desired” state-action pair, as state-action pairs whose features align closely with ","position":{"start":{"line":280,"column":1},"end":{"line":280,"column":1}},"key":"RcC5zYPMJ9"},{"type":"text","value":"θ","position":{"start":{"line":280,"column":1},"end":{"line":280,"column":1}},"key":"E2MtrJ2qZO"},{"type":"text","value":" are given higher probability.","position":{"start":{"line":280,"column":1},"end":{"line":280,"column":1}},"key":"JWNlvtdyWf"}],"key":"drJs6rthcH"},{"type":"paragraph","position":{"start":{"line":282,"column":1},"end":{"line":282,"column":1}},"children":[{"type":"text","value":"The score function for this parameterization is also quite elegant:","position":{"start":{"line":282,"column":1},"end":{"line":282,"column":1}},"key":"r2YYRyHZHi"}],"key":"IliQ7mGW81"},{"type":"math","value":"\\begin{aligned}\n \\nabla \\log \\pi_\\theta(a|s) &= \\nabla \\left( \\theta^\\top \\phi(s, a) - \\log \\left( \\sum_{a'} \\exp(\\theta^\\top \\phi(s, a')) \\right) \\right) \\\\\n &= \\phi(s, a) - \\E_{a' \\sim \\pi_\\theta(s)} \\phi(s, a')\n\\end{aligned}","position":{"start":{"line":284,"column":1},"end":{"line":289,"column":1}},"html":"logπθ(as)=(θϕ(s,a)log(aexp(θϕ(s,a))))=ϕ(s,a)Eaπθ(s)ϕ(s,a)\\begin{aligned}\n \\nabla \\log \\pi_\\theta(a|s) &= \\nabla \\left( \\theta^\\top \\phi(s, a) - \\log \\left( \\sum_{a'} \\exp(\\theta^\\top \\phi(s, a')) \\right) \\right) \\\\\n &= \\phi(s, a) - \\E_{a' \\sim \\pi_\\theta(s)} \\phi(s, a')\n\\end{aligned}logπθ(as)=(θϕ(s,a)log(aexp(θϕ(s,a))))=ϕ(s,a)Eaπθ(s)ϕ(s,a)","enumerator":"6.10","key":"Clfem0a8o3"},{"type":"paragraph","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"children":[{"type":"text","value":"Plugging this into our policy gradient expression, we get","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"NROZGnladF"}],"key":"dTrsCqQ2Vg"},{"type":"math","value":"\\begin{aligned}\n \\nabla J(\\theta) & = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{t=0}^{T-1} \\nabla \\log \\pi_\\theta(a_\\hi | s_\\hi) A_\\hi^{\\pi_\\theta}\n \\right] \\\\\n & = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{t=0}^{T-1} \\left( \\phi(s_\\hi, a_\\hi) - \\E_{a' \\sim \\pi(s_\\hi)} \\phi(s_\\hi, a') \\right) A_\\hi^{\\pi_\\theta}(s_\\hi, a_\\hi)\n \\right] \\\\\n & = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} 
\\phi(s_\\hi, a_\\hi) A_\\hi^{\\pi_\\theta} (s_\\hi, a_\\hi) \\right]\n\\end{aligned}","position":{"start":{"line":293,"column":1},"end":{"line":302,"column":1}},"html":"J(θ)=Eτρθ[t=0T1logπθ(ahsh)Ahπθ]=Eτρθ[t=0T1(ϕ(sh,ah)Eaπ(sh)ϕ(sh,a))Ahπθ(sh,ah)]=Eτρθ[t=0T1ϕ(sh,ah)Ahπθ(sh,ah)]\\begin{aligned}\n \\nabla J(\\theta) & = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{t=0}^{T-1} \\nabla \\log \\pi_\\theta(a_\\hi | s_\\hi) A_\\hi^{\\pi_\\theta}\n \\right] \\\\\n & = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{t=0}^{T-1} \\left( \\phi(s_\\hi, a_\\hi) - \\E_{a' \\sim \\pi(s_\\hi)} \\phi(s_\\hi, a') \\right) A_\\hi^{\\pi_\\theta}(s_\\hi, a_\\hi)\n \\right] \\\\\n & = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} \\phi(s_\\hi, a_\\hi) A_\\hi^{\\pi_\\theta} (s_\\hi, a_\\hi) \\right]\n\\end{aligned}J(θ)=Eτρθ[t=0T1logπθ(ahsh)Ahπθ]=Eτρθ[t=0T1(ϕ(sh,ah)Eaπ(sh)ϕ(sh,a))Ahπθ(sh,ah)]=Eτρθ[t=0T1ϕ(sh,ah)Ahπθ(sh,ah)]","enumerator":"6.11","key":"lQGPEndM9W"},{"type":"paragraph","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"children":[{"type":"text","value":"Why can we drop the ","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"key":"hGsACzwnJF"},{"type":"inlineMath","value":"\\E \\phi(s_\\hi, a')","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"html":"Eϕ(sh,a)\\E \\phi(s_\\hi, a')Eϕ(sh,a)","key":"vt5w50UBqZ"},{"type":"text","value":" term? By linearity of expectation, consider the dropped term at a single timestep: ","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"key":"SbcjEHjW73"},{"type":"inlineMath","value":"\\E_{\\tau \\sim \\rho_\\theta} \\left[ \\left( \\E_{a' \\sim \\pi(s_\\hi)} \\phi(s, a') \\right) A_\\hi^{\\pi_\\theta}(s_\\hi, a_\\hi) \\right].","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"html":"Eτρθ[(Eaπ(sh)ϕ(s,a))Ahπθ(sh,ah)].\\E_{\\tau \\sim \\rho_\\theta} \\left[ \\left( \\E_{a' \\sim \\pi(s_\\hi)} \\phi(s, a') \\right) A_\\hi^{\\pi_\\theta}(s_\\hi, a_\\hi) \\right].Eτρθ[(Eaπ(sh)ϕ(s,a))Ahπθ(sh,ah)].","key":"X4hcAVMuiq"},{"type":"text","value":" By Adam’s Law, we can wrap the advantage term in a conditional expectation on the state ","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"key":"MCZQmv8zvw"},{"type":"inlineMath","value":"s_\\hi.","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"html":"sh.s_\\hi.sh.","key":"tU1saaG9zI"},{"type":"text","value":" Then we already know that ","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"key":"DaVgS4ll18"},{"type":"inlineMath","value":"\\E_{a \\sim \\pi(s)} A_\\hi^{\\pi}(s, a) = 0,","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"html":"Eaπ(s)Ahπ(s,a)=0,\\E_{a \\sim \\pi(s)} A_\\hi^{\\pi}(s, a) = 0,Eaπ(s)Ahπ(s,a)=0,","key":"JIITrfrTQl"},{"type":"text","value":" and so this entire term vanishes.","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"key":"od9TQQ8oHv"}],"key":"M6vbOH3vDw"},{"type":"heading","depth":4,"position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"children":[{"type":"text","value":"Neural policies","position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"key":"IQvjHw5eEG"}],"identifier":"neural-policies","label":"Neural 
policies","html_id":"neural-policies","implicit":true,"enumerator":"6.3.1.3","key":"TKchd0qQ25"},{"type":"paragraph","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"children":[{"type":"text","value":"More generally, we could map states and actions to unnormalized scores via some parameterized function ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"S5yC7Cpgf7"},{"type":"inlineMath","value":"f_\\theta : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R},","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"html":"fθ:S×AR,f_\\theta : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R},fθ:S×AR,","key":"opxyKoXCvH"},{"type":"text","value":" such as a neural network, and choose actions according to a softmax: ","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"key":"x8z7DHtHf9"}],"key":"gyFuvtJy0S"},{"type":"math","value":"\\pi^\\text{general}_\\theta(a|s) = \\frac{\\exp(f_{\\theta}(s,a))}{\\sum_{a'} \\exp(f_{\\theta}(s,a'))}.","position":{"start":{"line":308,"column":1},"end":{"line":308,"column":1}},"tight":"before","html":"πθgeneral(as)=exp(fθ(s,a))aexp(fθ(s,a)).\\pi^\\text{general}_\\theta(a|s) = \\frac{\\exp(f_{\\theta}(s,a))}{\\sum_{a'} \\exp(f_{\\theta}(s,a'))}.πθgeneral(as)=aexp(fθ(s,a))exp(fθ(s,a)).","enumerator":"6.12","key":"iPceJOQAdR"},{"type":"paragraph","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"children":[{"type":"text","value":"The score can then be written as ","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"key":"Ti1IaaQzty"}],"key":"SuRHqzGuYK"},{"type":"math","value":"\\nabla \\log \\pi_\\theta(a|s) = \\nabla f_\\theta(s, a) - \\E_{a \\sim \\pi_\\theta(s)} \\nabla f_\\theta (s, a')","position":{"start":{"line":310,"column":1},"end":{"line":310,"column":1}},"tight":"before","html":"logπθ(as)=fθ(s,a)Eaπθ(s)fθ(s,a)\\nabla \\log \\pi_\\theta(a|s) = \\nabla f_\\theta(s, a) - \\E_{a \\sim \\pi_\\theta(s)} \\nabla f_\\theta (s, a')logπθ(as)=fθ(s,a)Eaπθ(s)fθ(s,a)","enumerator":"6.13","key":"wvAV2wgBhd"}],"key":"Y7UFsj8IPw"},{"type":"block","position":{"start":{"line":312,"column":1},"end":{"line":312,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"children":[{"type":"text","value":"Continuous action spaces","position":{"start":{"line":314,"column":1},"end":{"line":314,"column":1}},"key":"tIm9p2EuFA"}],"identifier":"continuous-action-spaces","label":"Continuous action spaces","html_id":"continuous-action-spaces","implicit":true,"enumerator":"6.3.2","key":"jjKNdcnrwT"},{"type":"paragraph","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"children":[{"type":"text","value":"Consider a continuous ","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"sjg0nIZswR"},{"type":"inlineMath","value":"n","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"html":"nnn","key":"kQO183Mpdr"},{"type":"text","value":"-dimensional action space ","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"dfXJpAzAR4"},{"type":"inlineMath","value":"\\mathcal{A} = \\mathbb{R}^n","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"html":"A=Rn\\mathcal{A} = \\mathbb{R}^nA=Rn","key":"GXnnsThyr1"},{"type":"text","value":". 
Then for a stochastic policy, we could use a function to predict the ","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"nwL4WO1GcT"},{"type":"emphasis","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"children":[{"type":"text","value":"mean","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"M7p9QCNKeI"}],"key":"V9Cw37tWxw"},{"type":"text","value":" action and then add some random noise about it. For example, we could use a neural network to predict the mean action ","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"nXbUVrmwXH"},{"type":"inlineMath","value":"\\mu_\\theta(s)","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"html":"μθ(s)\\mu_\\theta(s)μθ(s)","key":"zJvYE6Ua4d"},{"type":"text","value":" and then add some noise ","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"kBmQvjgCoD"},{"type":"inlineMath","value":"\\epsilon \\sim \\mathcal{N}(0, \\sigma^2 I)","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"html":"ϵN(0,σ2I)\\epsilon \\sim \\mathcal{N}(0, \\sigma^2 I)ϵN(0,σ2I)","key":"tGXxK1PVKd"},{"type":"text","value":" to it:","position":{"start":{"line":316,"column":1},"end":{"line":316,"column":1}},"key":"zYVGOLCHUz"}],"key":"OYfLJdHYBo"},{"type":"math","value":"\\pi_\\theta(a|s) = \\mathcal{N}(\\mu_\\theta(s), \\sigma^2 I).","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"html":"πθ(as)=N(μθ(s),σ2I).\\pi_\\theta(a|s) = \\mathcal{N}(\\mu_\\theta(s), \\sigma^2 I).πθ(as)=N(μθ(s),σ2I).","enumerator":"6.14","key":"dA2iXqVBEW"},{"type":"comment","value":" **Exercise:** Can you extend the \"linear in features\" policy to continuous action spaces in a similar way? 
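A sketch of such a Gaussian policy, with the mean network replaced by a hypothetical linear map and a fixed noise scale, purely for illustration:

```python
import jax
import jax.numpy as jnp
from jax.scipy.stats import norm

σ = 0.1  # fixed noise scale


def μ(θ, s):
    """Hypothetical mean-action map; in practice this could be a neural network."""
    return θ @ s


def sample_action(θ, s, key):
    """Sample a ~ N(μ_θ(s), σ² I)."""
    mean = μ(θ, s)
    return mean + σ * jax.random.normal(key, mean.shape)


def log_π(θ, s, a):
    """log π_θ(a | s); its gradient with respect to θ is the score of this policy."""
    return norm.logpdf(a, loc=μ(θ, s), scale=σ).sum()
```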
","key":"NdNMvQ0MKa"}],"key":"bysKcVCze7"},{"type":"block","position":{"start":{"line":322,"column":1},"end":{"line":322,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"children":[{"type":"text","value":"Now that we have seen parameterized policies, we can now write the total reward in terms of the parameters:","position":{"start":{"line":324,"column":1},"end":{"line":324,"column":1}},"key":"NaowCUzr0N"}],"key":"es7DyC88Y6"},{"type":"math","value":"J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} R(\\tau).","position":{"start":{"line":326,"column":1},"end":{"line":326,"column":1}},"html":"J(θ)=EτρθR(τ).J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} R(\\tau).J(θ)=EτρθR(τ).","enumerator":"6.15","key":"tMO7yoHMCH"},{"type":"paragraph","position":{"start":{"line":328,"column":1},"end":{"line":329,"column":1}},"children":[{"type":"text","value":"Now how do we maximize this function (the expected total reward) over the parameters?\nOne simple idea would be to directly apply gradient ascent:","position":{"start":{"line":328,"column":1},"end":{"line":328,"column":1}},"key":"eSMj32b2fA"}],"key":"facxYr11oM"},{"type":"math","value":"\\theta^{k+1} = \\theta^k + \\eta \\nabla J(\\theta^k).","position":{"start":{"line":331,"column":1},"end":{"line":333,"column":1}},"html":"θk+1=θk+ηJ(θk).\\theta^{k+1} = \\theta^k + \\eta \\nabla J(\\theta^k).θk+1=θk+ηJ(θk).","enumerator":"6.16","key":"qnScrzHvzt"},{"type":"paragraph","position":{"start":{"line":335,"column":1},"end":{"line":337,"column":1}},"children":[{"type":"text","value":"In order to apply this technique, we need to be able to evaluate the gradient ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"SbgOhqSIwh"},{"type":"inlineMath","value":"\\nabla J(\\theta).","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"J(θ).\\nabla J(\\theta).J(θ).","key":"AAIfJDVxSo"},{"type":"text","value":"\nBut ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"IDrtgUUnZ5"},{"type":"inlineMath","value":"J(\\theta)","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"J(θ)J(\\theta)J(θ)","key":"QlwirdSvuQ"},{"type":"text","value":" is very difficult, or even intractable, to compute exactly, since it involves taking an expectation over all possible trajectories ","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"pMxkSqUJ6R"},{"type":"inlineMath","value":"\\tau.","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"html":"τ.\\tau.τ.","key":"smB9n8Xj6z"},{"type":"text","value":"\nCan we rewrite it in a form that’s more convenient to implement?","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"LRxPUgIjry"}],"key":"kWB8qcPzBO"}],"key":"o6mg3cCtRL"},{"type":"block","position":{"start":{"line":339,"column":1},"end":{"line":339,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":342,"column":1},"end":{"line":342,"column":1}},"children":[{"type":"text","value":"Importance Sampling","position":{"start":{"line":342,"column":1},"end":{"line":342,"column":1}},"key":"vFUGNsMpNY"}],"label":"importance_sampling","identifier":"importance_sampling","html_id":"importance-sampling","enumerator":"6.3.3","key":"DPS8BjFgeK"},{"type":"paragraph","position":{"start":{"line":344,"column":1},"end":{"line":352,"column":1}},"children":[{"type":"text","value":"There is a general trick called 
","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"LYFdxPuFDq"},{"type":"strong","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"importance sampling","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"Lx2zerr4tp"}],"key":"TBbyEiILMW"},{"type":"text","value":" for evaluating such expectations.\nSuppose we want to estimate ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"zV5i0qBMR2"},{"type":"inlineMath","value":"\\E_{x \\sim p}[f(x)]","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"Exp[f(x)]\\E_{x \\sim p}[f(x)]Exp[f(x)]","key":"hiiAScFSXO"},{"type":"text","value":" where ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"NcFaF9hHCE"},{"type":"inlineMath","value":"p","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"ppp","key":"ZWQdo8lRvr"},{"type":"text","value":" is hard or expensive to sample from. We can, however, evaluate the likelihood ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"HoPtb0C7d1"},{"type":"inlineMath","value":"p(x)","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"p(x)p(x)p(x)","key":"Z8UxnMdP9w"},{"type":"text","value":".\nSuppose that we ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"nHGaYn8POs"},{"type":"emphasis","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"can","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"yXeuyZOaus"}],"key":"Fkqfp3LUKc"},{"type":"text","value":" sample from a different distribution ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"TWqmGWm4rp"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"cNtK3obJjG"},{"type":"text","value":".\nSince an expectation is just a weighted average, we can sample ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"ONehSW3Fpf"},{"type":"inlineMath","value":"x","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"xxx","key":"xX3xYlE8R0"},{"type":"text","value":" from ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"S3Idr9O04q"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"UJgSjdSG7o"},{"type":"text","value":", compute ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"XlpsPSibQt"},{"type":"inlineMath","value":"f(x)","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"f(x)f(x)f(x)","key":"ilWyHuiGIL"},{"type":"text","value":", and then reweight the results:\nif ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"Bi3mdJQ9t4"},{"type":"inlineMath","value":"x","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"xxx","key":"qaWjU9yIAw"},{"type":"text","value":" is very likely under ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"UM2SY5t85o"},{"type":"inlineMath","value":"p","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"ppp","key":"HWi456KQ3U"},{"type":"text","value":" but 
unlikely under ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"mxQF9ebdj6"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"IVNcgr1Sdr"},{"type":"text","value":",\nwe should boost its weighting,\nand if it is common under ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"Q95qXrd0qe"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"JgfQVBCa3C"},{"type":"text","value":" but uncommon under ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"UqzGPApyV4"},{"type":"inlineMath","value":"p","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"ppp","key":"n43A772Pfx"},{"type":"text","value":",\nwe should lower its weighting.\nThe reweighting factor is exactly the ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"DodmfQgD5u"},{"type":"strong","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"children":[{"type":"text","value":"likelihood ratio","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"NnntDA1tXN"}],"key":"aagyGj4e5b"},{"type":"text","value":" between the target distribution ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"KkTjuk5TBV"},{"type":"inlineMath","value":"p","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"ppp","key":"HMOsH8rWLi"},{"type":"text","value":" and the sampling distribution ","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"FhAHydqzhd"},{"type":"inlineMath","value":"q","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"html":"qqq","key":"mnoWLa0VWa"},{"type":"text","value":":","position":{"start":{"line":344,"column":1},"end":{"line":344,"column":1}},"key":"LIlqTzDdIC"}],"key":"xlgsrNIDh7"},{"type":"math","value":"\\E_{x \\sim p}[f(x)] = \\sum_{x \\in \\mathcal{X}} f(x) p(x) = \\sum_{x \\in \\mathcal{X}} f(x) \\frac{p(x)}{q(x)} q(x) = \\E_{x \\sim q} \\left[ \\frac{p(x)}{q(x)} f(x) \\right].","position":{"start":{"line":354,"column":1},"end":{"line":356,"column":1}},"html":"Exp[f(x)]=xXf(x)p(x)=xXf(x)p(x)q(x)q(x)=Exq[p(x)q(x)f(x)].\\E_{x \\sim p}[f(x)] = \\sum_{x \\in \\mathcal{X}} f(x) p(x) = \\sum_{x \\in \\mathcal{X}} f(x) \\frac{p(x)}{q(x)} q(x) = \\E_{x \\sim q} \\left[ \\frac{p(x)}{q(x)} f(x) \\right].Exp[f(x)]=xXf(x)p(x)=xXf(x)q(x)p(x)q(x)=Exq[q(x)p(x)f(x)].","enumerator":"6.17","key":"JLkS2ONsNl"},{"type":"paragraph","position":{"start":{"line":358,"column":1},"end":{"line":361,"column":1}},"children":[{"type":"text","value":"Doesn’t this seem too good to be true? If there were no drawbacks, we could use this to estimate ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"eZv4JSLKCF"},{"type":"emphasis","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"zcC9TORNWt"}],"key":"U1TTu6z1Hk"},{"type":"text","value":" expectation of any function on any arbitrary distribution! 
The drawback is that the variance may be very large due to the likelihood ratio term.\nIf there are values of ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"EXTQydXzXq"},{"type":"inlineMath","value":"x","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"html":"xxx","key":"YoprSMTEx3"},{"type":"text","value":" that are very rare in the sampling distribution ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"cAQInUqxB7"},{"type":"inlineMath","value":"q","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"html":"qqq","key":"M31wj3rXWE"},{"type":"text","value":",\nbut common under ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"DPEGV8rGEX"},{"type":"inlineMath","value":"p","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"html":"ppp","key":"IAYRUH1B79"},{"type":"text","value":",\nthen the likelihood ratio ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"XyuSBaEXzw"},{"type":"inlineMath","value":"p(x)/q(x)","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"html":"p(x)/q(x)p(x)/q(x)p(x)/q(x)","key":"x4SkGyeS8G"},{"type":"text","value":" will cause the variance to blow up.","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"vC8rlmvtCn"}],"key":"USIyV4XnGc"},{"type":"heading","depth":2,"position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"children":[{"type":"text","value":"The REINFORCE policy gradient","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"ZAFKIywZkV"}],"identifier":"the-reinforce-policy-gradient","label":"The REINFORCE policy gradient","html_id":"the-reinforce-policy-gradient","implicit":true,"enumerator":"6.4","key":"krzXMX61LJ"},{"type":"paragraph","position":{"start":{"line":365,"column":1},"end":{"line":367,"column":1}},"children":[{"type":"text","value":"Returning to RL, suppose there is some trajectory distribution ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"sqQQgsvGEf"},{"type":"inlineMath","value":"\\rho(\\tau)","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"html":"ρ(τ)\\rho(\\tau)ρ(τ)","key":"aA2c9yq5mv"},{"type":"text","value":" that is ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"DqgCEB2zQL"},{"type":"strong","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"children":[{"type":"text","value":"easy to sample from,","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"oMAXFtFKxQ"}],"key":"jGIKkLLbV1"},{"type":"text","value":" such as a database of existing trajectories.\nWe can then rewrite ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"fEnbAL69yZ"},{"type":"inlineMath","value":"\\nabla J(\\theta)","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"html":"J(θ)\\nabla J(\\theta)J(θ)","key":"fUb0KS4uFU"},{"type":"text","value":", a.k.a. 
the ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"de3chFFjRf"},{"type":"emphasis","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"children":[{"type":"text","value":"policy gradient","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"UHG7CGdkoK"}],"key":"LV2Ozt6RxQ"},{"type":"text","value":", as follows.\nAll gradients are being taken with respect to ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"RGxPisZqWi"},{"type":"text","value":"θ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"gkreXkZxar"},{"type":"text","value":".","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"ZOFQYiuhuK"}],"key":"asSenUe8TF"},{"type":"math","value":"\\begin{aligned}\n \\nabla J(\\theta) & = \\nabla \\E_{\\tau \\sim \\rho_\\theta} [ R(\\tau) ] \\\\\n & = \\nabla \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{likelihood ratio trick} \\\\\n & = \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\nabla \\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{switching gradient and expectation}\n\\end{aligned}","position":{"start":{"line":369,"column":1},"end":{"line":375,"column":1}},"html":"J(θ)=Eτρθ[R(τ)]=Eτρ[ρθ(τ)ρ(τ)R(τ)]likelihood ratio trick=Eτρ[ρθ(τ)ρ(τ)R(τ)]switching gradient and expectation\\begin{aligned}\n \\nabla J(\\theta) & = \\nabla \\E_{\\tau \\sim \\rho_\\theta} [ R(\\tau) ] \\\\\n & = \\nabla \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{likelihood ratio trick} \\\\\n & = \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\nabla \\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{switching gradient and expectation}\n\\end{aligned}J(θ)=Eτρθ[R(τ)]=Eτρ[ρ(τ)ρθ(τ)R(τ)]=Eτρ[ρ(τ)ρθ(τ)R(τ)]likelihood ratio trickswitching gradient and expectation","enumerator":"6.18","key":"wYW6pspzBt"},{"type":"paragraph","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"children":[{"type":"text","value":"Note that for ","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"key":"zJMPEgHjeA"},{"type":"inlineMath","value":"\\rho = \\rho_\\theta","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"html":"ρ=ρθ\\rho = \\rho_\\thetaρ=ρθ","key":"UhKCuI0EDM"},{"type":"text","value":", the inside term becomes","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"key":"Ng8YYvmqBg"}],"key":"Y2Y1y7czls"},{"type":"math","value":"\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} [ \\nabla \\log \\rho_\\theta(\\tau) \\cdot R(\\tau)].","position":{"start":{"line":379,"column":1},"end":{"line":381,"column":1}},"html":"J(θ)=Eτρθ[logρθ(τ)R(τ)].\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} [ \\nabla \\log \\rho_\\theta(\\tau) \\cdot R(\\tau)].J(θ)=Eτρθ[logρθ(τ)R(τ)].","enumerator":"6.19","key":"tYmSZKEUfy"},{"type":"paragraph","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"children":[{"type":"text","value":"(The order of operations is ","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"key":"koAZbbaKNT"},{"type":"inlineMath","value":"\\nabla (\\log \\rho_\\theta)(\\tau)","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"html":"(logρθ)(τ)\\nabla (\\log 
\\rho_\\theta)(\\tau)(logρθ)(τ)","key":"wv5oaSTE1h"},{"type":"text","value":".)","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"key":"lN5XBzBw0p"}],"key":"x1SrkTvZxI"},{"type":"paragraph","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"Note that when the state transitions are Markov (i.e. ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"oKC6xaJEgR"},{"type":"inlineMath","value":"s_{t}","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"html":"sts_{t}st","key":"cxrhy1Nh4j"},{"type":"text","value":" only depends on ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"yJZqdWdl5R"},{"type":"inlineMath","value":"s_{t-1}, a_{t-1}","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"html":"st1,at1s_{t-1}, a_{t-1}st1,at1","key":"AB6hzheZd5"},{"type":"text","value":") and the policy is time-homogeneous (i.e. ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"zrxOtvYvO6"},{"type":"inlineMath","value":"a_\\hi \\sim \\pi_\\theta (s_\\hi)","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"html":"ahπθ(sh)a_\\hi \\sim \\pi_\\theta (s_\\hi)ahπθ(sh)","key":"k170kNB2qd"},{"type":"text","value":"), we can write out the ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"TfGrj3hGc1"},{"type":"emphasis","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"likelihood of a trajectory","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"gFAyENtoww"}],"key":"n9sQCfYXwh"},{"type":"text","value":" under the policy ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"LPflwrneqi"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"TNoV1Mb0dx"},{"type":"text","value":":","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"muIOm97zga"}],"key":"SPRp2JSDu5"},{"type":"math","value":"\\begin{aligned}\n \\rho_\\theta(\\tau) &= \\mu(s_0) \\pi_\\theta(a_0 | s_0) \\\\\n &\\qquad \\times P(s_1 | s_0, a_0) \\pi_\\theta(a_1 | s_1) \\\\\n &\\qquad \\times \\cdots \\\\\n &\\qquad \\times P(s_{H-1} | s_{H-2}, a_{H-2}) \\pi_\\theta(a_{H-1} | s_{H-1}).\n\\end{aligned}","label":"trajectory_likelihood","identifier":"trajectory_likelihood","html":"ρθ(τ)=μ(s0)πθ(a0s0)×P(s1s0,a0)πθ(a1s1)××P(sH1sH2,aH2)πθ(aH1sH1).\\begin{aligned}\n \\rho_\\theta(\\tau) &= \\mu(s_0) \\pi_\\theta(a_0 | s_0) \\\\\n &\\qquad \\times P(s_1 | s_0, a_0) \\pi_\\theta(a_1 | s_1) \\\\\n &\\qquad \\times \\cdots \\\\\n &\\qquad \\times P(s_{H-1} | s_{H-2}, a_{H-2}) \\pi_\\theta(a_{H-1} | s_{H-1}).\n\\end{aligned}ρθ(τ)=μ(s0)πθ(a0s0)×P(s1s0,a0)πθ(a1s1)××P(sH1sH2,aH2)πθ(aH1sH1).","enumerator":"6.20","html_id":"trajectory-likelihood","key":"CRJXuqTkNM"},{"type":"paragraph","position":{"start":{"line":398,"column":1},"end":{"line":400,"column":1}},"children":[{"type":"text","value":"Note that the log-trajectory-likelihood turns into a sum of terms,\nof which only the ","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"rIaT4NMKqw"},{"type":"inlineMath","value":"\\pi_\\theta(a_\\hi | s_\\hi)","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"πθ(ahsh)\\pi_\\theta(a_\\hi | 
s_\\hi)πθ(ahsh)","key":"lAJ4L9kkpa"},{"type":"text","value":" terms depend on ","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"lVuW1wCPpe"},{"type":"inlineMath","value":"\\theta,","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"html":"θ,\\theta,θ,","key":"mj7rbY9I4C"},{"type":"text","value":"\nso we can simplify even further to obtain the following expression for the policy gradient, known as the “REINFORCE” policy gradient:","position":{"start":{"line":398,"column":1},"end":{"line":398,"column":1}},"key":"ROrk5iOhgg"}],"key":"huLmhMCRxT"},{"type":"math","value":"\\begin{aligned}\n \\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) R(\\tau) \\right]\n\\end{aligned}","label":"reinforce_pg","identifier":"reinforce_pg","html":"J(θ)=Eτρθ[t=0T1θlogπθ(ahsh)R(τ)]\\begin{aligned}\n \\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) R(\\tau) \\right]\n\\end{aligned}J(θ)=Eτρθ[t=0T1θlogπθ(ahsh)R(τ)]","enumerator":"6.21","html_id":"reinforce-pg","key":"hdME1CdlzB"},{"type":"paragraph","position":{"start":{"line":410,"column":1},"end":{"line":413,"column":1}},"children":[{"type":"text","value":"This expression allows us to estimate the gradient by sampling a few sample trajectories from ","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"Kj3PHp2nDA"},{"type":"inlineMath","value":"\\pi_\\theta,","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"html":"πθ,\\pi_\\theta,πθ,","key":"ApFrZw5AZd"},{"type":"text","value":"\ncalculating the likelihoods of the chosen actions,\nand substituting these into the expression above.\nWe can then use this gradient estimate to apply stochastic gradient ascent.","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"q7yXhdCY0e"}],"key":"oTaeyuXuqb"},{"type":"code","lang":"python","value":"def estimate_gradient_reinforce_pseudocode(env, π, θ):\n τ = sample_trajectory(env, π(θ))\n gradient_hat = 0\n for s, a, r in τ:\n def policy_log_likelihood(θ):\n return log(π(θ)(s, a))\n gradient_hat += jax.grad(policy_log_likelihood)(θ) * τ.total_reward\n return gradient_hat","position":{"start":{"line":415,"column":1},"end":{"line":424,"column":1}},"key":"cBhIWJ8gRb"},{"type":"paragraph","position":{"start":{"line":426,"column":1},"end":{"line":429,"column":1}},"children":[{"type":"text","value":"In fact, we can perform one more simplification.\nIntuitively, the action taken at step ","position":{"start":{"line":426,"column":1},"end":{"line":426,"column":1}},"key":"sZHGosFMRf"},{"type":"inlineMath","value":"t","position":{"start":{"line":426,"column":1},"end":{"line":426,"column":1}},"html":"ttt","key":"wbUPy338b1"},{"type":"text","value":" does not affect the reward from previous timesteps, since they’re already in the past!\nYou can also show rigorously that this is the case,\nand that we only need to consider the present and future rewards to calculate the policy gradient:","position":{"start":{"line":426,"column":1},"end":{"line":426,"column":1}},"key":"ymk5iStj34"}],"key":"Gpd3i2XfOB"},{"type":"math","value":"\\begin{aligned}\n \\nabla J(\\theta) &= \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) \\sum_{t' = t}^{T-1} r(s_{t'}, a_{t'}) \\right] \\\\\n &= \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} 
\\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) Q^{\\pi_\\theta}(s_{t}, a_{t}) \\right]\n\\end{aligned}","label":"pg_with_q","identifier":"pg_with_q","html":"J(θ)=Eτρθ[t=0T1θlogπθ(ahsh)t=tT1r(st,at)]=Eτρθ[t=0T1θlogπθ(ahsh)Qπθ(st,at)]\\begin{aligned}\n \\nabla J(\\theta) &= \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) \\sum_{t' = t}^{T-1} r(s_{t'}, a_{t'}) \\right] \\\\\n &= \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{t=0}^{T-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) Q^{\\pi_\\theta}(s_{t}, a_{t}) \\right]\n\\end{aligned}J(θ)=Eτρθ[t=0T1θlogπθ(ahsh)t=tT1r(st,at)]=Eτρθ[t=0T1θlogπθ(ahsh)Qπθ(st,at)]","enumerator":"6.22","html_id":"pg-with-q","key":"aIKCIUDBs8"},{"type":"paragraph","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"children":[{"type":"strong","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"key":"VAz6rXGQ2j"}],"key":"MCwIleDDt6"},{"type":"text","value":" Prove that this is equivalent to the previous definitions. What modification to the expression must be made for the discounted, infinite-horizon setting?","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"key":"cjX8CoHaaM"}],"key":"It7XoRKaeW"},{"type":"paragraph","position":{"start":{"line":442,"column":1},"end":{"line":442,"column":1}},"children":[{"type":"text","value":"For some intuition into how this method works, recall that we update our parameters according to","position":{"start":{"line":442,"column":1},"end":{"line":442,"column":1}},"key":"L6gJJAARwy"}],"key":"VD2inwlkQo"},{"type":"math","value":"\\begin{aligned}\n \\theta_{t+1} &= \\theta_\\hi + \\eta \\nabla J(\\theta_\\hi) \\\\\n &= \\theta_\\hi + \\eta \\E_{\\tau \\sim \\rho_{\\theta_\\hi}} [\\nabla \\log \\rho_{\\theta_\\hi}(\\tau) \\cdot R(\\tau)].\n\\end{aligned}","position":{"start":{"line":444,"column":1},"end":{"line":449,"column":1}},"html":"θt+1=θh+ηJ(θh)=θh+ηEτρθh[logρθh(τ)R(τ)].\\begin{aligned}\n \\theta_{t+1} &= \\theta_\\hi + \\eta \\nabla J(\\theta_\\hi) \\\\\n &= \\theta_\\hi + \\eta \\E_{\\tau \\sim \\rho_{\\theta_\\hi}} [\\nabla \\log \\rho_{\\theta_\\hi}(\\tau) \\cdot R(\\tau)].\n\\end{aligned}θt+1=θh+ηJ(θh)=θh+ηEτρθh[logρθh(τ)R(τ)].","enumerator":"6.23","key":"AVM9kBkqFd"},{"type":"paragraph","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"children":[{"type":"text","value":"Consider the “good” trajectories where ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"StXm1ffwZ1"},{"type":"inlineMath","value":"R(\\tau)","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"html":"R(τ)R(\\tau)R(τ)","key":"UTUMnk0dzt"},{"type":"text","value":" is large. Then ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"WcOXQlRr1e"},{"type":"text","value":"θ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"KJAsJ3sCYV"},{"type":"text","value":" gets updated so that these trajectories become more likely. 
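As a sanity check of this intuition, here is a self-contained sketch (a made-up tabular softmax policy and a hypothetical trajectory, not code from the text) showing that a single REINFORCE-style gradient step makes a high-reward trajectory more likely:

```python
import jax
import jax.numpy as jnp

n_states, n_actions = 3, 2

def log_prob_of_trajectory(theta, traj):
    # sum of log pi_theta(a | s) over the trajectory; the transition terms
    # do not depend on theta, so they drop out of the gradient
    return sum(jax.nn.log_softmax(theta[s])[a] for s, a, _ in traj)

def reinforce_gradient(theta, traj):
    total_reward = sum(r for _, _, r in traj)
    return jax.grad(log_prob_of_trajectory)(theta, traj) * total_reward

theta = jnp.zeros((n_states, n_actions))
tau = [(0, 1, 1.0), (2, 0, 0.5), (1, 1, 2.0)]   # hypothetical high-reward trajectory
eta = 0.1

theta_new = theta + eta * reinforce_gradient(theta, tau)
print(log_prob_of_trajectory(theta, tau))        # log-likelihood of tau before the update
print(log_prob_of_trajectory(theta_new, tau))    # ... and after: it has increased
```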
To see why, recall that ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"tN75h2khyF"},{"type":"inlineMath","value":"\\rho_{\\theta}(\\tau)","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"html":"ρθ(τ)\\rho_{\\theta}(\\tau)ρθ(τ)","key":"jktsduoKyV"},{"type":"text","value":" is the likelihood of the trajectory ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"xNKDcuTMQ6"},{"type":"text","value":"τ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"hpbuK7OVQA"},{"type":"text","value":" under the policy ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"eiMbp7WanI"},{"type":"inlineMath","value":"\\pi_\\theta,","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"html":"πθ,\\pi_\\theta,πθ,","key":"ZxG18dZKOS"},{"type":"text","value":" so evaluating the gradient points in the direction that makes ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"xOnzEoNxKt"},{"type":"text","value":"τ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"yBJIxCk90V"},{"type":"text","value":" more likely.","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"louLWmLins"}],"key":"rLS2Cs6rcJ"}],"key":"Qz1BEIADT8"},{"type":"block","position":{"start":{"line":453,"column":1},"end":{"line":453,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":455,"column":1},"end":{"line":455,"column":1}},"children":[{"type":"text","value":"Baselines and advantages","position":{"start":{"line":455,"column":1},"end":{"line":455,"column":1}},"key":"lRZAPNPflZ"}],"identifier":"baselines-and-advantages","label":"Baselines and advantages","html_id":"baselines-and-advantages","implicit":true,"enumerator":"6.5","key":"H7l0UzsPM4"},{"type":"paragraph","position":{"start":{"line":457,"column":1},"end":{"line":460,"column":1}},"children":[{"type":"text","value":"A central idea from supervised learning is the ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"GXcBr2dsXA"},{"type":"strong","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"bias-variance decomposition","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"kkMul6s1xq"}],"key":"qDZf2vqdnk"},{"type":"text","value":",\nwhich shows that the mean squared error of an estimator is the sum of its squared bias and its variance.\nThe REINFORCE gradient estimator ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"hlrn9JnZhG"},{"type":"crossReference","kind":"equation","identifier":"reinforce_pg","label":"reinforce_pg","children":[{"type":"text","value":"(","key":"zTuTcZPECg"},{"type":"text","value":"6.21","key":"iYUbxrKeVR"},{"type":"text","value":")","key":"B1PFbSPAGe"}],"template":"(%s)","enumerator":"6.21","resolved":true,"html_id":"reinforce-pg","key":"v5LQn95HFL"},{"type":"text","value":" is already ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"mC8MmNaU70"},{"type":"emphasis","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"unbiased,","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"FvtCdExLHD"}],"key":"MkvkzlhZrY"},{"type":"text","value":" meaning that its expectation over trajectories is the true policy 
gradient.\nCan we find ways to reduce its ","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"pTTp3QsOsN"},{"type":"emphasis","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"children":[{"type":"text","value":"variance","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"B5bC29hNwK"}],"key":"h4mejISy2z"},{"type":"text","value":" as well?","position":{"start":{"line":457,"column":1},"end":{"line":457,"column":1}},"key":"nTUQ091QGN"}],"key":"hgGNiLsXHq"},{"type":"paragraph","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"children":[{"type":"text","value":"One common way is to subtract a ","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"LFdTjiSD1k"},{"type":"strong","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"children":[{"type":"text","value":"baseline function","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"Ly7DeO9w1S"}],"key":"kdlhUt3rDZ"},{"type":"text","value":" ","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"RhAnPQvvi2"},{"type":"inlineMath","value":"b_\\hi : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"html":"bh:SRb_\\hi : \\mathcal{S} \\to \\mathbb{R}bh:SR","key":"eCqIP9EC13"},{"type":"text","value":" at each timestep ","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"GpZTruEGTM"},{"type":"inlineMath","value":"\\hi.","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"html":"h.\\hi.h.","key":"DeRceaoCRB"},{"type":"text","value":" This modifies the policy gradient as follows:","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"oA3Vb3LAIt"}],"key":"I30QvaByK8"},{"type":"math","value":"\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{\\hi=0}^{H-1} \\nabla \\log \\pi_\\theta (a_\\hi | s_\\hi) \\left(\n \\left(\n \\sum_{\\hi' = \\hi}^{H-1} r_{\\hi'}\n \\right)\n - b_\\hi(s_\\hi)\n \\right)\n \\right].","position":{"start":{"line":464,"column":1},"end":{"line":474,"column":1}},"identifier":"eq:pg_baseline","label":"eq:pg_baseline","html_id":"eq-pg-baseline","html":"J(θ)=Eτρθ[h=0H1logπθ(ahsh)((h=hH1rh)bh(sh))].\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{\\hi=0}^{H-1} \\nabla \\log \\pi_\\theta (a_\\hi | s_\\hi) \\left(\n \\left(\n \\sum_{\\hi' = \\hi}^{H-1} r_{\\hi'}\n \\right)\n - b_\\hi(s_\\hi)\n \\right)\n \\right].J(θ)=Eτρθ[h=0H1logπθ(ahsh)((h=hH1rh)bh(sh))].","enumerator":"6.24","key":"ATiS2yBQ2E"},{"type":"paragraph","position":{"start":{"line":476,"column":1},"end":{"line":476,"column":1}},"children":[{"type":"text","value":"For example, we might want ","position":{"start":{"line":476,"column":1},"end":{"line":476,"column":1}},"key":"BfJ7Z2C9Sl"},{"type":"inlineMath","value":"b_\\hi","position":{"start":{"line":476,"column":1},"end":{"line":476,"column":1}},"html":"bhb_\\hibh","key":"HRbbMMWWfp"},{"type":"text","value":" to estimate the average reward-to-go at a given timestep:","position":{"start":{"line":476,"column":1},"end":{"line":476,"column":1}},"key":"Dsnsqq7RdG"}],"key":"vPfeE5kle0"},{"type":"math","value":"b_\\hi^\\theta = \\E_{\\tau \\sim \\rho_\\theta} R_\\hi(\\tau).","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"html":"bhθ=EτρθRh(τ).b_\\hi^\\theta = \\E_{\\tau \\sim \\rho_\\theta} 
R_\\hi(\\tau).bhθ=EτρθRh(τ).","enumerator":"6.25","key":"dInjfwxlMl"},{"type":"paragraph","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"children":[{"type":"text","value":"This way, the random variable ","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"key":"fOTb1IVYBY"},{"type":"inlineMath","value":"R_\\hi(\\tau) - b_\\hi^\\theta","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"html":"Rh(τ)bhθR_\\hi(\\tau) - b_\\hi^\\thetaRh(τ)bhθ","key":"qQ2IxnugM2"},{"type":"text","value":" is centered around zero, making certain algorithms more stable.","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"key":"HUe0wT1HtG"}],"key":"eyrx4bxRjV"},{"type":"paragraph","position":{"start":{"line":482,"column":1},"end":{"line":488,"column":1}},"children":[{"type":"text","value":"As a better baseline, we could instead choose the ","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"JnnC7cgZG1"},{"type":"emphasis","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"children":[{"type":"text","value":"value function.","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"p1KBm2dbrt"}],"key":"pVIJ9wSSxH"},{"type":"text","value":"\nNote that the random variable ","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"ZMUUi6JlCG"},{"type":"inlineMath","value":"Q^\\pi_\\hi(s, a) - V^\\pi_\\hi(s),","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"html":"Qhπ(s,a)Vhπ(s),Q^\\pi_\\hi(s, a) - V^\\pi_\\hi(s),Qhπ(s,a)Vhπ(s),","key":"bDLquodMam"},{"type":"text","value":"\nwhere the randomness is taken over the actions, is also centered around zero.\n(Recall ","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"pWzl9vlj9U"},{"type":"inlineMath","value":"V^\\pi_\\hi(s) = \\E_{a \\sim \\pi} Q^\\pi_\\hi(s, a).","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"html":"Vhπ(s)=EaπQhπ(s,a).V^\\pi_\\hi(s) = \\E_{a \\sim \\pi} Q^\\pi_\\hi(s, a).Vhπ(s)=EaπQhπ(s,a).","key":"huuW0Hj1fO"},{"type":"text","value":")\nIn fact, this quantity has a particular name: the ","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"ELhalEmgIH"},{"type":"strong","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"children":[{"type":"text","value":"advantage function.","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"r80KKRWUi0"}],"key":"XQNzlghPLX"},{"type":"text","value":"\nThis measures how much better this action does than the average for that policy.\n(Note that for an optimal policy ","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"hox6NHwUYG"},{"type":"inlineMath","value":"\\pi^\\star,","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"html":"π,\\pi^\\star,π,","key":"yt6XZ4n0T2"},{"type":"text","value":" the advantage of a given state-action pair is always zero or negative.)","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"MSor9qXHTl"}],"key":"J0zjJfN3We"},{"type":"paragraph","position":{"start":{"line":490,"column":1},"end":{"line":490,"column":1}},"children":[{"type":"text","value":"We can now express the policy gradient as follows. 
Note that the advantage function effectively replaces the ","position":{"start":{"line":490,"column":1},"end":{"line":490,"column":1}},"key":"IYS4dNlniS"},{"type":"inlineMath","value":"Q","position":{"start":{"line":490,"column":1},"end":{"line":490,"column":1}},"html":"QQQ","key":"tUxBog5fmj"},{"type":"text","value":"-function from ","position":{"start":{"line":490,"column":1},"end":{"line":490,"column":1}},"key":"geRtbo8Ebf"},{"type":"crossReference","kind":"equation","identifier":"pg_with_q","label":"pg_with_q","children":[{"type":"text","value":"(","key":"MQrMCrqZtH"},{"type":"text","value":"6.22","key":"TuvMAq1DtZ"},{"type":"text","value":")","key":"IsbmIQQTVy"}],"template":"(%s)","enumerator":"6.22","resolved":true,"html_id":"pg-with-q","key":"AR7SuHe1zO"},{"type":"text","value":":","position":{"start":{"line":490,"column":1},"end":{"line":490,"column":1}},"key":"ZD2yuXB2Wy"}],"key":"cXno6otOBq"},{"type":"math","value":"\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{t=0}^{T-1} \\nabla \\log \\pi_\\theta(a_\\hi | s_\\hi) A^{\\pi_\\theta}_\\hi (s_\\hi, a_\\hi)\n\\right].","label":"pg_advantage","identifier":"pg_advantage","html":"J(θ)=Eτρθ[t=0T1logπθ(ahsh)Ahπθ(sh,ah)].\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{t=0}^{T-1} \\nabla \\log \\pi_\\theta(a_\\hi | s_\\hi) A^{\\pi_\\theta}_\\hi (s_\\hi, a_\\hi)\n\\right].J(θ)=Eτρθ[t=0T1logπθ(ahsh)Ahπθ(sh,ah)].","enumerator":"6.26","html_id":"pg-advantage","key":"TirDKOSV3m"},{"type":"paragraph","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"children":[{"type":"text","value":"Note that to avoid correlations between the gradient estimator and the value estimator (i.e. baseline), we must estimate them with independently sampled trajectories:","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"vuqvtO3ZNb"}],"key":"V7xnxHwobE"},{"type":"comment","value":" TODO could use more explanation _why_ we want to avoid correlations ","key":"pNe4PKuHZl"},{"type":"proof","kind":"definition","label":"pg_baseline","identifier":"pg_baseline","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policy gradient with a learned baseline","position":{"start":{"line":504,"column":1},"end":{"line":504,"column":1}},"key":"Su37YvvHwC"}],"key":"NlcjAczGnB"},{"type":"code","lang":"python","value":"def pg_with_learned_baseline_pseudocode(env, π, η, θ_init, K, N):\n θ = θ_init\n for k in range(K):\n trajectories = sample_trajectories(env, π(θ), N)\n V_hat = fit(trajectories) # estimates the value function of π(θ)\n τ = sample_trajectories(env, π(θ), 1)\n g = jnp.zeros_like(θ) # gradient estimator\n\n for h, (s, a) in enumerate(τ):\n def log_likelihood(θ_):\n return jnp.log(π(θ_)(s, a))\n g = g + jax.grad(log_likelihood)(θ) * (return_to_go(τ, h) - V_hat(s))\n \n θ = θ + η * g\n return θ","position":{"start":{"line":507,"column":1},"end":{"line":523,"column":1}},"key":"nbXtenzRrL"},{"type":"paragraph","position":{"start":{"line":525,"column":1},"end":{"line":526,"column":1}},"children":[{"type":"text","value":"Note that you could also generalize this by allowing the learning rate ","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"vdiqfDo6di"},{"type":"text","value":"η","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"iCpGd54pFZ"},{"type":"text","value":" to vary across steps,\nor take multiple trajectories 
","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"Hc3DfqxuV0"},{"type":"text","value":"τ","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"cmhZBecm2B"},{"type":"text","value":" and compute the sample average of the gradient estimates.","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"g9ZPwmk0Sd"}],"key":"Vhb4mRmjgp"},{"type":"paragraph","position":{"start":{"line":528,"column":1},"end":{"line":529,"column":1}},"children":[{"type":"text","value":"The baseline estimation step ","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"eRhwxsMPLB"},{"type":"inlineCode","value":"fit","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"VQrxALIrMo"},{"type":"text","value":" can be done using any appropriate supervised learning algorithm.\nNote that the gradient estimator will be unbiased regardless of the baseline.","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"a9SKtnZiZ1"}],"key":"ijNNEY5G2o"}],"enumerator":"6.2","html_id":"pg-baseline","key":"tQPIVc9SLK"}],"key":"ImwNCOXuuP"},{"type":"block","position":{"start":{"line":532,"column":1},"end":{"line":532,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"children":[{"type":"text","value":"Comparing policy gradient algorithms to policy iteration","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"key":"oc7iKUsmXM"}],"identifier":"comparing-policy-gradient-algorithms-to-policy-iteration","label":"Comparing policy gradient algorithms to policy iteration","html_id":"comparing-policy-gradient-algorithms-to-policy-iteration","implicit":true,"enumerator":"6.6","key":"RldhViIA32"},{"type":"comment","value":" TODO maybe restructure this part ","key":"xgx4yZRTS2"},{"type":"paragraph","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"What advantages does the policy gradient algorithm have over ","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"key":"DrW7zwNghh"},{"type":"crossReference","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"Section ","key":"KwEhtgLson"},{"type":"text","value":"1.5.3.2","key":"DN66xf9Imh"}],"identifier":"policy_iteration","label":"policy_iteration","kind":"heading","template":"Section %s","enumerator":"1.5.3.2","resolved":true,"html_id":"policy-iteration","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"VtdoGvulSz"},{"type":"text","value":"?","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"key":"D7EVRvZGKc"}],"key":"MYnD8DNYjL"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policy iteration recap","position":{"start":{"line":540,"column":1},"end":{"line":540,"column":1}},"key":"Uokp1uSCrZ"}],"key":"qMfY1MAnwK"},{"type":"paragraph","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"children":[{"type":"text","value":"Recall that policy iteration is an algorithm for MDPs with unknown state transitions where we alternate between these two 
steps:","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"Bx5iKtWkxF"}],"key":"oxgGyQqdoH"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":543,"column":1},"end":{"line":544,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"children":[{"type":"text","value":"Estimating the ","position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"key":"FKBBao3scK"},{"type":"inlineMath","value":"Q","position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"html":"QQQ","key":"YXt4sdRXRL"},{"type":"text","value":"-function (or advantage function) of the current policy;","position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"key":"PxfRBjY1ec"}],"key":"f82Aptel1H"},{"type":"listItem","spread":true,"position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"children":[{"type":"text","value":"Updating the policy to be greedy w.r.t. this approximate ","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"key":"Y3yyIx8muP"},{"type":"inlineMath","value":"Q","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"html":"QQQ","key":"KQSONc69RT"},{"type":"text","value":"-function (or advantage function).","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"key":"Ci8WA6YdiS"}],"key":"ApRMKRwDHP"}],"key":"RzuxUo1Wpq"}],"key":"QFAPR5tXs9"},{"type":"paragraph","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"children":[{"type":"text","value":"To analyze the difference between them, we’ll make use of the ","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"W6S7n8rCQs"},{"type":"strong","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"children":[{"type":"text","value":"performance difference lemma","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"pYHSHM4Ae6"}],"key":"eiNL4hu8cc"},{"type":"text","value":", which provides an expression for comparing the difference between two value functions.","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"hpiTmGJTCC"}],"key":"uN4cOfUF5l"},{"type":"proof","kind":"theorem","label":"pdl","identifier":"pdl","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Performance difference lemma","position":{"start":{"line":549,"column":1},"end":{"line":549,"column":1}},"key":"zzGiHyHcBl"}],"key":"uQAnw3tv3Q"},{"type":"paragraph","position":{"start":{"line":552,"column":1},"end":{"line":555,"column":1}},"children":[{"type":"text","value":"Suppose Alice is playing a game (an MDP).\nBob is spectating, and can evaluate how good an action is compared to his own strategy.\n(That is, Bob can compute his ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"ol7ueldzvN"},{"type":"emphasis","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"children":[{"type":"text","value":"advantage function","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"B9yZFZ44xb"}],"key":"vAjbWOPtk1"},{"type":"text","value":" ","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"HyIwpEVV7N"},{"type":"inlineMath","value":"A_\\hi^{\\text{Bob}}(s_\\hi, 
a_\\hi)","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"html":"AhBob(sh,ah)A_\\hi^{\\text{Bob}}(s_\\hi, a_\\hi)AhBob(sh,ah)","key":"WNxihSOPp1"},{"type":"text","value":").\nThe performance difference lemma says that Bob can now calculate exactly how much better or worse he is than Alice as follows:","position":{"start":{"line":552,"column":1},"end":{"line":552,"column":1}},"key":"jz1PcMdYGS"}],"key":"eMCYhxPYz7"},{"type":"math","value":"V_0^{\\text{Alice}}(s) - V_0^{\\text{Bob}}(s) = \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{h=0}^{H-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right]","label":"pdl_eq","identifier":"pdl_eq","html":"V0Alice(s)V0Bob(s)=EτρAlice,s[h=0H1AhBob(sh,ah)]V_0^{\\text{Alice}}(s) - V_0^{\\text{Bob}}(s) = \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{h=0}^{H-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right]V0Alice(s)V0Bob(s)=EτρAlice,s[h=0H1AhBob(sh,ah)]","enumerator":"6.27","html_id":"pdl-eq","key":"ZHH61ugDeX"},{"type":"paragraph","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"key":"uAAQ5qgGXU"},{"type":"inlineMath","value":"\\rho_{\\text{Alice}, s}","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"html":"ρAlice,s\\rho_{\\text{Alice}, s}ρAlice,s","key":"heYmK9TRvG"},{"type":"text","value":" denotes the distribution over trajectories starting in state ","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"key":"xPwzpBf1Ww"},{"type":"inlineMath","value":"s","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"html":"sss","key":"isFmIZSFAK"},{"type":"text","value":" when Alice is playing.","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"key":"RCrv0altLe"}],"key":"l8LbP6PfBP"},{"type":"paragraph","position":{"start":{"line":564,"column":1},"end":{"line":566,"column":1}},"children":[{"type":"text","value":"To see why, consider just a single step ","position":{"start":{"line":564,"column":1},"end":{"line":564,"column":1}},"key":"KYAdNz7jBZ"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":564,"column":1},"end":{"line":564,"column":1}},"html":"h\\hih","key":"UKJ8rCDlQl"},{"type":"text","value":" of the trajectory.\nAt this step we compute how much better actions from Bob are than the actions from Alice, on average.\nBut this is exactly the average Bob-advantage across actions from Alice, as described in the PDL!","position":{"start":{"line":564,"column":1},"end":{"line":564,"column":1}},"key":"xy8JZyuVSM"}],"key":"eYypnRDNgG"},{"type":"paragraph","position":{"start":{"line":568,"column":1},"end":{"line":568,"column":1}},"children":[{"type":"text","value":"Formally, this corresponds to a nice telescoping simplification when we expand out the definition of the advantage function. 
Note that","position":{"start":{"line":568,"column":1},"end":{"line":568,"column":1}},"key":"ErsdfAQC46"}],"key":"amzfhmD5UG"},{"type":"math","value":"\\begin{aligned}\nA^\\pi_\\hi(s_\\hi, a_\\hi) &= Q^\\pi_\\hi(s_\\hi, a_\\hi) - V^\\pi_\\hi(s_\\hi) \\\\\n&= r_\\hi(s_\\hi, a_\\hi) + \\E_{s_{\\hi+1} \\sim P(s_\\hi, a_\\hi)} [V^\\pi_{\\hi+1}(s_{\\hi+1})] - V^\\pi_\\hi(s_\\hi)\n\\end{aligned}","position":{"start":{"line":570,"column":1},"end":{"line":575,"column":1}},"html":"Ahπ(sh,ah)=Qhπ(sh,ah)Vhπ(sh)=rh(sh,ah)+Esh+1P(sh,ah)[Vh+1π(sh+1)]Vhπ(sh)\\begin{aligned}\nA^\\pi_\\hi(s_\\hi, a_\\hi) &= Q^\\pi_\\hi(s_\\hi, a_\\hi) - V^\\pi_\\hi(s_\\hi) \\\\\n&= r_\\hi(s_\\hi, a_\\hi) + \\E_{s_{\\hi+1} \\sim P(s_\\hi, a_\\hi)} [V^\\pi_{\\hi+1}(s_{\\hi+1})] - V^\\pi_\\hi(s_\\hi)\n\\end{aligned}Ahπ(sh,ah)=Qhπ(sh,ah)Vhπ(sh)=rh(sh,ah)+Esh+1P(sh,ah)[Vh+1π(sh+1)]Vhπ(sh)","enumerator":"6.28","key":"jcYk8byagl"},{"type":"paragraph","position":{"start":{"line":577,"column":1},"end":{"line":577,"column":1}},"children":[{"type":"text","value":"so expanding out the r.h.s. expression of ","position":{"start":{"line":577,"column":1},"end":{"line":577,"column":1}},"key":"cyhLuvrUpw"},{"type":"crossReference","kind":"equation","identifier":"pdl_eq","label":"pdl_eq","children":[{"type":"text","value":"(","key":"hHZsWYD8vN"},{"type":"text","value":"6.27","key":"jitlvLSzeS"},{"type":"text","value":")","key":"ebJsiDu2TA"}],"template":"(%s)","enumerator":"6.27","resolved":true,"html_id":"pdl-eq","key":"lelGTRB2Zh"},{"type":"text","value":" and grouping terms together gives","position":{"start":{"line":577,"column":1},"end":{"line":577,"column":1}},"key":"FSeqgBEYuN"}],"key":"OMtDiyAuKE"},{"type":"math","value":"\\begin{aligned}\n\\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right] &= \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} r_\\hi(s_\\hi, a_\\hi) \\right) + \\left( V^{\\text{Bob}}_1(s_1) + \\cdots + V^{\\text{Bob}}_\\hor(s_\\hor) \\right) - \\left( V^{\\text{Bob}_0}(s_0) + \\cdots + V^{\\text{Bob}}_{\\hor-1}(s_{\\hor-1}) \\right) \\right] \\\\\n&= V^{\\text{Alice}}_0(s) - V^{\\text{Bob}}_0(s)\n\\end{aligned}","position":{"start":{"line":579,"column":1},"end":{"line":584,"column":1}},"html":"EτρAlice,s[h=0H1AhBob(sh,ah)]=EτρAlice,s[(h=0H1rh(sh,ah))+(V1Bob(s1)++VHBob(sH))(VBob0(s0)++VH1Bob(sH1))]=V0Alice(s)V0Bob(s)\\begin{aligned}\n\\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right] &= \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} r_\\hi(s_\\hi, a_\\hi) \\right) + \\left( V^{\\text{Bob}}_1(s_1) + \\cdots + V^{\\text{Bob}}_\\hor(s_\\hor) \\right) - \\left( V^{\\text{Bob}_0}(s_0) + \\cdots + V^{\\text{Bob}}_{\\hor-1}(s_{\\hor-1}) \\right) \\right] \\\\\n&= V^{\\text{Alice}}_0(s) - V^{\\text{Bob}}_0(s)\n\\end{aligned}EτρAlice,s[h=0H1AhBob(sh,ah)]=EτρAlice,s[(h=0H1rh(sh,ah))+(V1Bob(s1)++VHBob(sH))(VBob0(s0)++VH1Bob(sH1))]=V0Alice(s)V0Bob(s)","enumerator":"6.29","key":"qWoWvnH3vF"},{"type":"paragraph","position":{"start":{"line":586,"column":1},"end":{"line":586,"column":1}},"children":[{"type":"text","value":"as desired. 
(Note that the “inner” expectation from expanding the advantage function has the same distribution as the outer one, so omitting it here is valid.)","position":{"start":{"line":586,"column":1},"end":{"line":586,"column":1}},"key":"U7PlbJgWID"}],"key":"bR81iWo9L4"}],"enumerator":"6.1","html_id":"pdl","key":"RzCpZLp1ZN"},{"type":"paragraph","position":{"start":{"line":589,"column":1},"end":{"line":594,"column":1}},"children":[{"type":"text","value":"The PDL gives insight into why fitted approaches such as PI don’t work as well in the “full” RL setting.\nTo see why, let’s consider a single iteration of policy iteration, where policy ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"uc1ee3DdgR"},{"type":"text","value":"π","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"dJJOB9gzwG"},{"type":"text","value":" gets updated to ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"EQtjutZjKV"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"π~\\tilde \\piπ~","key":"E1N30YpBk4"},{"type":"text","value":". We’ll assume these policies are deterministic.\nSuppose the new policy ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"C6lk9B0srg"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"π~\\tilde \\piπ~","key":"l8ogIsf2aG"},{"type":"text","value":" chooses some action with a negative advantage with respect to ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"pg31rc1F1R"},{"type":"text","value":"π","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"Unr5BrwyAJ"},{"type":"text","value":".\nThat is, when acting according to ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"jPqKObSNHE"},{"type":"text","value":"π","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"uMZjBadMzH"},{"type":"text","value":", taking the action from ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"eTepuPFhMT"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"π~\\tilde \\piπ~","key":"lLwuLewkl0"},{"type":"text","value":" would perform worse than expected.\nDefine ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"KVcOXCA3Df"},{"type":"inlineMath","value":"\\Delta_\\infty","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"Δ\\Delta_\\inftyΔ","key":"ooZzZweoqT"},{"type":"text","value":" to be the most negative advantage, that is, ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"Pj8wG5x42D"},{"type":"inlineMath","value":"\\Delta_\\infty = \\min_{s \\in \\mathcal{S}} A^{\\pi}_\\hi(s, \\tilde \\pi(s))","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"html":"Δ=minsSAhπ(s,π~(s))\\Delta_\\infty = \\min_{s \\in \\mathcal{S}} A^{\\pi}_\\hi(s, \\tilde \\pi(s))Δ=minsSAhπ(s,π~(s))","key":"g3QKEmusT4"},{"type":"text","value":".\nPlugging this into the ","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"Q6vEwS4bUm"},{"type":"crossReference","kind":"proof:theorem","identifier":"pdl","label":"pdl","children":[{"type":"text","value":"Theorem 
","key":"URP3tiGd0A"},{"type":"text","value":"6.1","key":"p8if8hE5nC"}],"template":"Theorem %s","enumerator":"6.1","resolved":true,"html_id":"pdl","key":"O7KqGF5qPe"},{"type":"text","value":" gives","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"wLnlK3N4Ui"}],"key":"H712131XjZ"},{"type":"math","value":"\\begin{aligned}\nV_0^{\\tilde \\pi}(s) - V_0^{\\pi}(s) &= \\E_{\\tau \\sim \\rho_{\\tilde \\pi, s}} \\left[\n\\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\pi}(s_\\hi, a_\\hi)\n\\right] \\\\\n&\\ge H \\Delta_\\infty \\\\\nV_0^{\\tilde \\pi}(s) &\\ge V_0^{\\pi}(s) - H|\\Delta_\\infty|.\n\\end{aligned}","position":{"start":{"line":596,"column":1},"end":{"line":604,"column":1}},"html":"V0π~(s)V0π(s)=Eτρπ~,s[h=0H1Ahπ(sh,ah)]HΔV0π~(s)V0π(s)HΔ.\\begin{aligned}\nV_0^{\\tilde \\pi}(s) - V_0^{\\pi}(s) &= \\E_{\\tau \\sim \\rho_{\\tilde \\pi, s}} \\left[\n\\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\pi}(s_\\hi, a_\\hi)\n\\right] \\\\\n&\\ge H \\Delta_\\infty \\\\\nV_0^{\\tilde \\pi}(s) &\\ge V_0^{\\pi}(s) - H|\\Delta_\\infty|.\n\\end{aligned}V0π~(s)V0π(s)V0π~(s)=Eτρπ~,s[h=0H1Ahπ(sh,ah)]HΔV0π(s)HΔ∣.","enumerator":"6.30","key":"ax9bED4yWt"},{"type":"paragraph","position":{"start":{"line":606,"column":1},"end":{"line":612,"column":1}},"children":[{"type":"text","value":"That is, for some state ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"KmbCnQ8id4"},{"type":"inlineMath","value":"s","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"sss","key":"aJI3IgmS46"},{"type":"text","value":", the lower bound on the performance of ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"wkRImZDac7"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"π~\\tilde \\piπ~","key":"PYVvNTjizF"},{"type":"text","value":" is ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"Ylip1Xomjd"},{"type":"emphasis","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"children":[{"type":"text","value":"lower","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"pm9lJM5V4R"}],"key":"V8mwIGP3Fv"},{"type":"text","value":" than the performance of ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"VYzcRK9WW3"},{"type":"text","value":"π","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"tq9kR4Bd8X"},{"type":"text","value":".\nThis doesn’t state that ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"wxaQNmzOrY"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"π~\\tilde \\piπ~","key":"hIBRha22Wr"},{"type":"text","value":" ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"HhogVgf4nV"},{"type":"emphasis","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"children":[{"type":"text","value":"will","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"bUKTqR9W7I"}],"key":"j4edBY0K5e"},{"type":"text","value":" necessarily perform worse than ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"ZgLsWI8ECy"},{"type":"text","value":"π","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"fiU5Qzf4vt"},{"type":"text","value":",\nonly suggests that it might be 
possible.\nIf these worst case states do exist, though,\nPI does not avoid situations where the new policy often visits them;\nIt does not enforce that the trajectory distributions ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"zp0fkB4qgS"},{"type":"inlineMath","value":"\\rho_\\pi","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"ρπ\\rho_\\piρπ","key":"PejLZ8hp5x"},{"type":"text","value":" and ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"wpGIGIrzH4"},{"type":"inlineMath","value":"\\rho_{\\tilde \\pi}","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"ρπ~\\rho_{\\tilde \\pi}ρπ~","key":"UqIn1tHM7Z"},{"type":"text","value":" be close to each other.\nIn other words, the “training distribution” that our prediction rule is fitted on, ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"evVfJ1oIBf"},{"type":"inlineMath","value":"\\rho_\\pi","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"ρπ\\rho_\\piρπ","key":"jCp0a8uYNm"},{"type":"text","value":", may differ significantly from the “evaluation distribution” ","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"Ik7QKXPnS7"},{"type":"inlineMath","value":"\\rho_{\\tilde \\pi}","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"html":"ρπ~\\rho_{\\tilde \\pi}ρπ~","key":"cE7AUWjaVk"},{"type":"text","value":".","position":{"start":{"line":606,"column":1},"end":{"line":606,"column":1}},"key":"HLRAVTmCJt"}],"key":"tav22ztYBK"},{"type":"comment","value":" \nThis is an instance of *distributional shift*.\nTo begin, let's ask, where *do* fitted approaches work well?\nThey are commonly seen in SL,\nwhere a prediction rule is fit using some labelled training set,\nand then assessed on a test set from the same distribution.\nBut policy iteration isn't performed in the same scenario:\nthere is now _distributional shift_ between the different iterations of the policy. 
","key":"CixzaWxKf3"},{"type":"paragraph","position":{"start":{"line":623,"column":1},"end":{"line":629,"column":1}},"children":[{"type":"text","value":"On the other hand, policy gradient methods ","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"fnTqxPiLnM"},{"type":"emphasis","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"do","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"zrR3CshO6V"}],"key":"bvX2F8KsVn"},{"type":"text","value":", albeit implicitly,\nencourage ","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"qJBUlZ96aX"},{"type":"inlineMath","value":"\\rho_\\pi","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"html":"ρπ\\rho_\\piρπ","key":"U6YTBnPkPQ"},{"type":"text","value":" and ","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"dOZTqQ9vpK"},{"type":"inlineMath","value":"\\rho_{\\tilde \\pi}","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"html":"ρπ~\\rho_{\\tilde \\pi}ρπ~","key":"SNjUrOAsfq"},{"type":"text","value":" to be similar.\nSuppose that the mapping from policy parameters to trajectory distributions is relatively smooth.\nThen, by adjusting the parameters only a small distance,\nthe new policy will also have a similar trajectory distribution.\nBut this is not very rigorous, and in practice the parameter-to-distribution mapping may not be so smooth.\nCan we constrain the distance between the resulting distributions more ","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"zY9tyL4ODL"},{"type":"emphasis","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"children":[{"type":"text","value":"explicitly","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"AErO3rwL2l"}],"key":"PGbOYB6Ip2"},{"type":"text","value":"?","position":{"start":{"line":623,"column":1},"end":{"line":623,"column":1}},"key":"C1osq0gQMn"}],"key":"mBIRpjGO0m"},{"type":"paragraph","position":{"start":{"line":631,"column":1},"end":{"line":631,"column":1}},"children":[{"type":"text","value":"This brings us to the next three methods:","position":{"start":{"line":631,"column":1},"end":{"line":631,"column":1}},"key":"HZZhAFJeeS"}],"key":"r3ffsYBwrj"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":632,"column":1},"end":{"line":635,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":632,"column":1},"end":{"line":632,"column":1}},"children":[{"type":"strong","position":{"start":{"line":632,"column":1},"end":{"line":632,"column":1}},"children":[{"type":"text","value":"trust region policy optimization","position":{"start":{"line":632,"column":1},"end":{"line":632,"column":1}},"key":"wugO0ZBV4O"}],"key":"YQhEUB7Oka"},{"type":"text","value":" (TRPO), which explicitly constrains the difference between the distributions before and after each step;","position":{"start":{"line":632,"column":1},"end":{"line":632,"column":1}},"key":"DdPY1YM6sP"}],"key":"u5vlSAcVQt"},{"type":"listItem","spread":true,"position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"children":[{"type":"text","value":"the 
","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"zWkQFHhkhL"},{"type":"strong","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"children":[{"type":"text","value":"natural policy gradient","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"Y6vGnEM4Yj"}],"key":"M38v39IgVn"},{"type":"text","value":" (NPG), a first-order approximation of TRPO;","position":{"start":{"line":633,"column":1},"end":{"line":633,"column":1}},"key":"tGlT63pfTR"}],"key":"d1KGGYIojB"},{"type":"listItem","spread":true,"position":{"start":{"line":634,"column":1},"end":{"line":635,"column":1}},"children":[{"type":"strong","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"children":[{"type":"text","value":"proximal policy optimization","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"key":"Gpb48Iw6zC"}],"key":"rzkfuV0NIT"},{"type":"text","value":" (PPO), a “soft relaxation” of TRPO.","position":{"start":{"line":634,"column":1},"end":{"line":634,"column":1}},"key":"yyzd7SO9US"}],"key":"qkgzbLZtUK"}],"key":"sY9XfIfK9Y"}],"key":"PRixxeKBxg"},{"type":"block","position":{"start":{"line":636,"column":1},"end":{"line":636,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":638,"column":1},"end":{"line":638,"column":1}},"children":[{"type":"text","value":"Trust region policy optimization","position":{"start":{"line":638,"column":1},"end":{"line":638,"column":1}},"key":"IF5SJxfXbg"}],"identifier":"trust-region-policy-optimization","label":"Trust region policy optimization","html_id":"trust-region-policy-optimization","implicit":true,"enumerator":"6.7","key":"lQpkGyUFTc"},{"type":"paragraph","position":{"start":{"line":640,"column":1},"end":{"line":644,"column":1}},"children":[{"type":"text","value":"We saw above that policy gradient methods are effective because they implicitly constrain how much the policy changes at each iteration.\nCan we design an algorithm that ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"P9rGaRFdY4"},{"type":"emphasis","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"children":[{"type":"text","value":"explicitly","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"kkI1Fy0bmX"}],"key":"iNyStpn2ee"},{"type":"text","value":" constrains the “step size”?\nThat is, we want to ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"AaYsjhIkGt"},{"type":"emphasis","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"children":[{"type":"text","value":"improve","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"w3YTJvEK64"}],"key":"cnKJv4cMzQ"},{"type":"text","value":" the policy as much as possible,\nmeasured in terms of the r.h.s. 
of the ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"grw9hdP2Ru"},{"type":"crossReference","kind":"proof:theorem","identifier":"pdl","label":"pdl","children":[{"type":"text","value":"Theorem ","key":"T7ULg7crsJ"},{"type":"text","value":"6.1","key":"rhQpzVxMBj"}],"template":"Theorem %s","enumerator":"6.1","resolved":true,"html_id":"pdl","key":"udVzhoV8a8"},{"type":"text","value":",\nwhile ensuring that its trajectory distribution does not change too much:","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"yQBEG8eD5a"}],"key":"XI3yhmdfYo"},{"type":"math","value":"\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta^{\\text{opt}}} \\E_{s_0, \\dots, s_{H-1} \\sim \\pi^{k}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi^{\\theta^\\text{opt}}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] \\\\\n& \\text{where } \\text{distance}(\\rho_{\\theta^{\\text{opt}}}, \\rho_{\\theta^k}) < \\delta\n\\end{aligned}","position":{"start":{"line":646,"column":1},"end":{"line":651,"column":1}},"html":"θk+1argmaxθoptEs0,,sH1πk[h=0H1Eahπθopt(sh)Aπk(sh,ah)]where distance(ρθopt,ρθk)<δ\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta^{\\text{opt}}} \\E_{s_0, \\dots, s_{H-1} \\sim \\pi^{k}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi^{\\theta^\\text{opt}}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] \\\\\n& \\text{where } \\text{distance}(\\rho_{\\theta^{\\text{opt}}}, \\rho_{\\theta^k}) < \\delta\n\\end{aligned}θk+1argθoptmaxEs0,,sH1πk[h=0H1Eahπθopt(sh)Aπk(sh,ah)]where distance(ρθopt,ρθk)<δ","enumerator":"6.31","key":"DGziRgUinX"},{"type":"paragraph","position":{"start":{"line":653,"column":1},"end":{"line":659,"column":1}},"children":[{"type":"text","value":"Note that we have made a small change to the r.h.s. 
expression:\nwe use the ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"gFfbY30211"},{"type":"emphasis","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"children":[{"type":"text","value":"states","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"V675nMIPty"}],"key":"m4HWGjVfyI"},{"type":"text","value":" sampled from the old policy, and only use the ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"HHjFVFImbl"},{"type":"emphasis","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"children":[{"type":"text","value":"actions","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"R1ewsPv8yH"}],"key":"PtUuCEAP3L"},{"type":"text","value":" from the new policy.\nIt would be computationally infeasible to sample entire trajectories from ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"Zozj2zdHTC"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"npFxJd3PId"},{"type":"text","value":" as we are optimizing over ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"W3fyre3GyC"},{"type":"text","value":"θ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"G51LkR0PuS"},{"type":"text","value":".\nOn the other hand, if ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"EzwsCrNdEF"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"Vtsyod7g7u"},{"type":"text","value":" returns a vector representing a probability distribution over actions,\nthen evaluating the expected advantage with respect to this distribution only requires taking a dot product.\nThis approximation also matches the r.h.s. 
of the PDL to first order in ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"BMmytwKwvc"},{"type":"text","value":"θ","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"dTHi2pJm1L"},{"type":"text","value":".\n(We will elaborate more on this later.)","position":{"start":{"line":653,"column":1},"end":{"line":653,"column":1}},"key":"shiAu0O2P6"}],"key":"zwMQUoHayI"},{"type":"paragraph","position":{"start":{"line":661,"column":1},"end":{"line":662,"column":1}},"children":[{"type":"text","value":"How do we describe the distance between ","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"key":"QSeAGToGKj"},{"type":"inlineMath","value":"\\rho_{\\theta^{\\text{opt}}}","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"html":"ρθopt\\rho_{\\theta^{\\text{opt}}}ρθopt","key":"E43DZyyjPx"},{"type":"text","value":" and ","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"key":"Aai7oyIcDk"},{"type":"inlineMath","value":"\\rho_{\\theta^k}","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"html":"ρθk\\rho_{\\theta^k}ρθk","key":"OznNtNzi2d"},{"type":"text","value":"?\nWe’ll use the ","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"key":"sHXmP6snck"},{"type":"strong","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"children":[{"type":"text","value":"Kullback-Leibler divergence (KLD)","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"key":"CnpAqNyaSb"}],"key":"MBIplEGuKG"},{"type":"text","value":":","position":{"start":{"line":661,"column":1},"end":{"line":661,"column":1}},"key":"ugE0CQEl9j"}],"key":"JqQVXPl2x6"},{"type":"proof","kind":"definition","label":"kld","identifier":"kld","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Kullback-Leibler divergence","position":{"start":{"line":664,"column":1},"end":{"line":664,"column":1}},"key":"GKy9HoDnw1"}],"key":"pJRZjA5O09"},{"type":"paragraph","position":{"start":{"line":667,"column":1},"end":{"line":667,"column":1}},"children":[{"type":"text","value":"For two PDFs ","position":{"start":{"line":667,"column":1},"end":{"line":667,"column":1}},"key":"SQjAj8fXyf"},{"type":"inlineMath","value":"p, q","position":{"start":{"line":667,"column":1},"end":{"line":667,"column":1}},"html":"p,qp, qp,q","key":"RJnThomTz6"},{"type":"text","value":",","position":{"start":{"line":667,"column":1},"end":{"line":667,"column":1}},"key":"mR1LP1uPle"}],"key":"auXZOaqhcn"},{"type":"math","value":"\\kl{p}{q} := \\E_{x \\sim p} \\left[ \\log \\frac{p(x)}{q(x)} \\right]","position":{"start":{"line":669,"column":1},"end":{"line":669,"column":1}},"html":"KL(pq):=Exp[logp(x)q(x)]\\kl{p}{q} := \\E_{x \\sim p} \\left[ \\log \\frac{p(x)}{q(x)} \\right]KL(pq):=Exp[logq(x)p(x)]","enumerator":"6.32","key":"F6URrukDPp"},{"type":"paragraph","position":{"start":{"line":671,"column":1},"end":{"line":674,"column":1}},"children":[{"type":"text","value":"This can be interpreted in many different ways, many stemming from information theory.\nOne such interpretation is that ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"mmQaxTPwQY"},{"type":"inlineMath","value":"\\kl{p}{q}","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"KL(pq)\\kl{p}{q}KL(pq)","key":"CTscU7UMMf"},{"type":"text","value":" describes my average “surprise” if I 
","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"rQqScUhPzy"},{"type":"emphasis","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"children":[{"type":"text","value":"think","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"MKn0P4auw1"}],"key":"CQp2kTnmA6"},{"type":"text","value":" data is being generated by ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"oOzU3CnqiQ"},{"type":"inlineMath","value":"q","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"qqq","key":"iB6yv7pmzG"},{"type":"text","value":" but it’s actually generated by ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"aIVvzrJQ0J"},{"type":"inlineMath","value":"p","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"ppp","key":"UJSDFsEYZ2"},{"type":"text","value":".\n(The ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"fHhLyDGr1a"},{"type":"strong","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"children":[{"type":"text","value":"surprise","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"yzpMHvMuwz"}],"key":"wt9vAzMUyJ"},{"type":"text","value":" of an event with probability ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"Fyvb88nyxJ"},{"type":"inlineMath","value":"p","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"ppp","key":"z3RJwmxQw4"},{"type":"text","value":" is ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"VCif6va18A"},{"type":"inlineMath","value":"- \\log_2 p","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"log2p- \\log_2 plog2p","key":"Plv9wiI7zM"},{"type":"text","value":".)\nNote that ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"srRED7wo0j"},{"type":"inlineMath","value":"\\kl{p}{q} = 0","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"KL(pq)=0\\kl{p}{q} = 0KL(pq)=0","key":"UZRQr9HJxq"},{"type":"text","value":" if and only if ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"XX0jHZaS2y"},{"type":"inlineMath","value":"p = q","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"html":"p=qp = qp=q","key":"z1K6BVLmEQ"},{"type":"text","value":". 
Also note that it is generally ","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"Cml9CISKYr"},{"type":"emphasis","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"NEkrTce9rT"}],"key":"m3VOLGbVZp"},{"type":"text","value":" symmetric.","position":{"start":{"line":671,"column":1},"end":{"line":671,"column":1}},"key":"Wx1fQd4uVD"}],"key":"AKE6DKLCb5"}],"enumerator":"6.3","html_id":"kld","key":"B3I8CaICBQ"},{"type":"paragraph","position":{"start":{"line":677,"column":1},"end":{"line":680,"column":1}},"children":[{"type":"text","value":"Both the objective function and the KLD constraint involve a weighted average over the space of all trajectories.\nThis is intractable in general, so we need to estimate the expectation.\nAs before, we can do this by taking an empirical average over samples from the trajectory distribution.\nThis gives us the following pseudocode:","position":{"start":{"line":677,"column":1},"end":{"line":677,"column":1}},"key":"jAFUzXgq5J"}],"key":"E5cuqN4qAS"},{"type":"proof","kind":"definition","label":"trpo","identifier":"trpo","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Trust region policy optimization (exact)","position":{"start":{"line":682,"column":1},"end":{"line":682,"column":1}},"key":"DtpAioV18Y"}],"key":"aoNCHhhI4f"},{"type":"code","lang":"python","value":"def trpo_pseudocode(env, δ, θ_init, K, M):\n    θ = θ_init\n    for k in range(K):\n        trajectories = sample_trajectories(env, π(θ), M)\n        A_hat = fit(trajectories)\n\n        def approximate_gain(θ_):\n            # expected advantage of the candidate policy π(θ_) on states visited by π(θ)\n            total_advantage = 0\n            for τ in trajectories:\n                for s, _a, _r in τ:\n                    for a in env.action_space:\n                        total_advantage += π(θ_)(s, a) * A_hat(s, a)\n            return total_advantage\n\n        def constraint(θ_):\n            # empirical estimate of the KL divergence between the trajectory distributions\n            kl_div = 0\n            for τ in trajectories:\n                for s, a, _r in τ:\n                    kl_div += jnp.log(π(θ)(s, a)) - jnp.log(π(θ_)(s, a))\n            return kl_div / M <= δ\n\n        θ = optimize(approximate_gain, constraint)\n\n    return θ","position":{"start":{"line":686,"column":1},"end":{"line":711,"column":1}},"key":"RX5aW5o1A4"}],"enumerator":"6.4","html_id":"trpo","key":"qTEyevmr6D"},{"type":"comment","value":"\nApplying importance sampling allows us to estimate the TRPO objective as follows:\n\n::::{prf:definition} Trust region policy optimization (implementation)\n:label: trpo_implement\n\n:::{prf:definitionic} TODO\nInitialize $\\theta^0$\n\nSample $N$ trajectories from $\\rho^k$ to learn a value estimator $\\tilde b_\\hi(s) \\approx V^{\\pi^k}_\\hi(s)$\n\nSample $M$ trajectories $\\tau_0, \\dots, \\tau_{M-1} \\sim \\rho^k$\n\n$$\\begin{gathered}\n    \\theta^{k+1} \\gets \\arg\\max_{\\theta} \\frac{1}{M} \\sum_{m=0}^{M-1} \\sum_{h=0}^{H-1} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} [ R_\\hi(\\tau_m) - \\tilde b_\\hi(s_\\hi) ] \\\\\n    \\text{where } \\sum_{m=0}^{M-1} \\sum_{h=0}^{H-1} \\log \\frac{\\pi_k(a_\\hi^m \\mid s_\\hi^m)}{\\pi_\\theta(a_\\hi^m \\mid s_\\hi^m)} \\le \\delta\n    \n\\end{gathered}$$\n:::\n:::: ","key":"H0KWZ9H37k"},{"type":"paragraph","position":{"start":{"line":735,"column":1},"end":{"line":742,"column":1}},"children":[{"type":"text","value":"The above isn’t entirely complete:\nwe still need to solve the actual optimization problem at each step.\nUnless we know additional properties of the problem,\nthis might be an intractable optimization.\nDo we need to solve it exactly, though?\nInstead, if we assume that 
both the objective function and the constraint are somewhat smooth in terms of the policy parameters,\nwe can use their ","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"kzkF8qujl6"},{"type":"emphasis","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"children":[{"type":"text","value":"Taylor expansions","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"xJRbxDG7KZ"}],"key":"XPxtpEnnmO"},{"type":"text","value":" to give us a simpler optimization problem with a closed-form solution.\nThis brings us to the ","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"IZyIpS9Rcy"},{"type":"strong","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"children":[{"type":"text","value":"natural policy gradient","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"SQqRRdePCP"}],"key":"FOCOoWxWqt"},{"type":"text","value":" algorithm.","position":{"start":{"line":735,"column":1},"end":{"line":735,"column":1}},"key":"JxXaGHInMN"}],"key":"hQf4aOokzQ"}],"key":"ysv0OF71GF"},{"type":"block","position":{"start":{"line":744,"column":1},"end":{"line":744,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":746,"column":1},"end":{"line":746,"column":1}},"children":[{"type":"text","value":"Natural policy gradient","position":{"start":{"line":746,"column":1},"end":{"line":746,"column":1}},"key":"vHkUQVnOKA"}],"identifier":"natural-policy-gradient","label":"Natural policy gradient","html_id":"natural-policy-gradient","implicit":true,"enumerator":"6.8","key":"IlPnB9GvH2"},{"type":"paragraph","position":{"start":{"line":748,"column":1},"end":{"line":749,"column":1}},"children":[{"type":"text","value":"We take a ","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"Anr8f0ciHr"},{"type":"emphasis","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"JAypcoZkiM"}],"key":"XNUwwGvEEH"},{"type":"text","value":" (first-order) approximation to the objective function and a ","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"Kuf3gN5xPo"},{"type":"emphasis","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"children":[{"type":"text","value":"quadratic","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"pC7JVSs9u6"}],"key":"VzJ4lLy1Pd"},{"type":"text","value":" (second-order) approximation to the KL divergence constraint about the current estimate ","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"Y2mq4acCu9"},{"type":"inlineMath","value":"\\theta^k","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"html":"θk\\theta^kθk","key":"CqAQ4W4Lwd"},{"type":"text","value":".\nThis results in the optimization problem","position":{"start":{"line":748,"column":1},"end":{"line":748,"column":1}},"key":"wzvAw5cYgn"}],"key":"badIBDpRYf"},{"type":"math","value":"\\begin{gathered}\n \\max_\\theta \\nabla_\\theta J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) \\\\\n \\text{where } \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) \\le \\delta\n\\end{gathered}","label":"npg_optimization","identifier":"npg_optimization","html":"maxθθJ(πθk)(θθk)where 12(θθk)Fθk(θθk)δ\\begin{gathered}\n \\max_\\theta 
\\nabla_\\theta J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) \\\\\n \\text{where } \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) \\le \\delta\n\\end{gathered}θmaxθJ(πθk)(θθk)where 21(θθk)Fθk(θθk)δ","enumerator":"6.33","html_id":"npg-optimization","key":"KUbl5IbFqa"},{"type":"paragraph","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"CAGm8fNrGl"},{"type":"inlineMath","value":"F_{\\theta^k}","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"html":"FθkF_{\\theta^k}Fθk","key":"h9i544EHwZ"},{"type":"text","value":" is the ","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"GTkkGPyyao"},{"type":"strong","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"children":[{"type":"text","value":"Fisher information matrix","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"JGQkrTHvP9"}],"key":"Oz957TR2ju"},{"type":"text","value":" defined below.","position":{"start":{"line":760,"column":1},"end":{"line":760,"column":1}},"key":"zGcTPqFYLx"}],"key":"Rxwps8gPnG"},{"type":"proof","kind":"definition","label":"fisher_matrix","identifier":"fisher_matrix","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Fisher information matrix","position":{"start":{"line":762,"column":1},"end":{"line":762,"column":1}},"key":"hFF2vX97TI"}],"key":"dumlaAHhBL"},{"type":"paragraph","position":{"start":{"line":765,"column":1},"end":{"line":766,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"key":"LzZROT2I6z"},{"type":"inlineMath","value":"p_\\theta","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"html":"pθp_\\thetapθ","key":"W0TQlBkcFG"},{"type":"text","value":" denote a parameterized distribution.\nIts Fisher information matrix ","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"key":"S9qdgonwK3"},{"type":"inlineMath","value":"F_\\theta","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"html":"FθF_\\thetaFθ","key":"b3fpGXD68c"},{"type":"text","value":" can be defined equivalently as:","position":{"start":{"line":765,"column":1},"end":{"line":765,"column":1}},"key":"I3nigdImkM"}],"key":"aXDRrGTCBr"},{"type":"math","value":"\\begin{aligned}\n F_{\\theta} & = \\E_{x \\sim p_\\theta} \\left[ (\\nabla_\\theta \\log p_\\theta(x)) (\\nabla_\\theta \\log p_\\theta(x))^\\top \\right] & \\text{covariance matrix of the Fisher score} \\\\\n & = \\E_{x \\sim p_{\\theta}} [- \\nabla_\\theta^2 \\log p_\\theta(x)] & \\text{average Hessian of the negative log-likelihood}\n\\end{aligned}","position":{"start":{"line":768,"column":1},"end":{"line":773,"column":1}},"html":"Fθ=Expθ[(θlogpθ(x))(θlogpθ(x))]covariance matrix of the Fisher score=Expθ[θ2logpθ(x)]average Hessian of the negative log-likelihood\\begin{aligned}\n F_{\\theta} & = \\E_{x \\sim p_\\theta} \\left[ (\\nabla_\\theta \\log p_\\theta(x)) (\\nabla_\\theta \\log p_\\theta(x))^\\top \\right] & \\text{covariance matrix of the Fisher score} \\\\\n & = \\E_{x \\sim p_{\\theta}} [- \\nabla_\\theta^2 \\log p_\\theta(x)] & \\text{average Hessian of the negative log-likelihood}\n\\end{aligned}Fθ=Expθ[(θlogpθ(x))(θlogpθ(x))]=Expθ[θ2logpθ(x)]covariance matrix of the Fisher scoreaverage Hessian of the 
negative log-likelihood","enumerator":"6.34","key":"aHAg7n2Yob"},{"type":"paragraph","position":{"start":{"line":775,"column":1},"end":{"line":778,"column":1}},"children":[{"type":"text","value":"Recall that the Hessian of a function describes its curvature:\nfor a vector ","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"kI1qktJNcZ"},{"type":"inlineMath","value":"\\delta \\in \\Theta","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"html":"δΘ\\delta \\in \\ThetaδΘ","key":"Kd3FMojVPx"},{"type":"text","value":",\nthe quantity ","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"VQcNXOg43o"},{"type":"inlineMath","value":"\\delta^\\top F_\\theta \\delta","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"html":"δFθδ\\delta^\\top F_\\theta \\deltaδFθδ","key":"GGRVlkn3Mu"},{"type":"text","value":" describes how rapidly the negative log-likelihood changes if we move by ","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"akePYIRTEg"},{"type":"text","value":"δ","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"IbtrFC5ACc"},{"type":"text","value":".\nThe Fisher information matrix is precisely the Hessian of the KL divergence (with respect to either one of the parameters).","position":{"start":{"line":775,"column":1},"end":{"line":775,"column":1}},"key":"fqClqzUBuG"}],"key":"nLzIQWZd6U"},{"type":"paragraph","position":{"start":{"line":780,"column":1},"end":{"line":780,"column":1}},"children":[{"type":"text","value":"In particular, when ","position":{"start":{"line":780,"column":1},"end":{"line":780,"column":1}},"key":"ujoCaCNuUF"},{"type":"inlineMath","value":"p_\\theta = \\rho_{\\theta}","position":{"start":{"line":780,"column":1},"end":{"line":780,"column":1}},"html":"pθ=ρθp_\\theta = \\rho_{\\theta}pθ=ρθ","key":"wYZ8B9aLQ2"},{"type":"text","value":" denotes a trajectory distribution, we can further simplify the expression:","position":{"start":{"line":780,"column":1},"end":{"line":780,"column":1}},"key":"fDkqSVpXE4"}],"key":"UtMsx7OnsE"},{"type":"math","value":"F_{\\theta} = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{h=0}^{H-1} (\\nabla \\log \\pi_\\theta (a_\\hi \\mid s_\\hi)) (\\nabla \\log \\pi_\\theta(a_\\hi \\mid s_\\hi))^\\top \\right]","label":"fisher_trajectory","identifier":"fisher_trajectory","html":"Fθ=Eτρθ[h=0H1(logπθ(ahsh))(logπθ(ahsh))]F_{\\theta} = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{h=0}^{H-1} (\\nabla \\log \\pi_\\theta (a_\\hi \\mid s_\\hi)) (\\nabla \\log \\pi_\\theta(a_\\hi \\mid s_\\hi))^\\top \\right]Fθ=Eτρθ[h=0H1(logπθ(ahsh))(logπθ(ahsh))]","enumerator":"6.35","html_id":"fisher-trajectory","key":"jefnGM6X1A"},{"type":"paragraph","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"children":[{"type":"text","value":"Note that we’ve used the Markov property to cancel out the cross terms corresponding to two different time steps.","position":{"start":{"line":788,"column":1},"end":{"line":788,"column":1}},"key":"HW80hJDF1a"}],"key":"lLMMONtIhN"}],"enumerator":"6.5","html_id":"fisher-matrix","key":"fKvEnrODVn"},{"type":"paragraph","position":{"start":{"line":791,"column":1},"end":{"line":796,"column":1}},"children":[{"type":"text","value":"This is a convex optimization problem with a closed-form solution.\nTo see why, it helps to visualize the case where 
","position":{"start":{"line":791,"column":1},"end":{"line":791,"column":1}},"key":"uCyQi2h4DF"},{"type":"text","value":"θ","position":{"start":{"line":791,"column":1},"end":{"line":791,"column":1}},"key":"EoT0degf8u"},{"type":"text","value":" is two-dimensional:\nthe constraint describes the inside of an ellipse,\nand the objective function is linear,\nso we can find the extreme point on the boundary of the ellipse.\nWe recommend ","position":{"start":{"line":791,"column":1},"end":{"line":791,"column":1}},"key":"zCJKPMdfVp"},{"type":"cite","kind":"narrative","label":"boyd_convex_2004","identifier":"boyd_convex_2004","children":[{"type":"text","value":"Boyd & Vandenberghe (2004)","key":"nq2U9mvcVl"}],"enumerator":"1","key":"dOwyrcdb8H"},{"type":"text","value":" for a comprehensive treatment of convex optimization.","position":{"start":{"line":791,"column":1},"end":{"line":791,"column":1}},"key":"eV0XVq6Fpw"}],"key":"wRM2LOqLEL"},{"type":"paragraph","position":{"start":{"line":798,"column":1},"end":{"line":799,"column":1}},"children":[{"type":"text","value":"More generally, for a higher-dimensional ","position":{"start":{"line":798,"column":1},"end":{"line":798,"column":1}},"key":"nEdmV54M6m"},{"type":"text","value":"θ","position":{"start":{"line":798,"column":1},"end":{"line":798,"column":1}},"key":"IcYfW0FyFF"},{"type":"text","value":",\nwe can compute the global optima by setting the gradient of the Lagrangian to zero:","position":{"start":{"line":798,"column":1},"end":{"line":798,"column":1}},"key":"Gc1VxE9TAS"}],"key":"nQW5QSyjC6"},{"type":"math","value":"\\begin{aligned}\n \\mathcal{L}(\\theta, \\alpha) & = \\nabla J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) - \\alpha \\left[ \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) - \\delta \\right] \\\\\n \\nabla \\mathcal{L}(\\theta^{k+1}, \\alpha) & := 0 \\\\\n \\implies \\nabla J(\\pi_{\\theta^k}) & = \\alpha F_{\\theta^k} (\\theta^{k+1} - \\theta^k) \\\\\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k}) \\\\\n \\text{where } \\eta & = \\sqrt{\\frac{2 \\delta}{\\nabla J(\\pi_{\\theta^k})^\\top F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k})}}\n\\end{aligned}","position":{"start":{"line":801,"column":1},"end":{"line":809,"column":1}},"html":"L(θ,α)=J(πθk)(θθk)α[12(θθk)Fθk(θθk)δ]L(θk+1,α):=0    J(πθk)=αFθk(θk+1θk)θk+1=θk+ηFθk1J(πθk)where η=2δJ(πθk)Fθk1J(πθk)\\begin{aligned}\n \\mathcal{L}(\\theta, \\alpha) & = \\nabla J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) - \\alpha \\left[ \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) - \\delta \\right] \\\\\n \\nabla \\mathcal{L}(\\theta^{k+1}, \\alpha) & := 0 \\\\\n \\implies \\nabla J(\\pi_{\\theta^k}) & = \\alpha F_{\\theta^k} (\\theta^{k+1} - \\theta^k) \\\\\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k}) \\\\\n \\text{where } \\eta & = \\sqrt{\\frac{2 \\delta}{\\nabla J(\\pi_{\\theta^k})^\\top F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k})}}\n\\end{aligned}L(θ,α)L(θk+1,α)J(πθk)θk+1where η=J(πθk)(θθk)α[21(θθk)Fθk(θθk)δ]:=0=αFθk(θk+1θk)=θk+ηFθk1J(πθk)=J(πθk)Fθk1J(πθk)2δ","enumerator":"6.36","key":"PwWk7txzNI"},{"type":"paragraph","position":{"start":{"line":811,"column":1},"end":{"line":813,"column":1}},"children":[{"type":"text","value":"This gives us the closed-form update.\nNow the only challenge is to estimate the Fisher information matrix,\nsince, as with the KL divergence constraint, it is an expectation over trajectories, and computing it exactly is 
therefore typically intractable.","position":{"start":{"line":811,"column":1},"end":{"line":811,"column":1}},"key":"exy9Peb4d4"}],"key":"TrVdGSMtft"},{"type":"proof","kind":"definition","label":"npg","identifier":"npg","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Natural policy gradient","position":{"start":{"line":815,"column":1},"end":{"line":815,"column":1}},"key":"rybOC8yLRN"}],"key":"vd5VagFd1K"},{"type":"paragraph","position":{"start":{"line":818,"column":1},"end":{"line":820,"column":1}},"children":[{"type":"text","value":"How many trajectory samples do we need to accurately estimate the Fisher information matrix?\nAs a rule of thumb, the sample complexity should scale with the dimension of the parameter space.\nThis makes this approach intractable in the deep learning setting where we might have a very large number of parameters.","position":{"start":{"line":818,"column":1},"end":{"line":818,"column":1}},"key":"UlLOr5BQdX"}],"key":"DxNKGpEswN"}],"enumerator":"6.6","html_id":"npg","key":"c8gJTWAW4w"},{"type":"paragraph","position":{"start":{"line":823,"column":1},"end":{"line":828,"column":1}},"children":[{"type":"text","value":"As you can see, the NPG is the “basic” policy gradient algorithm we saw above,\nbut with the gradient transformed by the inverse Fisher information matrix.\nThis matrix can be understood as accounting for the ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"f8msndHYKo"},{"type":"strong","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"geometry of the parameter space.","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"UXZB3rYckN"}],"key":"REcN5l7gQq"},{"type":"text","value":"\nThe typical gradient descent algorithm implicitly measures distances between parameters using the typical ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"VkSHhhu9Ku"},{"type":"emphasis","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"Euclidean distance","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"VNlfX2txue"}],"key":"AFi8ZYoR6Z"},{"type":"text","value":".\nHere, where the parameters map to a ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"yWJu6zuLQl"},{"type":"emphasis","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"distribution","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"MXu7bgvkJl"}],"key":"mLWRghyQTs"},{"type":"text","value":", using the natural gradient update is equivalent to optimizing over ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"lo1I6ZLifX"},{"type":"strong","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"children":[{"type":"text","value":"distribution space","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"Xa4tTbfwDR"}],"key":"GaCSdYoU84"},{"type":"text","value":" rather than parameter space,\nwhere distance between distributions is measured by the ","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"U0wBhPAkdZ"},{"type":"crossReference","kind":"proof:definition","identifier":"kld","label":"kld","children":[{"type":"text","value":"Definition 
","key":"f8I7JL9QNJ"},{"type":"text","value":"6.3","key":"tcDf0BO8Q9"}],"template":"Definition %s","enumerator":"6.3","resolved":true,"html_id":"kld","key":"uxuwHxbUR5"},{"type":"text","value":".","position":{"start":{"line":823,"column":1},"end":{"line":823,"column":1}},"key":"kWFbsF1e2n"}],"key":"Q4qm4sg4in"},{"type":"proof","kind":"example","label":"natural_simple","identifier":"natural_simple","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Natural gradient on a simple problem","position":{"start":{"line":830,"column":1},"end":{"line":830,"column":1}},"key":"PQaeBwENF5"}],"key":"oxvqxncDTS"},{"type":"paragraph","position":{"start":{"line":833,"column":1},"end":{"line":833,"column":1}},"children":[{"type":"text","value":"Let’s step away from RL and consider the following optimization problem over Bernoulli distributions ","position":{"start":{"line":833,"column":1},"end":{"line":833,"column":1}},"key":"fJuKpsUnWt"},{"type":"inlineMath","value":"\\pi \\in \\Delta(\\{ 0, 1 \\})","position":{"start":{"line":833,"column":1},"end":{"line":833,"column":1}},"html":"πΔ({0,1})\\pi \\in \\Delta(\\{ 0, 1 \\})πΔ({0,1})","key":"lLzQPoICUV"},{"type":"text","value":":","position":{"start":{"line":833,"column":1},"end":{"line":833,"column":1}},"key":"YazO8ENgRo"}],"key":"OlzPp8YnHQ"},{"type":"math","value":"\\begin{aligned}\n J(\\pi) & = 100 \\cdot \\pi(1) + 1 \\cdot \\pi(0)\n\\end{aligned}","position":{"start":{"line":835,"column":1},"end":{"line":839,"column":1}},"html":"J(π)=100π(1)+1π(0)\\begin{aligned}\n J(\\pi) & = 100 \\cdot \\pi(1) + 1 \\cdot \\pi(0)\n\\end{aligned}J(π)=100π(1)+1π(0)","enumerator":"6.37","key":"KBTNV2QO7B"},{"type":"paragraph","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"children":[{"type":"text","value":"We can think of the space of such distributions as the line between ","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"key":"CyjSfhFNw3"},{"type":"inlineMath","value":"(0, 1)","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"html":"(0,1)(0, 1)(0,1)","key":"KpuK2HYZs3"},{"type":"text","value":" to ","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"key":"UVH67clDLW"},{"type":"inlineMath","value":"(1, 0)","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"html":"(1,0)(1, 0)(1,0)","key":"vafMQPCaI6"},{"type":"text","value":" on the Cartesian plane:","position":{"start":{"line":841,"column":1},"end":{"line":841,"column":1}},"key":"m6Saobak5w"}],"key":"rdMQWwyVZp"},{"type":"image","url":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.png","alt":"a line from (0, 1) to (1, 0)","width":"240px","align":"center","key":"iVXgtCkfCV","urlSource":"shared/npg_line.png","urlOptimized":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.webp"},{"type":"paragraph","position":{"start":{"line":849,"column":1},"end":{"line":851,"column":1}},"children":[{"type":"text","value":"Clearly the optimal distribution is the constant one ","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"khjND6AtkL"},{"type":"inlineMath","value":"\\pi(1) = 1","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"html":"π(1)=1\\pi(1) = 1π(1)=1","key":"irpuTBWhAD"},{"type":"text","value":". 
Suppose we optimize over the parameterized family ","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"hTW1Zksphg"},{"type":"inlineMath","value":"\\pi_\\theta(1) = \\frac{\\exp(\\theta)}{1+\\exp(\\theta)}","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"html":"πθ(1)=exp(θ)1+exp(θ)\\pi_\\theta(1) = \\frac{\\exp(\\theta)}{1+\\exp(\\theta)}πθ(1)=1+exp(θ)exp(θ)","key":"HsNdrbFovf"},{"type":"text","value":".\nThen our optimization algorithm should set ","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"VnP8CHQt9P"},{"type":"text","value":"θ","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"fDnAzzjb3m"},{"type":"text","value":" to be unboundedly large.\nThen the “vanilla” gradient is","position":{"start":{"line":849,"column":1},"end":{"line":849,"column":1}},"key":"po1TCPCk8S"}],"key":"aMzAXw2paG"},{"type":"math","value":"\\nabla_\\theta J(\\pi_\\theta) = \\frac{99 \\exp(\\theta)}{(1 + \\exp(\\theta))^2}.","position":{"start":{"line":853,"column":1},"end":{"line":853,"column":1}},"html":"θJ(πθ)=99exp(θ)(1+exp(θ))2.\\nabla_\\theta J(\\pi_\\theta) = \\frac{99 \\exp(\\theta)}{(1 + \\exp(\\theta))^2}.θJ(πθ)=(1+exp(θ))299exp(θ).","enumerator":"6.38","key":"dX7hszxC1b"},{"type":"paragraph","position":{"start":{"line":855,"column":1},"end":{"line":856,"column":1}},"children":[{"type":"text","value":"Note that as ","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"KzIkNnM7gb"},{"type":"inlineMath","value":"\\theta \\to \\infty","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"html":"θ\\theta \\to \\inftyθ","key":"PWkPOeLQNA"},{"type":"text","value":" that the increments get closer and closer to ","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"B8nqQJe0Ui"},{"type":"text","value":"0","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"i2bgqfwwUe"},{"type":"text","value":";\nthe rate of increase becomes exponentially slow.","position":{"start":{"line":855,"column":1},"end":{"line":855,"column":1}},"key":"zKvc7aoUc1"}],"key":"qFbpjJMox7"},{"type":"paragraph","position":{"start":{"line":859,"column":1},"end":{"line":859,"column":1}},"children":[{"type":"text","value":"However, if we compute the Fisher information “matrix” (which is just a scalar in this case), we can account for the geometry induced by the parameterization.","position":{"start":{"line":859,"column":1},"end":{"line":859,"column":1}},"key":"DLtovoSHjb"}],"key":"O8lBnsSACl"},{"type":"math","value":"\\begin{aligned}\n F_\\theta & = \\E_{x \\sim \\pi_\\theta} [ (\\nabla_\\theta \\log \\pi_\\theta(x))^2 ] \\\\\n & = \\frac{\\exp(\\theta)}{(1 + \\exp(\\theta))^2}.\n\\end{aligned}","position":{"start":{"line":861,"column":1},"end":{"line":866,"column":1}},"html":"Fθ=Exπθ[(θlogπθ(x))2]=exp(θ)(1+exp(θ))2.\\begin{aligned}\n F_\\theta & = \\E_{x \\sim \\pi_\\theta} [ (\\nabla_\\theta \\log \\pi_\\theta(x))^2 ] \\\\\n & = \\frac{\\exp(\\theta)}{(1 + \\exp(\\theta))^2}.\n\\end{aligned}Fθ=Exπθ[(θlogπθ(x))2]=(1+exp(θ))2exp(θ).","enumerator":"6.39","key":"sOX8MlvcrA"},{"type":"paragraph","position":{"start":{"line":868,"column":1},"end":{"line":868,"column":1}},"children":[{"type":"text","value":"This gives the natural gradient update","position":{"start":{"line":868,"column":1},"end":{"line":868,"column":1}},"key":"WnLWjxOJBY"}],"key":"x1iLArurJW"},{"type":"math","value":"\\begin{aligned}\n \\theta^{k+1} & = 
\\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla_ \\theta J(\\theta^k) \\\\\n & = \\theta^k + 99 \\eta\n\\end{aligned}","position":{"start":{"line":870,"column":1},"end":{"line":875,"column":1}},"html":"θk+1=θk+ηFθk1θJ(θk)=θk+99η\\begin{aligned}\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla_ \\theta J(\\theta^k) \\\\\n & = \\theta^k + 99 \\eta\n\\end{aligned}θk+1=θk+ηFθk1θJ(θk)=θk+99η","enumerator":"6.40","key":"V6WmcwQnoh"},{"type":"paragraph","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"children":[{"type":"text","value":"which increases at a constant rate, i.e. improves the objective more quickly than “vanilla” gradient ascent.","position":{"start":{"line":877,"column":1},"end":{"line":877,"column":1}},"key":"YGD8qmYgKT"}],"key":"IflQ2PmCDZ"}],"enumerator":"6.1","html_id":"natural-simple","key":"yYC1Lejhbp"},{"type":"paragraph","position":{"start":{"line":880,"column":1},"end":{"line":884,"column":1}},"children":[{"type":"text","value":"Though the NPG now gives a closed-form optimization step,\nit requires computing the inverse Fisher information matrix,\nwhich typically scales as ","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"key":"sWHTSj0rLt"},{"type":"inlineMath","value":"O((\\dim \\Theta)^3)","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"html":"O((dimΘ)3)O((\\dim \\Theta)^3)O((dimΘ)3)","key":"uNtp8X7f8N"},{"type":"text","value":".\nThis can be expensive if the parameter space is large.\nCan we find an algorithm that works in ","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"key":"kMOroE89zf"},{"type":"emphasis","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"children":[{"type":"text","value":"linear time","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"key":"tIs4xR8Dns"}],"key":"V0KMt58ox5"},{"type":"text","value":" with respect to the dimension of the parameter space?","position":{"start":{"line":880,"column":1},"end":{"line":880,"column":1}},"key":"o1dvTZeQqy"}],"key":"XdiO7ERU1l"}],"key":"TBZknQ9ah5"},{"type":"block","position":{"start":{"line":886,"column":1},"end":{"line":886,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":888,"column":1},"end":{"line":888,"column":1}},"children":[{"type":"text","value":"Proximal policy optimization","position":{"start":{"line":888,"column":1},"end":{"line":888,"column":1}},"key":"YvYQKnbSef"}],"identifier":"proximal-policy-optimization","label":"Proximal policy optimization","html_id":"proximal-policy-optimization","implicit":true,"enumerator":"6.9","key":"riuJzGXSQ1"},{"type":"paragraph","position":{"start":{"line":890,"column":1},"end":{"line":892,"column":1}},"children":[{"type":"text","value":"We can relax the TRPO optimization problem in a different way:\nRather than imposing a hard constraint on the KL distance,\nwe can instead impose a ","position":{"start":{"line":890,"column":1},"end":{"line":890,"column":1}},"key":"sSaZAYfDpD"},{"type":"emphasis","position":{"start":{"line":890,"column":1},"end":{"line":890,"column":1}},"children":[{"type":"text","value":"soft","position":{"start":{"line":890,"column":1},"end":{"line":890,"column":1}},"key":"Rg4SnTE63Y"}],"key":"MbvYyjZaFX"},{"type":"text","value":" constraint by incorporating it into the objective and penalizing parameter values that drastically change the trajectory 
distribution.","position":{"start":{"line":890,"column":1},"end":{"line":890,"column":1}},"key":"rSBR4rUnUL"}],"key":"lByVX1NUSW"},{"type":"math","value":"\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta} \\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\kl{\\rho_{\\theta}}{\\rho_{\\theta^k}}\n\\end{aligned}","position":{"start":{"line":894,"column":1},"end":{"line":898,"column":1}},"html":"θk+1argmaxθEs0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λKL(ρθρθk)\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta} \\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\kl{\\rho_{\\theta}}{\\rho_{\\theta^k}}\n\\end{aligned}θk+1argθmaxEs0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λKL(ρθρθk)","enumerator":"6.41","key":"MVlYrm0Tlh"},{"type":"paragraph","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"children":[{"type":"text","value":"Here ","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"IbAn5AtwOp"},{"type":"text","value":"λ","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"iBvWymHTg8"},{"type":"text","value":" is a ","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"fGMucq8cwT"},{"type":"strong","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"children":[{"type":"text","value":"regularization hyperparameter","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"NBDhI7OlFz"}],"key":"pY5SVJvVHG"},{"type":"text","value":" that controls the tradeoff between the two terms.","position":{"start":{"line":900,"column":1},"end":{"line":900,"column":1}},"key":"KmkwRL3x0J"}],"key":"kMkCXnOk4I"},{"type":"paragraph","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"children":[{"type":"text","value":"Like the original TRPO algorithm ","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"uIhlTP81VJ"},{"type":"crossReference","kind":"proof:definition","identifier":"trpo","label":"trpo","children":[{"type":"text","value":"Definition ","key":"Qibyq3jk58"},{"type":"text","value":"6.4","key":"dIrEXZxFZ0"}],"template":"Definition %s","enumerator":"6.4","resolved":true,"html_id":"trpo","key":"PeTjzlUkGx"},{"type":"text","value":", PPO is not gradient-based; rather, at each step, we try to maximize local advantage relative to the current policy.","position":{"start":{"line":902,"column":1},"end":{"line":902,"column":1}},"key":"uFgLI3o2Wm"}],"key":"ZRiGja8cYb"},{"type":"paragraph","position":{"start":{"line":904,"column":1},"end":{"line":905,"column":1}},"children":[{"type":"text","value":"How do we solve this optimization?\nLet us begin by simplifying the ","position":{"start":{"line":904,"column":1},"end":{"line":904,"column":1}},"key":"iIZmHVRkyO"},{"type":"inlineMath","value":"\\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}}","position":{"start":{"line":904,"column":1},"end":{"line":904,"column":1}},"html":"KL(ρπkρπθ)\\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}}KL(ρπkρπθ)","key":"bzwKKaBwUm"},{"type":"text","value":" term. 
Expanding gives","position":{"start":{"line":904,"column":1},"end":{"line":904,"column":1}},"key":"w09RF3AP3D"}],"key":"JOdDOrPl8U"},{"type":"math","value":"\\begin{aligned}\n \\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}} & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[\\log \\frac{\\rho_{\\pi^k}(\\tau)}{\\rho_{\\pi_{\\theta}}(\\tau)}\\right] \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{\\pi^k(a_\\hi \\mid s_\\hi)}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] & \\text{state transitions cancel} \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] + c\n\\end{aligned}","position":{"start":{"line":907,"column":1},"end":{"line":913,"column":1}},"html":"KL(ρπkρπθ)=Eτρπk[logρπk(τ)ρπθ(τ)]=Eτρπk[h=0H1logπk(ahsh)πθ(ahsh)]state transitions cancel=Eτρπk[h=0H1log1πθ(ahsh)]+c\\begin{aligned}\n \\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}} & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[\\log \\frac{\\rho_{\\pi^k}(\\tau)}{\\rho_{\\pi_{\\theta}}(\\tau)}\\right] \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{\\pi^k(a_\\hi \\mid s_\\hi)}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] & \\text{state transitions cancel} \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] + c\n\\end{aligned}KL(ρπkρπθ)=Eτρπk[logρπθ(τ)ρπk(τ)]=Eτρπk[h=0H1logπθ(ahsh)πk(ahsh)]=Eτρπk[h=0H1logπθ(ahsh)1]+cstate transitions cancel","enumerator":"6.42","key":"lYxfMsdYrB"},{"type":"paragraph","position":{"start":{"line":915,"column":1},"end":{"line":916,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"key":"Qjd7aGDl7e"},{"type":"inlineMath","value":"c","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"html":"ccc","key":"A8zxKzatJV"},{"type":"text","value":" is some constant with respect to ","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"key":"t9qMpfa3Ix"},{"type":"text","value":"θ","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"key":"qorJH2PJ32"},{"type":"text","value":", and can be ignored.\nThis gives the objective","position":{"start":{"line":915,"column":1},"end":{"line":915,"column":1}},"key":"u8MeddeUx0"}],"key":"LowfNndtI7"},{"type":"math","value":"\\ell^k(\\theta)\n=\n\\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right]","position":{"start":{"line":918,"column":1},"end":{"line":922,"column":1}},"html":"k(θ)=Es0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λEτρπk[h=0H1log1πθ(ahsh)]\\ell^k(\\theta)\n=\n\\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right]k(θ)=Es0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λEτρπk[h=0H1logπθ(ahsh)1]","enumerator":"6.43","key":"IUhnf0Bs55"},{"type":"paragraph","position":{"start":{"line":924,"column":1},"end":{"line":928,"column":1}},"children":[{"type":"text","value":"Once again, this takes an expectation over trajectories.\nBut here we cannot directly sample 
trajectories from ","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"key":"g7HOD3iNnz"},{"type":"inlineMath","value":"\\pi^k","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"html":"πk\\pi^kπk","key":"IPl2EJonzW"},{"type":"text","value":",\nsince in the first term, the actions actually come from ","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"key":"W04s6VaxOr"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"M6f8vtJ7Rc"},{"type":"text","value":".\nTo make this term line up with the other expectation,\nwe would need the actions to also come from ","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"key":"wD2WPtfQ6N"},{"type":"inlineMath","value":"\\pi^k","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"html":"πk\\pi^kπk","key":"l407iHMnER"},{"type":"text","value":".","position":{"start":{"line":924,"column":1},"end":{"line":924,"column":1}},"key":"xDSDdyCbDQ"}],"key":"Kjvn8yfjia"},{"type":"paragraph","position":{"start":{"line":930,"column":1},"end":{"line":932,"column":1}},"children":[{"type":"text","value":"This should sound familiar:\nwe want to estimate an expectation over one distribution by sampling from another.\nWe can once again use ","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"key":"urrKXX669g"},{"type":"crossReference","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"children":[{"type":"text","value":"Section ","key":"Q1c3jRStp0"},{"type":"text","value":"6.3.3","key":"KL1cJooXo7"}],"identifier":"importance_sampling","label":"importance_sampling","kind":"heading","template":"Section %s","enumerator":"6.3.3","resolved":true,"html_id":"importance-sampling","key":"QQVbeuJ2xT"},{"type":"text","value":" to rewrite the inner expectation:","position":{"start":{"line":930,"column":1},"end":{"line":930,"column":1}},"key":"YlYz3o2zUx"}],"key":"F10LwDP0fy"},{"type":"math","value":"\\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)\n=\n\\E_{a_\\hi \\sim \\pi^k(s_\\hi)} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)","position":{"start":{"line":934,"column":1},"end":{"line":938,"column":1}},"html":"Eahπθ(sh)Aπk(sh,ah)=Eahπk(sh)πθ(ahsh)πk(ahsh)Aπk(sh,ah)\\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)\n=\n\\E_{a_\\hi \\sim \\pi^k(s_\\hi)} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)Eahπθ(sh)Aπk(sh,ah)=Eahπk(sh)πk(ahsh)πθ(ahsh)Aπk(sh,ah)","enumerator":"6.44","key":"ey05uRKJj2"},{"type":"paragraph","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"children":[{"type":"text","value":"Now we can combine the expectations together to get the objective","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"key":"pHqwHoft5Z"}],"key":"yTaBSWmHwT"},{"type":"math","value":"\\ell^k(\\theta) = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\left( \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^k}(s_\\hi, a_\\hi) - \\lambda \\log \\frac{1}{\\pi_\\theta(a_\\hi \\mid s_\\hi)} \\right) \\right]","position":{"start":{"line":942,"column":1},"end":{"line":944,"column":1}},"html":"k(θ)=Eτρπk[h=0H1(πθ(ahsh)πk(ahsh)Aπk(sh,ah)λlog1πθ(ahsh))]\\ell^k(\\theta) = \\E_{\\tau \\sim 
\\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\left( \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^k}(s_\\hi, a_\\hi) - \\lambda \\log \\frac{1}{\\pi_\\theta(a_\\hi \\mid s_\\hi)} \\right) \\right]k(θ)=Eτρπk[h=0H1(πk(ahsh)πθ(ahsh)Aπk(sh,ah)λlogπθ(ahsh)1)]","enumerator":"6.45","key":"Qf9AmJjjFu"},{"type":"paragraph","position":{"start":{"line":946,"column":1},"end":{"line":948,"column":1}},"children":[{"type":"text","value":"Now we can estimate this function by a sample average over trajectories from ","position":{"start":{"line":946,"column":1},"end":{"line":946,"column":1}},"key":"OcVuI0eXDn"},{"type":"inlineMath","value":"\\pi^k","position":{"start":{"line":946,"column":1},"end":{"line":946,"column":1}},"html":"πk\\pi^kπk","key":"Sm7XpJQ8ve"},{"type":"text","value":".\nRemember that to complete a single iteration of PPO,\nwe execute","position":{"start":{"line":946,"column":1},"end":{"line":946,"column":1}},"key":"IfsKvsRIiv"}],"key":"wnluv6XeZ7"},{"type":"math","value":"\\theta^{k+1} \\gets \\arg\\max_{\\theta} \\ell^k(\\theta).","position":{"start":{"line":950,"column":1},"end":{"line":952,"column":1}},"html":"θk+1argmaxθk(θ).\\theta^{k+1} \\gets \\arg\\max_{\\theta} \\ell^k(\\theta).θk+1argθmaxk(θ).","enumerator":"6.46","key":"LytsiBb4Oa"},{"type":"paragraph","position":{"start":{"line":954,"column":1},"end":{"line":954,"column":1}},"children":[{"type":"text","value":"If ","position":{"start":{"line":954,"column":1},"end":{"line":954,"column":1}},"key":"C9GbJ1hSDR"},{"type":"inlineMath","value":"\\ell^k","position":{"start":{"line":954,"column":1},"end":{"line":954,"column":1}},"html":"k\\ell^kk","key":"B9cltOzw8G"},{"type":"text","value":" is differentiable, we can optimize it by gradient ascent, completing a single iteration of PPO.","position":{"start":{"line":954,"column":1},"end":{"line":954,"column":1}},"key":"lQJk5Cp5Sa"}],"key":"RrCnwJ3GgT"},{"type":"code","lang":"python","value":"def ppo_pseudocode(\n    env,\n    π: Callable[[Params], Callable[[State, Action], Float]],\n    λ: float,\n    θ_init: Params,\n    n_iters: int,\n    n_fit_trajectories: int,\n    n_sample_trajectories: int,\n):\n    θ = θ_init\n    for k in range(n_iters):\n        fit_trajectories = sample_trajectories(env, π(θ), n_fit_trajectories)\n        A_hat = fit(fit_trajectories)\n\n        # use a distinct name so the sample_trajectories function is not shadowed\n        sample_trajs = sample_trajectories(env, π(θ), n_sample_trajectories)\n\n        def objective(θ_opt):\n            total_objective = 0\n            for τ in sample_trajs:\n                for s, a, _r in τ:\n                    # importance-weighted advantage plus the KL penalty term\n                    total_objective += π(θ_opt)(s, a) / π(θ)(s, a) * A_hat(s, a) + λ * jnp.log(π(θ_opt)(s, a))\n            return total_objective / n_sample_trajectories\n\n        θ = optimize(objective, θ)\n\n    return θ","position":{"start":{"line":956,"column":1},"end":{"line":983,"column":1}},"key":"wq4ivubqW8"},{"type":"heading","depth":2,"position":{"start":{"line":985,"column":1},"end":{"line":985,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":985,"column":1},"end":{"line":985,"column":1}},"key":"Mkl9PDbN60"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"6.10","key":"KwTIkfmTtI"},{"type":"paragraph","position":{"start":{"line":987,"column":1},"end":{"line":987,"column":1}},"children":[{"type":"text","value":"Policy gradient methods are a powerful family of algorithms that directly optimize the total reward by iteratively updating the policy 
parameters.","position":{"start":{"line":987,"column":1},"end":{"line":987,"column":1}},"key":"fKka461pw6"}],"key":"kRhr6eRgkv"},{"type":"paragraph","position":{"start":{"line":989,"column":1},"end":{"line":989,"column":1}},"children":[{"type":"text","value":"TODO","position":{"start":{"line":989,"column":1},"end":{"line":989,"column":1}},"key":"lsrXlDVT6Q"}],"key":"hUdruZzN2u"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":991,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":991,"column":1},"end":{"line":991,"column":1}},"children":[{"type":"text","value":"Vanilla policy gradient","position":{"start":{"line":991,"column":1},"end":{"line":991,"column":1}},"key":"DlNm985kim"}],"key":"aiooq0s81O"},{"type":"listItem","spread":true,"position":{"start":{"line":992,"column":1},"end":{"line":992,"column":1}},"children":[{"type":"text","value":"Baselines and advantages","position":{"start":{"line":992,"column":1},"end":{"line":992,"column":1}},"key":"o4bhnuaiRt"}],"key":"Db1IVgyDQ3"},{"type":"listItem","spread":true,"position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"children":[{"type":"text","value":"Trust region policy optimization","position":{"start":{"line":993,"column":1},"end":{"line":993,"column":1}},"key":"DVZ38XIkN0"}],"key":"PKPLMgkHLZ"},{"type":"listItem","spread":true,"position":{"start":{"line":994,"column":1},"end":{"line":994,"column":1}},"children":[{"type":"text","value":"Natural policy gradient","position":{"start":{"line":994,"column":1},"end":{"line":994,"column":1}},"key":"STF3aP7pu3"}],"key":"dxhfVjBhtI"},{"type":"listItem","spread":true,"position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"children":[{"type":"text","value":"Proximal policy optimization","position":{"start":{"line":995,"column":1},"end":{"line":995,"column":1}},"key":"gil1Z8UPrj"}],"key":"ETsdz4hPk4"}],"key":"iynlH8Nrz3"}],"key":"OyrbVt6vd6"}],"key":"iFiZExDpCz"},"references":{"cite":{"order":["boyd_convex_2004"],"data":{"boyd_convex_2004":{"label":"boyd_convex_2004","enumerator":"1","html":"Boyd, S., & Vandenberghe, L. (2004). Convex Optimization. 
Cambridge University Press."}}}},"footer":{"navigation":{"prev":{"title":"5 Fitted Dynamic Programming Algorithms","url":"/fitted-dp","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"7 Imitation Learning","url":"/imitation-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"3611fe03726337e3d61b27051083131451cb238e27c0323fea7d7891744e04e1","slug":"pg","location":"/pg.md","dependencies":[],"frontmatter":{"title":"6 Policy Gradient Methods","kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","numbering":{"all":{"enabled":true}},"math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.png","thumbnailOptimized":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.webp","exports":[{"format":"md","filename":"pg.md","url":"/build/pg-dacc33b261658c6d7f260df53a7857dc.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":16,"column":1},"end":{"line":16,"column":1}},"key":"hGtQj7qNrM"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"1","key":"ZcSdgOsRzs"},{"type":"paragraph","position":{"start":{"line":18,"column":1},"end":{"line":21,"column":1}},"children":[{"type":"text","value":"The core task of RL is finding the ","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"iFHVc62KoX"},{"type":"strong","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"n22LkbyHMv"}],"key":"h1j4yXxMja"},{"type":"text","value":" in a given environment.\nThis is essentially an ","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"P37nPf2OdB"},{"type":"emphasis","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"optimization problem:","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"agJpiig6ko"}],"key":"q0v4FQwRNE"},{"type":"text","value":"\nout of some space of policies,\nwe want to find the one that achieves the maximum total reward (in expectation).","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"sGjW4aCsGh"}],"key":"VKScOig5zp"},{"type":"paragraph","position":{"start":{"line":23,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"It’s typically intractable to compute the optimal policy exactly in some finite number of steps.\nInstead, 
","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"MVPtnoQ4Dg"},{"type":"strong","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"children":[{"type":"text","value":"policy optimization algorithms","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"t7Vla5XJ5A"}],"key":"OSlOKlEBzL"},{"type":"text","value":" start from some randomly initialized policy,\nand then ","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"aeAuIJQGKs"},{"type":"emphasis","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"children":[{"type":"text","value":"improve","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"BBx9YXXtWD"}],"key":"NKmDbOzCJn"},{"type":"text","value":" it step by step.\nWe’ve already seen some examples of these,\nnamely ","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"OB6n8bsg25"},{"type":"crossReference","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"children":[{"type":"text","value":"Section ","key":"fa1GiHQuwD"},{"type":"text","value":"1.5.3.2","key":"O69UKTHwEk"}],"identifier":"policy_iteration","label":"policy_iteration","kind":"heading","template":"Section %s","enumerator":"1.5.3.2","resolved":true,"html_id":"policy-iteration","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"awMRlTkR1A"},{"type":"text","value":" for finite MDPs and ","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"Cv7JvChb3V"},{"type":"crossReference","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"children":[{"type":"text","value":"Section ","key":"TWVMziuTlg"},{"type":"text","value":"2.6.4","key":"x77KWnAdoE"}],"identifier":"iterative_lqr","label":"iterative_lqr","kind":"heading","template":"Section %s","enumerator":"2.6.4","resolved":true,"html_id":"iterative-lqr","remote":true,"url":"/control","dataUrl":"/control.json","key":"aPMo99PkFV"},{"type":"text","value":" in continuous control.","position":{"start":{"line":23,"column":1},"end":{"line":23,"column":1}},"key":"mtppf7kboM"}],"key":"rnxqHkx7B7"},{"type":"paragraph","position":{"start":{"line":30,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"text","value":"In particular, we often use policies that can be described by some finite set of ","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"bKlwkstDOT"},{"type":"strong","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"text","value":"parameters.","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"Pq5LN4Yc3X"}],"key":"hi0Ul7OMvO"},{"type":"text","value":"\nWe will see some examples in ","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"HW6jvvsaru"},{"type":"crossReference","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"text","value":"Section ","key":"W6xn3l4Tcm"},{"type":"text","value":"3.1","key":"QcTC56Os75"}],"identifier":"parameterizations","label":"parameterizations","kind":"heading","template":"Section %s","enumerator":"3.1","resolved":true,"html_id":"parameterizations","key":"g7yDYVCQvn"},{"type":"text","value":".\nFor such parameterized policies,\nwe can approximate the 
","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"m1UnGlBBn2"},{"type":"strong","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"children":[{"type":"text","value":"policy gradient:","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"dMntmphWAw"}],"key":"lvxfxfz4B5"},{"type":"text","value":"\nthe gradient of the expected total reward with respect to the parameters.\nThis tells us the direction the parameters should be updated to achieve a higher expected total reward.\nPolicy gradient methods are responsible for groundbreaking applications including AlphaGo, OpenAI Five, and large language models,\nmany of which use policies parameterized as deep neural networks.","position":{"start":{"line":30,"column":1},"end":{"line":30,"column":1}},"key":"XcqLZ6vIza"}],"key":"D283bX2WEo"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":39,"column":1},"end":{"line":45,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":39,"column":1},"end":{"line":40,"column":1}},"children":[{"type":"text","value":"We begin the chapter with a short review of gradient ascent,\na general ","position":{"start":{"line":39,"column":1},"end":{"line":39,"column":1}},"key":"Ox9TuGU6P4"},{"type":"strong","position":{"start":{"line":39,"column":1},"end":{"line":39,"column":1}},"children":[{"type":"text","value":"optimization method.","position":{"start":{"line":39,"column":1},"end":{"line":39,"column":1}},"key":"zgQADTrn7G"}],"key":"kjVeq9XKx2"}],"key":"yPRUR8WQrA"},{"type":"listItem","spread":true,"position":{"start":{"line":41,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"We’ll then see how to estimate the ","position":{"start":{"line":41,"column":1},"end":{"line":41,"column":1}},"key":"X51cL7rPIE"},{"type":"strong","position":{"start":{"line":41,"column":1},"end":{"line":41,"column":1}},"children":[{"type":"text","value":"policy gradient,","position":{"start":{"line":41,"column":1},"end":{"line":41,"column":1}},"key":"bxOCylt3hj"}],"key":"CstLpp0jax"},{"type":"text","value":"\nenabling us to apply (stochastic) gradient ascent in the RL setting.","position":{"start":{"line":41,"column":1},"end":{"line":41,"column":1}},"key":"eKmQOq4IYv"}],"key":"HhVq2XXmZF"},{"type":"listItem","spread":true,"position":{"start":{"line":43,"column":1},"end":{"line":45,"column":1}},"children":[{"type":"text","value":"Then we’ll explore some ","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"key":"ZS3r1R34G7"},{"type":"emphasis","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"children":[{"type":"text","value":"proximal optimization","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"key":"LUjZKPbEjR"}],"key":"aIMICtPeMY"},{"type":"text","value":" techniques that ensure the steps taken are “not too large”.\nThis is helpful to stabilize training and widely used in practice.","position":{"start":{"line":43,"column":1},"end":{"line":43,"column":1}},"key":"ZraAW5zfyi"}],"key":"WgADoVHCvt"}],"key":"xKXWtByJbc"}],"key":"S72d9mx5IK"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from utils import plt, Array, Callable, jax, jnp, 
latexify","key":"XZ1wdSNtL5"},{"type":"output","id":"m-le-IaChjoQk_IIXzDjO","data":[],"key":"GgTY2HBSWF"}],"data":{},"key":"mA5SDsediS"},{"type":"block","position":{"start":{"line":50,"column":1},"end":{"line":50,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"children":[{"type":"text","value":"Gradient Ascent","position":{"start":{"line":52,"column":1},"end":{"line":52,"column":1}},"key":"jg5zph9hPt"}],"identifier":"gradient-ascent","label":"Gradient Ascent","html_id":"gradient-ascent","implicit":true,"enumerator":"2","key":"WqqwlmAygc"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"DM9O6uOt6S"}],"key":"h5qxMGZwSb"},{"type":"paragraph","position":{"start":{"line":56,"column":1},"end":{"line":59,"column":1}},"children":[{"type":"text","value":"You may have previously heard of ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"PHJRbfBxX8"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"gradient descent","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"bLS3hYhnAJ"}],"key":"CuMhbCqEYm"},{"type":"text","value":" for minimizing functions.\nOptimization problems are usually posed as ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"G3Iq7baJLv"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"minimization","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"E8L9Zi3d8N"}],"key":"uk6Ui3ob3e"},{"type":"text","value":" problems by convention.\nHowever, in RL, we usually talk about ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"i4G6UYzLit"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"maximizing","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"WBnhUajyIi"}],"key":"IcwE2ffVAS"},{"type":"text","value":" the expected total reward,\nand so we perform gradient ","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"jClJ09Fo2I"},{"type":"emphasis","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"ascent","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"CtK80R65YR"}],"key":"UOKQOxoXcp"},{"type":"text","value":" instead.","position":{"start":{"line":56,"column":1},"end":{"line":56,"column":1}},"key":"Zp8E0o92BX"}],"key":"bhivjYQQRg"}],"key":"O8OvLdagP9"},{"type":"paragraph","position":{"start":{"line":62,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"strong","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"children":[{"type":"text","value":"Gradient ascent","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"key":"dhHiNLbysA"}],"key":"eLZqmkHrvL"},{"type":"text","value":" is a general optimization algorithm for any differentiable function.\nA suitable analogy for this algorithm is hiking up a mountain,\nwhere you keep taking steps in the steepest direction upwards.\nHere, your vertical position 
","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"key":"VVwfygTSZS"},{"type":"inlineMath","value":"y","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"html":"yyy","key":"mXOLtrkTuU"},{"type":"text","value":" is the function being optimized,\nand your horizontal position ","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"key":"WYmuGjuSef"},{"type":"inlineMath","value":"(x, z)","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"html":"(x,z)(x, z)(x,z)","key":"tC5tQuzLKG"},{"type":"text","value":" is the input to the function.\nThe ","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"key":"iodunwnZ8e"},{"type":"emphasis","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"children":[{"type":"text","value":"slope","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"key":"jqvdalXlpP"}],"key":"yGRGQcKkMB"},{"type":"text","value":" of the mountain at your current position is given by the ","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"key":"ZFzcxe7MdV"},{"type":"emphasis","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"children":[{"type":"text","value":"gradient","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"key":"w2z3BsJt3W"}],"key":"G3w8O59Qqo"},{"type":"text","value":",\nwritten ","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"key":"Yj9wHPtWpR"},{"type":"inlineMath","value":"\\nabla y(x, z) \\in \\mathbb{R}^2","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"html":"y(x,z)R2\\nabla y(x, z) \\in \\mathbb{R}^2y(x,z)R2","key":"P1NSxyT48V"},{"type":"text","value":".","position":{"start":{"line":62,"column":1},"end":{"line":62,"column":1}},"key":"xlUcAyaPSk"}],"key":"qG77h8DbTr"}],"key":"fxqag8KDKS"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def f(x, y):\n \"\"\"Himmelblau's function\"\"\"\n return (x**2 + y - 11)**2 + (x + y**2 - 7)**2\n\n# Create a grid of points\nx = jnp.linspace(-5, 5, 400)\ny = jnp.linspace(-5, 5, 400)\nX, Y = jnp.meshgrid(x, y)\nZ = f(X, Y)\n\n# Create the plot\nfig, ax = plt.subplots(figsize=(6, 6))\n\n# Plot the function using imshow\nimg = ax.imshow(Z, extent=[-5, 5, -5, 5], origin='lower')\n\n# Add color bar\nfig.colorbar(img, ax=ax)\n\n# Gradient computation using JAX\ntx, ty = 1.0, 1.0\ngx, gy = jax.grad(f, argnums=(0, 1))(tx, ty)\n\n# Scatter point\nax.scatter(tx, ty, color='red', s=100)\n\n# Add arrow representing the gradient\nax.arrow(tx, ty, gx * 0.01, gy * 0.01, head_width=0.3, head_length=0.3, fc='blue', ec='blue')\n\n# Add plot title\nax.set_title(\"Gradient ascent example\")\n\nplt.show()","visibility":"remove","key":"x2yXE9l10B"},{"type":"output","id":"cz8m2FT5KNPfywvswy4_2","data":[{"output_type":"display_data","metadata":{},"data":{"text/plain":{"content":"
","content_type":"text/plain"},"image/png":{"content_type":"image/png","hash":"1d74500d7a5d62ffa43debb29b4fba06","path":"/build/1d74500d7a5d62ffa43debb29b4fba06.png"}}}],"visibility":"show","key":"PyVc2DcNuu"}],"data":{"tags":[]},"visibility":"show","key":"bQfCYgBLaZ"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"children":[{"type":"text","value":"For differentiable functions, this can be thought of as the vector of partial derivatives,","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"RwnmB5OG4w"}],"key":"Jgkc2cOqnU"},{"type":"math","value":"\\nabla y(x, z) = \\begin{pmatrix}\n\\frac{\\partial y}{\\partial x} \\\\\n\\frac{\\partial y}{\\partial z}\n\\end{pmatrix}.","position":{"start":{"line":110,"column":1},"end":{"line":115,"column":1}},"html":"y(x,z)=(yxyz).\\nabla y(x, z) = \\begin{pmatrix}\n\\frac{\\partial y}{\\partial x} \\\\\n\\frac{\\partial y}{\\partial z}\n\\end{pmatrix}.y(x,z)=(xyzy).","enumerator":"1","key":"KUIQ7c6ZHm"},{"type":"paragraph","position":{"start":{"line":117,"column":1},"end":{"line":120,"column":1}},"children":[{"type":"text","value":"To calculate the ","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"ckuRdIP0ln"},{"type":"emphasis","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"children":[{"type":"text","value":"slope","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"GGSpgWI1sa"}],"key":"LYZnGMvbGF"},{"type":"text","value":" (aka “directional derivative”) of the mountain in a given direction ","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"zCaP9PAoPT"},{"type":"inlineMath","value":"(\\Delta x, \\Delta z)","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"html":"(Δx,Δz)(\\Delta x, \\Delta z)(Δx,Δz)","key":"NL4aaRqkuL"},{"type":"text","value":",\nyou take the dot product of the difference vector with the gradient.\nThis means that the direction with the highest slope is exactly the gradient itself,\nso we can describe the gradient ascent algorithm as follows:","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"UUCFHpYaNd"}],"key":"GdyAJGbjgQ"},{"type":"proof","kind":"definition","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Gradient ascent","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"UmnCX1S1Ol"}],"key":"zXQONoRffV"},{"type":"math","value":"\\begin{pmatrix}\nx^{k+1} \\\\ z^{k+1}\n\\end{pmatrix}\n= \n\\begin{pmatrix}\nx^{k} \\\\ z^{k}\n\\end{pmatrix}\n+\n\\eta \\nabla y(x^{k}, z^{k})","position":{"start":{"line":123,"column":1},"end":{"line":133,"column":1}},"html":"(xk+1zk+1)=(xkzk)+ηy(xk,zk)\\begin{pmatrix}\nx^{k+1} \\\\ z^{k+1}\n\\end{pmatrix}\n= \n\\begin{pmatrix}\nx^{k} \\\\ z^{k}\n\\end{pmatrix}\n+\n\\eta \\nabla y(x^{k}, z^{k})(xk+1zk+1)=(xkzk)+ηy(xk,zk)","enumerator":"2","key":"I5WXNVC0uF"}],"enumerator":"1","key":"BQo2sH2xXW"},{"type":"paragraph","position":{"start":{"line":136,"column":1},"end":{"line":137,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"key":"RYNzdrYgJ8"},{"type":"inlineMath","value":"k","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"html":"kkk","key":"ae7QCuV2Vp"},{"type":"text","value":" denotes the iteration of the algorithm and 
","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"key":"uY3C6nhA44"},{"type":"inlineMath","value":"\\eta > 0","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"html":"η>0\\eta > 0η>0","key":"ELmubQnBCW"},{"type":"text","value":" is a “step size” hyperparameter that controls the size of the steps we take.\n(Note that we could also vary the step size across iterations, that is, ","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"key":"Z9t15clhJD"},{"type":"inlineMath","value":"\\eta^0, \\dots, \\eta^K","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"html":"η0,,ηK\\eta^0, \\dots, \\eta^Kη0,,ηK","key":"pyhOG53JmT"},{"type":"text","value":".)","position":{"start":{"line":136,"column":1},"end":{"line":136,"column":1}},"key":"jJZewJrBgu"}],"key":"WCG3ecdTq2"},{"type":"paragraph","position":{"start":{"line":139,"column":1},"end":{"line":140,"column":1}},"children":[{"type":"text","value":"The case of a two-dimensional input is easy to visualize.\nBut this idea can be straightforwardly extended to higher-dimensional inputs.","position":{"start":{"line":139,"column":1},"end":{"line":139,"column":1}},"key":"PsXOoJYrRl"}],"key":"X0XDo6mRV3"},{"type":"paragraph","position":{"start":{"line":142,"column":1},"end":{"line":143,"column":1}},"children":[{"type":"text","value":"From now on, we’ll use ","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"key":"Ghi3BaxfLF"},{"type":"inlineMath","value":"J","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"html":"JJJ","key":"YhP5wHVpXC"},{"type":"text","value":" to denote the function we’re trying to maximize,\nand ","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"key":"w9xaJVfagv"},{"type":"text","value":"θ","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"key":"xwJXioLZor"},{"type":"text","value":" to denote the parameters being optimized over. 
(In the above example, ","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"key":"msCk4tY5VN"},{"type":"inlineMath","value":"\\theta = \\begin{pmatrix} x & z \\end{pmatrix}^\\top","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"html":"θ=(xz)\\theta = \\begin{pmatrix} x & z \\end{pmatrix}^\\topθ=(xz)","key":"Wemp2M9gfU"},{"type":"text","value":").","position":{"start":{"line":142,"column":1},"end":{"line":142,"column":1}},"key":"wrMSXFULic"}],"key":"rBYyuAhldh"},{"type":"paragraph","position":{"start":{"line":145,"column":1},"end":{"line":149,"column":1}},"children":[{"type":"text","value":"Notice that our parameters will stop changing once ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"RY3VlaOVfA"},{"type":"inlineMath","value":"\\nabla J(\\theta) = 0.","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"J(θ)=0.\\nabla J(\\theta) = 0.J(θ)=0.","key":"sfE3v760lJ"},{"type":"text","value":"\nOnce we reach this ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"D0cf7cagX9"},{"type":"strong","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"children":[{"type":"text","value":"stationary point,","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"zrsoBAq2FH"}],"key":"NCl49uHDOw"},{"type":"text","value":" our current parameters are ‘locally optimal’ in some sense;\nit’s impossible to increase the function by moving in any direction.\nIf ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"vivswhq3zB"},{"type":"inlineMath","value":"J","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"JJJ","key":"yExLqhtGtN"},{"type":"text","value":" is ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"FtcTv3k21q"},{"type":"emphasis","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"children":[{"type":"text","value":"convex","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"OeXlivmr2X"}],"key":"o8HVv7xIhz"},{"type":"text","value":", then the only point where this happens is at the ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"HKU0D1ufTW"},{"type":"emphasis","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"children":[{"type":"text","value":"global optimum.","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"XJBzmvUqlT"}],"key":"a6MyW72rRO"},{"type":"text","value":"\nOtherwise, if ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"tvKdfBbJpw"},{"type":"inlineMath","value":"J","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"JJJ","key":"J9SKIeHYRl"},{"type":"text","value":" is nonconvex, the best we can hope for is a ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"n6JPJIWHhz"},{"type":"emphasis","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"children":[{"type":"text","value":"local 
optimum.","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"ijiALtiLwr"}],"key":"c7QRXbfx7s"}],"key":"aGCBCy5dLE"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"s6Pu9Jw7SQ"}],"key":"BBFGMLIfd1"},{"type":"paragraph","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"children":[{"type":"text","value":"How does a computer compute the gradient of a function?","position":{"start":{"line":152,"column":1},"end":{"line":152,"column":1}},"key":"h8AxxAK6J9"}],"key":"unrwFQLWvF"},{"type":"paragraph","position":{"start":{"line":154,"column":1},"end":{"line":158,"column":1}},"children":[{"type":"text","value":"One way is ","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"mnOOmuHnyy"},{"type":"emphasis","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"text","value":"symbolic differentiation,","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"DcHdkgZDhe"}],"key":"jq8PFLMghc"},{"type":"text","value":"\nwhich is similar to the way you might compute it by hand:\nthe computer applies a list of rules to transform the ","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"TJdt2KYG3C"},{"type":"emphasis","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"children":[{"type":"text","value":"symbols","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"I1FVKwG6Vn"}],"key":"sc7FU7P8gZ"},{"type":"text","value":" involved.\nPython’s ","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"cXJq7T8XVi"},{"type":"inlineCode","value":"sympy","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"VceUJXjVuz"},{"type":"text","value":" package supports symbolic differentiation.\nHowever, functions implemented in code may not always have a straightforward symbolic representation.","position":{"start":{"line":154,"column":1},"end":{"line":154,"column":1}},"key":"b2c1WwYi1f"}],"key":"DlupPcs75k"},{"type":"paragraph","position":{"start":{"line":160,"column":1},"end":{"line":161,"column":1}},"children":[{"type":"text","value":"Another way is ","position":{"start":{"line":160,"column":1},"end":{"line":160,"column":1}},"key":"sXkXGEWrpQ"},{"type":"emphasis","position":{"start":{"line":160,"column":1},"end":{"line":160,"column":1}},"children":[{"type":"text","value":"numerical differentiation,","position":{"start":{"line":160,"column":1},"end":{"line":160,"column":1}},"key":"wNzqtc7DRQ"}],"key":"wcQ0qBtqUu"},{"type":"text","value":"\nwhich is based on the limit definition of a (directional) derivative:","position":{"start":{"line":160,"column":1},"end":{"line":160,"column":1}},"key":"us9UU8o18w"}],"key":"KWDCKZESWw"},{"type":"math","value":"\\nabla_{\\boldsymbol{u}} J(\\boldsymbol{x}) = \\lim_{\\varepsilon \\to 0}\n\\frac{J(\\boldsymbol{x} + \\varepsilon \\boldsymbol{u}) - J(\\boldsymbol{x})}{\\varepsilon}","position":{"start":{"line":163,"column":1},"end":{"line":166,"column":1}},"html":"uJ(x)=limε0J(x+εu)J(x)ε\\nabla_{\\boldsymbol{u}} J(\\boldsymbol{x}) = \\lim_{\\varepsilon \\to 0}\n\\frac{J(\\boldsymbol{x} + \\varepsilon \\boldsymbol{u}) - 
J(\\boldsymbol{x})}{\\varepsilon}uJ(x)=ε0limεJ(x+εu)J(x)","enumerator":"3","key":"wVdaOfjaXA"},{"type":"paragraph","position":{"start":{"line":168,"column":1},"end":{"line":173,"column":1}},"children":[{"type":"text","value":"Then, we can substitute a small value of ","position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"key":"C5USu84Ubt"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"html":"ε\\varepsilonε","key":"olfHSP4WCV"},{"type":"text","value":" on the r.h.s. to approximate the directional derivative.\nHow small, though? If we need an accurate estimate,\nwe may need such a small value of ","position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"key":"egNhSqOqNE"},{"type":"inlineMath","value":"\\varepsilon","position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"html":"ε\\varepsilonε","key":"bLhkrpsuYL"},{"type":"text","value":" that typical computers will run into rounding errors.\nAlso, to compute the full gradient,\nwe would need to compute the r.h.s. once for each input dimension.\nThis is an issue if computing ","position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"key":"KMYrINgZIm"},{"type":"inlineMath","value":"J","position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"html":"JJJ","key":"DRYrYyJcFl"},{"type":"text","value":" is expensive.","position":{"start":{"line":168,"column":1},"end":{"line":168,"column":1}},"key":"E7OiBHIVAF"}],"key":"LCge27WN6Z"},{"type":"paragraph","position":{"start":{"line":175,"column":1},"end":{"line":183,"column":1}},"children":[{"type":"strong","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"children":[{"type":"text","value":"Automatic differentiation","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"key":"xux8PnANXe"}],"key":"AwRhIDv5P0"},{"type":"text","value":" achieves the best of both worlds.\nLike symbolic differentiation,\nwe manually implement the derivative rules for a few basic operations.\nHowever, instead of executing these on the ","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"key":"pWH4IUyJ9J"},{"type":"emphasis","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"children":[{"type":"text","value":"symbols","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"key":"IBeMUZgdsU"}],"key":"MjU9QJiYEf"},{"type":"text","value":",\nwe execute them on the ","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"key":"d9fFglIOFy"},{"type":"emphasis","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"children":[{"type":"text","value":"values","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"key":"D3SNDGSrQ5"}],"key":"lqUDCbxIIi"},{"type":"text","value":" when the function gets called,\nlike in numerical differentiation.\nThis allows us to differentiate through programming constructs such as branches or loops,\nand doesn’t involve any arbitrarily small values.\n","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"key":"msweIhfkmm"},{"type":"cite","kind":"narrative","label":"baydin_automatic_2018","identifier":"baydin_automatic_2018","children":[{"type":"text","value":"Baydin ","key":"J9stN7DW7l"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"IQHMFWId2N"}],"key":"ZepkS5GVg7"},{"type":"text","value":" 
(2018)","key":"mqiP02Qnyk"}],"enumerator":"1","key":"mqO4LDbhzB"},{"type":"text","value":" provides an accessible survey of automatic differentiation.","position":{"start":{"line":175,"column":1},"end":{"line":175,"column":1}},"key":"O9OetI2UHD"}],"key":"iqsTgaePC3"}],"key":"NYKkUlzLto"}],"key":"PFE119Oghh"},{"type":"block","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":188,"column":1},"end":{"line":188,"column":1}},"children":[{"type":"text","value":"Stochastic gradient ascent","position":{"start":{"line":188,"column":1},"end":{"line":188,"column":1}},"key":"daSZBvJtEl"}],"identifier":"stochastic-gradient-ascent","label":"Stochastic gradient ascent","html_id":"stochastic-gradient-ascent","implicit":true,"enumerator":"2.1","key":"mCfRZibGej"},{"type":"paragraph","position":{"start":{"line":190,"column":1},"end":{"line":196,"column":1}},"children":[{"type":"text","value":"In real applications,\ncomputing the gradient of the target function is not so simple.\nAs an example from supervised learning, ","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"key":"XhETRkwUQy"},{"type":"inlineMath","value":"J(\\theta)","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"html":"J(θ)J(\\theta)J(θ)","key":"h7GTVDClqS"},{"type":"text","value":" might be the sum of squared prediction errors across an entire training dataset.\nHowever, if our dataset is very large, it might not fit into our computer’s memory!\nIn these cases, we often compute some ","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"key":"Pkbz0kdjbw"},{"type":"emphasis","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"children":[{"type":"text","value":"estimate","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"key":"IL9IjJxwBi"}],"key":"i5NIQFQ26x"},{"type":"text","value":" of the gradient at each step, ","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"key":"SU4ygiwh0c"},{"type":"inlineMath","value":"\\tilde \\nabla J(\\theta)","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"html":"~J(θ)\\tilde \\nabla J(\\theta)~J(θ)","key":"NVD78C4ZPw"},{"type":"text","value":", and walk in that direction instead.\nThis is called ","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"key":"im195JErRm"},{"type":"strong","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"children":[{"type":"text","value":"stochastic","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"key":"hhN4d4ukOa"}],"key":"VKbVGQOAVs"},{"type":"text","value":" gradient ascent.\nIn the SL example above, we might randomly choose a ","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"key":"zHla4p0BWC"},{"type":"emphasis","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"children":[{"type":"text","value":"minibatch","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"key":"W4CJkjIPoX"}],"key":"DXw6ehJF4h"},{"type":"text","value":" of samples and use them to estimate the true prediction error. 
(This approach is known as ","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"key":"fp3nvD1s2s"},{"type":"strong","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"children":[{"type":"text","value":"minibatch","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"key":"o6oZWX9WOr"}],"key":"jX81uVDtQ9"},{"type":"text","value":" SGD","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"key":"jjDeNwgcBz"}],"key":"T8RtLSiGqr"},{"type":"text","value":".)","position":{"start":{"line":190,"column":1},"end":{"line":190,"column":1}},"key":"iCCK2Tqw87"}],"key":"szp9T7nyJJ"}],"key":"EgpLreOaFJ"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def sgd(\n theta_init: Array,\n estimate_gradient: Callable[[Array], Array],\n η: float,\n n_steps: int,\n):\n \"\"\"Perform `n_steps` steps of SGD.\n\n `estimate_gradient` eats the current parameters and returns an estimate of the objective function's gradient at those parameters.\n \"\"\"\n θ = theta_init\n for step in range(n_steps):\n θ += η * estimate_gradient(θ)\n return θ","key":"A9e7MnpWns"},{"type":"output","id":"rgB2UOsFVQ-DHCSLrSkhp","data":[],"key":"pExrzuPdlx"}],"data":{},"key":"UhrEAtoDh7"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":215,"column":1},"end":{"line":216,"column":1}},"children":[{"type":"text","value":"What makes one gradient estimator better than another?\nIdeally, we want this estimator to be ","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"key":"nbMOVixy97"},{"type":"strong","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"children":[{"type":"text","value":"unbiased;","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"key":"FTUeTLeaLb"}],"key":"R7Z4Y4FR1t"},{"type":"text","value":" that is, on average, it matches a single true gradient step:","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"key":"sIu7cMxyPa"}],"key":"VpdJgWakak"},{"type":"math","value":"\\E [\\tilde \\nabla J(\\theta)] = \\nabla J(\\theta).","position":{"start":{"line":218,"column":1},"end":{"line":220,"column":1}},"html":"E[~J(θ)]=J(θ).\\E [\\tilde \\nabla J(\\theta)] = \\nabla J(\\theta).E[~J(θ)]=J(θ).","enumerator":"4","key":"hZGbtJRcLP"},{"type":"paragraph","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"children":[{"type":"text","value":"We also want the ","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"RIflgKE6c7"},{"type":"emphasis","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"children":[{"type":"text","value":"variance","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"owRDdtB14K"}],"key":"aV1PA2ZGLR"},{"type":"text","value":" of the estimator to be low so that its performance doesn’t change drastically at each step.","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"L14LQdyj8y"}],"key":"kJrwes9FUc"},{"type":"paragraph","position":{"start":{"line":224,"column":1},"end":{"line":225,"column":1}},"children":[{"type":"text","value":"We can actually show that, for many “nice” functions, in a finite number of steps, SGD will find a 
","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"key":"V0l9lccWct"},{"type":"text","value":"θ","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"key":"xdynO4aIhN"},{"type":"text","value":" that is “close” to a stationary point.\nIn another perspective, for such functions, the local “landscape” of ","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"key":"qbld7R5D9o"},{"type":"inlineMath","value":"J","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"html":"JJJ","key":"Q2akJ7z0gO"},{"type":"text","value":" around ","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"key":"xD1FYBoTLn"},{"type":"text","value":"θ","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"key":"YmTZnBmHMV"},{"type":"text","value":" becomes flatter and flatter the longer we run SGD.","position":{"start":{"line":224,"column":1},"end":{"line":224,"column":1}},"key":"xtGN5HKiyX"}],"key":"c36hGiEPHq"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"SGD convergence","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"CInk3DSmSE"}],"key":"zDxtGxjFd3"},{"type":"paragraph","position":{"start":{"line":228,"column":1},"end":{"line":231,"column":1}},"children":[{"type":"text","value":"More formally, suppose we run SGD for ","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"key":"pH1ElZPEB9"},{"type":"inlineMath","value":"K","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"html":"KKK","key":"RUpN10V2jq"},{"type":"text","value":" steps, using an unbiased gradient estimator.\nLet the step size ","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"key":"tYFdTV4Y1V"},{"type":"inlineMath","value":"\\eta^k","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"html":"ηk\\eta^kηk","key":"D8cJ6NX5E4"},{"type":"text","value":" scale as ","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"key":"ThFs6NxvI3"},{"type":"inlineMath","value":"O(1/\\sqrt{k}).","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"html":"O(1/k).O(1/\\sqrt{k}).O(1/k).","key":"h5cG20hsup"},{"type":"text","value":"\nThen if ","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"key":"PHKIrSNVb9"},{"type":"inlineMath","value":"J","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"html":"JJJ","key":"msgCNXxRxh"},{"type":"text","value":" is bounded and ","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"key":"Mh2C4tT8Ws"},{"type":"text","value":"β","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"key":"yTtJXCZbBe"},{"type":"text","value":"-smooth (see below),\nand the ","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"key":"mBYc00M8kE"},{"type":"emphasis","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"children":[{"type":"text","value":"norm","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"key":"rkgKtDdzrY"}],"key":"Et7iDYJSkB"},{"type":"text","value":" of the gradient estimator has a bounded second moment 
","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"key":"JcVVL1o25B"},{"type":"inlineMath","value":"\\sigma^2,","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"html":"σ2,\\sigma^2,σ2,","key":"ZcSE1ThwQp"}],"key":"YSdbQIP8DP"},{"type":"math","value":"\\|\\nabla J(\\theta^K)\\|^2 \\le O \\left( M \\beta \\sigma^2 / K\\right).","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"html":"J(θK)2O(Mβσ2/K).\\|\\nabla J(\\theta^K)\\|^2 \\le O \\left( M \\beta \\sigma^2 / K\\right).∥∇J(θK)2O(σ2/K).","enumerator":"5","key":"E2A2fu1Ieq"},{"type":"paragraph","position":{"start":{"line":235,"column":1},"end":{"line":235,"column":1}},"children":[{"type":"text","value":"We call a function ","position":{"start":{"line":235,"column":1},"end":{"line":235,"column":1}},"key":"s9uqdT342H"},{"type":"text","value":"β","position":{"start":{"line":235,"column":1},"end":{"line":235,"column":1}},"key":"NlbWdHSKD7"},{"type":"text","value":"-smooth if its gradient is Lipschitz continuous with constant ","position":{"start":{"line":235,"column":1},"end":{"line":235,"column":1}},"key":"f6etL0P76G"},{"type":"text","value":"β","position":{"start":{"line":235,"column":1},"end":{"line":235,"column":1}},"key":"l17VcojI2B"},{"type":"text","value":":","position":{"start":{"line":235,"column":1},"end":{"line":235,"column":1}},"key":"Bbnjnmg2lQ"}],"key":"PH7GDVN3eo"},{"type":"math","value":"\\|\\nabla J(\\theta) - \\nabla J(\\theta')\\| \\le \\beta \\|\\theta - \\theta'\\|.","position":{"start":{"line":237,"column":1},"end":{"line":237,"column":1}},"html":"J(θ)J(θ)βθθ.\\|\\nabla J(\\theta) - \\nabla J(\\theta')\\| \\le \\beta \\|\\theta - \\theta'\\|.∥∇J(θ)J(θ)βθθ∥.","enumerator":"6","key":"Joj8fQnmSS"}],"key":"HEDT1bTfuj"},{"type":"paragraph","position":{"start":{"line":240,"column":1},"end":{"line":240,"column":1}},"children":[{"type":"text","value":"We’ll now see a concrete application of gradient ascent in the context of policy optimization.","position":{"start":{"line":240,"column":1},"end":{"line":240,"column":1}},"key":"Fu1AE75teL"}],"key":"NB1m8OGlWX"}],"key":"K4cjrGB3ax"},{"type":"block","position":{"start":{"line":242,"column":1},"end":{"line":242,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":244,"column":1},"end":{"line":244,"column":1}},"children":[{"type":"text","value":"Policy (stochastic) gradient ascent","position":{"start":{"line":244,"column":1},"end":{"line":244,"column":1}},"key":"XCt0M9GLxS"}],"identifier":"policy-stochastic-gradient-ascent","label":"Policy (stochastic) gradient ascent","html_id":"policy-stochastic-gradient-ascent","implicit":true,"enumerator":"3","key":"rz3DRTqGrY"},{"type":"paragraph","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"children":[{"type":"text","value":"Remember that in RL, the primary goal is to find the ","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"n0Y3qqqdp0"},{"type":"emphasis","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"children":[{"type":"text","value":"optimal policy","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"S1Sxqs5gFh"}],"key":"tNSVJceMoQ"},{"type":"text","value":" that achieves the maximimum total reward, which we can express using the value function we defined in 
","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"wYnEZ1o7aF"},{"type":"crossReference","kind":"proof:definition","identifier":"value","label":"value","children":[{"type":"text","value":"Definition ","key":"T1IA2QV9MI"},{"type":"text","value":"1.6","key":"ApnkSUOv7M"}],"template":"Definition %s","enumerator":"1.6","resolved":true,"html_id":"value","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"AuGqxhPpqV"},{"type":"text","value":":","position":{"start":{"line":246,"column":1},"end":{"line":246,"column":1}},"key":"lN7ULBeXuN"}],"key":"FOvjgI9RQh"},{"type":"math","value":"\\begin{aligned}\n J(\\pi) := \\E_{s_0 \\sim \\mu_0} V^{\\pi} (s_0) = & \\E_{\\tau \\sim \\rho^\\pi} \\sum_{\\hi=0}^{\\hor-1} r(s_\\hi, a_\\hi)\n\\end{aligned}","label":"objective_fn","identifier":"objective_fn","html":"J(π):=Es0μ0Vπ(s0)=Eτρπh=0H1r(sh,ah)\\begin{aligned}\n J(\\pi) := \\E_{s_0 \\sim \\mu_0} V^{\\pi} (s_0) = & \\E_{\\tau \\sim \\rho^\\pi} \\sum_{\\hi=0}^{\\hor-1} r(s_\\hi, a_\\hi)\n\\end{aligned}J(π):=Es0μ0Vπ(s0)=Eτρπh=0H1r(sh,ah)","enumerator":"7","html_id":"objective-fn","key":"tRgCgMBNm7"},{"type":"paragraph","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"XpH0zzwKwF"},{"type":"inlineMath","value":"\\rho^\\pi","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"html":"ρπ\\rho^\\piρπ","key":"AlBJ87Jtmu"},{"type":"text","value":" is the distribution over trajectories induced by ","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"PMHxbDrSWP"},{"type":"text","value":"π","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"VWJCkuKeMw"},{"type":"text","value":" (see ","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"SnVP9XLcT6"},{"type":"crossReference","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"text","value":"Definition ","key":"vLPC2fXH7h"},{"type":"text","value":"1.5","key":"X1D89ATEqz"}],"identifier":"autoregressive_trajectories","label":"autoregressive_trajectories","kind":"proof:definition","template":"Definition %s","enumerator":"1.5","resolved":true,"html_id":"autoregressive-trajectories","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"CUkYDu05gb"},{"type":"text","value":").","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"N5QVRm50jV"}],"key":"XTxk4ghK7E"},{"type":"paragraph","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"(Note that we’ll continue to work in the ","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"sERp6KsrVh"},{"type":"emphasis","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"undiscounted, finite-horizon case.","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"Pli1bdDHEX"}],"key":"Nypo7bumqw"},{"type":"text","value":" Analogous results hold for the ","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"srP5D8TSKz"},{"type":"emphasis","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"discounted, infinite-horizon 
setup.","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"JzDFl7l18j"}],"key":"MkZ5UmWSsM"},{"type":"text","value":")","position":{"start":{"line":258,"column":1},"end":{"line":258,"column":1}},"key":"s2FpbrPuAA"}],"key":"hJxu3VDqpi"},{"type":"paragraph","position":{"start":{"line":260,"column":1},"end":{"line":265,"column":1}},"children":[{"type":"text","value":"As shown by the notation, this is exactly the function ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"NLqCgQxSiW"},{"type":"inlineMath","value":"J","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"JJJ","key":"xNFg2u58fz"},{"type":"text","value":" that we want to maximize using gradient ascent.\nWhat variables are we optimizing over in this problem?\nWell, the objective function ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"QIpLiH6dpU"},{"type":"inlineMath","value":"J","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"html":"JJJ","key":"cZ0scXYhog"},{"type":"text","value":" is a function of the policy ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"c5V1FfOhzY"},{"type":"text","value":"π","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"HVlgpXXf2G"},{"type":"text","value":",\nbut in general, ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"GbIWXNJtsJ"},{"type":"text","value":"π","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"ulth6YrokV"},{"type":"text","value":" is a function,\nand optimizing over the entire space of arbitrary input-output mappings would be intractable.\nInstead, we need to describe ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"pyJFrjWZTV"},{"type":"text","value":"π","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"sB7q4tVT7m"},{"type":"text","value":" in terms of some finite set of ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"KVoRRxkZZe"},{"type":"emphasis","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"children":[{"type":"text","value":"parameters","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"MGfDjTT9jU"}],"key":"e6qGaNkAmZ"},{"type":"text","value":" ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"F9T7fziReG"},{"type":"text","value":"θ","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"EoCXllXOMD"},{"type":"text","value":".","position":{"start":{"line":260,"column":1},"end":{"line":260,"column":1}},"key":"FtHeQgf2TE"}],"key":"d0GnViqMT5"}],"key":"D1WpjJZikM"},{"type":"block","position":{"start":{"line":267,"column":1},"end":{"line":267,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"children":[{"type":"text","value":"Example policy parameterizations","position":{"start":{"line":270,"column":1},"end":{"line":270,"column":1}},"key":"rtyVuX4pZq"}],"label":"parameterizations","identifier":"parameterizations","html_id":"parameterizations","enumerator":"3.1","key":"SOUOi07Ffl"},{"type":"paragraph","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"children":[{"type":"text","value":"What are some ways we could parameterize our 
policy?","position":{"start":{"line":272,"column":1},"end":{"line":272,"column":1}},"key":"FZyGdxTkS8"}],"key":"ytdbt8XcDK"}],"key":"ugVZq6fzC3"},{"type":"block","position":{"start":{"line":274,"column":1},"end":{"line":274,"column":1}},"children":[{"type":"proof","kind":"example","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tabular representation","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"sS82R19uli"}],"key":"Rz46hvuI28"},{"type":"paragraph","position":{"start":{"line":278,"column":1},"end":{"line":281,"column":1}},"children":[{"type":"text","value":"If both the state and action spaces are finite, perhaps we could simply learn a preference value ","position":{"start":{"line":278,"column":1},"end":{"line":278,"column":1}},"key":"G8MFk0uHTs"},{"type":"inlineMath","value":"\\theta_{s,a}","position":{"start":{"line":278,"column":1},"end":{"line":278,"column":1}},"html":"θs,a\\theta_{s,a}θs,a","key":"Re4PmwLxnj"},{"type":"text","value":" for each state-action pair.\nThen to turn this into a valid distribution, we perform a ","position":{"start":{"line":278,"column":1},"end":{"line":278,"column":1}},"key":"aaqhx0A71n"},{"type":"strong","position":{"start":{"line":278,"column":1},"end":{"line":278,"column":1}},"children":[{"type":"text","value":"softmax","position":{"start":{"line":278,"column":1},"end":{"line":278,"column":1}},"key":"t38iyO9Wvp"}],"key":"CAkTUhWKbb"},{"type":"text","value":" operation:\nwe exponentiate each of them,\nand then normalize to form a valid distribution:","position":{"start":{"line":278,"column":1},"end":{"line":278,"column":1}},"key":"qrnEMd97eU"}],"key":"uAjtHWkWda"},{"type":"math","value":"\\pi^\\text{softmax}_\\theta(a | s) = \\frac{\\exp(\\theta_{s,a})}{\\sum_{s,a'} \\exp (\\theta_{s,a'})}.","position":{"start":{"line":283,"column":1},"end":{"line":283,"column":1}},"html":"πθsoftmax(as)=exp(θs,a)s,aexp(θs,a).\\pi^\\text{softmax}_\\theta(a | s) = \\frac{\\exp(\\theta_{s,a})}{\\sum_{s,a'} \\exp (\\theta_{s,a'})}.πθsoftmax(as)=s,aexp(θs,a)exp(θs,a).","enumerator":"8","key":"O95dSVGPRR"},{"type":"paragraph","position":{"start":{"line":285,"column":1},"end":{"line":286,"column":1}},"children":[{"type":"text","value":"However, this doesn’t make use of any structure in the states or actions,\nso while this is flexible, it is also prone to overfitting.","position":{"start":{"line":285,"column":1},"end":{"line":285,"column":1}},"key":"FWCfxebvUb"}],"key":"utQIDVmVni"}],"enumerator":"1","key":"DemjuYtVTF"},{"type":"proof","kind":"example","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Linear in features","position":{"start":{"line":289,"column":1},"end":{"line":289,"column":1}},"key":"X0MwAxFE71"}],"key":"h0SjljVfkn"},{"type":"paragraph","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"children":[{"type":"text","value":"Another approach is to map each state-action pair into some ","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"jnjBO1uTFE"},{"type":"strong","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"children":[{"type":"text","value":"feature space","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"dWuBNSxRKf"}],"key":"Tb8e5pG5yM"},{"type":"text","value":" ","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"ebt0kb9ZM5"},{"type":"inlineMath","value":"\\phi(s, a) \\in 
\\mathbb{R}^p","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"html":"ϕ(s,a)Rp\\phi(s, a) \\in \\mathbb{R}^pϕ(s,a)Rp","key":"YyXZdFzzWF"},{"type":"text","value":". Then, to map a feature vector to a probability, we take a linear combination of the features and take a softmax:","position":{"start":{"line":291,"column":1},"end":{"line":291,"column":1}},"key":"yUzqhfF37R"}],"key":"G3ZQIvHAnE"},{"type":"math","value":"\\pi^\\text{linear in features}_{\\theta}(a|s) = \\frac{\\exp(\\theta^\\top \\phi(s, a))}{\\sum_{a'} \\exp(\\theta^\\top \\phi(s, a'))}.","position":{"start":{"line":293,"column":1},"end":{"line":293,"column":1}},"html":"πθlinear in features(as)=exp(θϕ(s,a))aexp(θϕ(s,a)).\\pi^\\text{linear in features}_{\\theta}(a|s) = \\frac{\\exp(\\theta^\\top \\phi(s, a))}{\\sum_{a'} \\exp(\\theta^\\top \\phi(s, a'))}.πθlinear in features(as)=aexp(θϕ(s,a))exp(θϕ(s,a)).","enumerator":"9","key":"blAqCxid1P"},{"type":"paragraph","position":{"start":{"line":295,"column":1},"end":{"line":295,"column":1}},"children":[{"type":"text","value":"Another interpretation is that ","position":{"start":{"line":295,"column":1},"end":{"line":295,"column":1}},"key":"wb3DdB9niP"},{"type":"text","value":"θ","position":{"start":{"line":295,"column":1},"end":{"line":295,"column":1}},"key":"SLORQZk7r8"},{"type":"text","value":" represents the feature vector of the “desired” state-action pair, as state-action pairs whose features align closely with ","position":{"start":{"line":295,"column":1},"end":{"line":295,"column":1}},"key":"DlL9Z2hkiJ"},{"type":"text","value":"θ","position":{"start":{"line":295,"column":1},"end":{"line":295,"column":1}},"key":"UJIF3BZ1yA"},{"type":"text","value":" are given higher probability.","position":{"start":{"line":295,"column":1},"end":{"line":295,"column":1}},"key":"moJjvjLPNa"}],"key":"ovqMrv4YT9"}],"enumerator":"2","key":"ORkj3EkQrI"},{"type":"proof","kind":"example","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Neural policies","position":{"start":{"line":298,"column":1},"end":{"line":298,"column":1}},"key":"KBelpLHRQq"}],"key":"RBVlWQNHbz"},{"type":"paragraph","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"children":[{"type":"text","value":"More generally, we could map states and actions to unnormalized scores via some parameterized function ","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"OOhfokQtxs"},{"type":"inlineMath","value":"f_\\theta : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R},","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"html":"fθ:S×AR,f_\\theta : \\mathcal{S} \\times \\mathcal{A} \\to \\mathbb{R},fθ:S×AR,","key":"tLqxKrMPHN"},{"type":"text","value":" such as a neural network, and choose actions according to a softmax: ","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"Ii6UbAqHki"}],"key":"yGkjsnxoA1"},{"type":"math","value":"\\pi^\\text{general}_\\theta(a|s) = \\frac{\\exp(f_{\\theta}(s,a))}{\\sum_{a'} \\exp(f_{\\theta}(s,a'))}.","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"tight":"before","html":"πθgeneral(as)=exp(fθ(s,a))aexp(fθ(s,a)).\\pi^\\text{general}_\\theta(a|s) = \\frac{\\exp(f_{\\theta}(s,a))}{\\sum_{a'} 
\\exp(f_{\\theta}(s,a'))}.πθgeneral(as)=aexp(fθ(s,a))exp(fθ(s,a)).","enumerator":"10","key":"tfKzsJNPLZ"}],"enumerator":"3","key":"blUCq21F3P"}],"key":"lEWbMEMgki"},{"type":"block","position":{"start":{"line":303,"column":1},"end":{"line":303,"column":1}},"children":[{"type":"proof","kind":"example","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Diagonal Gaussian policies for continuous action spaces","position":{"start":{"line":305,"column":1},"end":{"line":305,"column":1}},"key":"AU7I1RyyrO"}],"key":"yZHIf6Fr35"},{"type":"paragraph","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"children":[{"type":"text","value":"Consider a continuous ","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"key":"QicgW9ka0x"},{"type":"inlineMath","value":"n","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"html":"nnn","key":"itSp4Q2oS4"},{"type":"text","value":"-dimensional action space ","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"key":"GWgGw2Wl4X"},{"type":"inlineMath","value":"\\mathcal{A} = \\mathbb{R}^n","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"html":"A=Rn\\mathcal{A} = \\mathbb{R}^nA=Rn","key":"HenBKvjW1q"},{"type":"text","value":". Then for a stochastic policy, we could use a function to predict the ","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"key":"jwZhAublp8"},{"type":"emphasis","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"children":[{"type":"text","value":"mean","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"key":"zduooFv17v"}],"key":"w9NOycrAag"},{"type":"text","value":" action and then add some random noise about it. For example, we could use a neural network to predict the mean action ","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"key":"mj58RSMbJd"},{"type":"inlineMath","value":"\\mu_\\theta(s)","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"html":"μθ(s)\\mu_\\theta(s)μθ(s)","key":"l4h9r1eLP7"},{"type":"text","value":" and then add some noise ","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"key":"uR1pDLidvA"},{"type":"inlineMath","value":"\\epsilon \\sim \\mathcal{N}(0, \\sigma^2 I)","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"html":"ϵN(0,σ2I)\\epsilon \\sim \\mathcal{N}(0, \\sigma^2 I)ϵN(0,σ2I)","key":"nWfcYSXUNs"},{"type":"text","value":" to it:","position":{"start":{"line":307,"column":1},"end":{"line":307,"column":1}},"key":"FFlZNE2DAr"}],"key":"PM9u19gOcB"},{"type":"math","value":"\\pi_\\theta(a|s) = \\mathcal{N}(\\mu_\\theta(s), \\sigma^2 I).","position":{"start":{"line":309,"column":1},"end":{"line":309,"column":1}},"html":"πθ(as)=N(μθ(s),σ2I).\\pi_\\theta(a|s) = \\mathcal{N}(\\mu_\\theta(s), \\sigma^2 I).πθ(as)=N(μθ(s),σ2I).","enumerator":"11","key":"STixyms4Pt"}],"enumerator":"4","key":"TIVjUn4TGR"},{"type":"comment","value":" **Exercise:** Can you extend the \"linear in features\" policy to continuous action spaces in a similar way? 
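","key":"addedCommentSplitKey"},{"type":"paragraph","children":[{"type":"text","value":"To make the examples above concrete, the following is a minimal sketch of the tabular, linear-in-features, neural, and diagonal Gaussian parameterizations in JAX. It is not part of the original notes: the action-space size num_actions, the feature map phi, the score function f, and the mean network mu are assumptions made purely for illustration.","key":"addedLeadInTextKey"}],"key":"addedLeadInParaKey"},{"type":"code","lang":"python","value":"import jax\nimport jax.numpy as jnp\n\n# Hypothetical sketch (not from the original notes). Assumed ingredients:\n#   num_actions    : size of a finite action space\n#   phi(s, a)      : feature map returning a vector in R^p\n#   f(theta, s, a) : any differentiable score function, e.g. a small MLP\n#   mu(theta, s)   : parameterized mean for a continuous action space\n\ndef tabular_policy(theta, s):\n    # theta has shape (num_states, num_actions); normalizing over actions only\n    # turns each row of preferences into a valid distribution over actions\n    return jax.nn.softmax(theta[s])\n\ndef linear_policy(theta, phi, s, num_actions):\n    # softmax over linear scores theta^T phi(s, a)\n    logits = jnp.stack([theta @ phi(s, a) for a in range(num_actions)])\n    return jax.nn.softmax(logits)\n\ndef neural_policy(theta, f, s, num_actions):\n    # softmax over arbitrary parameterized scores f(theta, s, a)\n    logits = jnp.stack([f(theta, s, a) for a in range(num_actions)])\n    return jax.nn.softmax(logits)\n\ndef gaussian_policy_sample(key, theta, mu, s, sigma):\n    # diagonal Gaussian policy: predicted mean plus isotropic Gaussian noise\n    mean = mu(theta, s)\n    return mean + sigma * jax.random.normal(key, mean.shape)","key":"addedPolicySketchKey"},{"type":"comment","value":" 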
","key":"PDGoZdaoKE"}],"key":"OBovJZdG04"},{"type":"block","position":{"start":{"line":315,"column":1},"end":{"line":315,"column":1}},"children":[{"type":"paragraph","position":{"start":{"line":319,"column":1},"end":{"line":321,"column":1}},"children":[{"type":"text","value":"Now that we have seen some examples of parameterized policies,\nwe will write the total reward in terms of the parameters,\noverloading notation and letting ","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"zm3DFcPJEq"},{"type":"inlineMath","value":"\\rho_\\theta := \\rho^{\\pi_\\theta}","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"html":"ρθ:=ρπθ\\rho_\\theta := \\rho^{\\pi_\\theta}ρθ:=ρπθ","key":"h4H8Mdnpjh"},{"type":"text","value":":","position":{"start":{"line":319,"column":1},"end":{"line":319,"column":1}},"key":"rQK7dvYtiX"}],"key":"ORJpOaFtsq"},{"type":"math","value":"J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} R(\\tau)","position":{"start":{"line":323,"column":1},"end":{"line":323,"column":1}},"html":"J(θ)=EτρθR(τ)J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} R(\\tau)J(θ)=EτρθR(τ)","enumerator":"12","key":"N93eeIYOiM"},{"type":"paragraph","position":{"start":{"line":325,"column":1},"end":{"line":325,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":325,"column":1},"end":{"line":325,"column":1}},"key":"C1BvNk7lH0"},{"type":"inlineMath","value":"R(\\tau) = \\sum_{\\hi=0}^{\\hor-1} r(s_\\hi, a_\\hi)","position":{"start":{"line":325,"column":1},"end":{"line":325,"column":1}},"html":"R(τ)=h=0H1r(sh,ah)R(\\tau) = \\sum_{\\hi=0}^{\\hor-1} r(s_\\hi, a_\\hi)R(τ)=h=0H1r(sh,ah)","key":"xVNhGhdN27"},{"type":"text","value":" denotes the total reward in the trajectory.","position":{"start":{"line":325,"column":1},"end":{"line":325,"column":1}},"key":"ijFcqJ68Dl"}],"key":"aQsG2ZIhts"},{"type":"paragraph","position":{"start":{"line":327,"column":1},"end":{"line":328,"column":1}},"children":[{"type":"text","value":"Now how do we maximize this function (the expected total reward) over the parameters?\nOne simple idea would be to directly apply gradient ascent:","position":{"start":{"line":327,"column":1},"end":{"line":327,"column":1}},"key":"jYj9hBOHYJ"}],"key":"cfUjXcWeju"},{"type":"math","value":"\\theta^{k+1} = \\theta^k + \\eta \\nabla J(\\theta^k).","position":{"start":{"line":330,"column":1},"end":{"line":332,"column":1}},"html":"θk+1=θk+ηJ(θk).\\theta^{k+1} = \\theta^k + \\eta \\nabla J(\\theta^k).θk+1=θk+ηJ(θk).","enumerator":"13","key":"Wmav0thK2K"},{"type":"paragraph","position":{"start":{"line":334,"column":1},"end":{"line":336,"column":1}},"children":[{"type":"text","value":"In order to apply this technique, we need to be able to evaluate the gradient ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"tLsrBa5Hk8"},{"type":"inlineMath","value":"\\nabla J(\\theta).","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"J(θ).\\nabla J(\\theta).J(θ).","key":"sH42rYYHXn"},{"type":"text","value":"\nBut ","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"ku2ZTBm5zQ"},{"type":"inlineMath","value":"J(\\theta)","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"J(θ)J(\\theta)J(θ)","key":"jCXox8udi5"},{"type":"text","value":" is very difficult, or even intractable, to compute exactly, since it involves taking an expectation over all possible trajectories 
","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"wPW80q8Qa6"},{"type":"inlineMath","value":"\\tau.","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"html":"τ.\\tau.τ.","key":"X1GmoaL4D3"},{"type":"text","value":"\nCan we rewrite it in a form that’s more convenient to implement?","position":{"start":{"line":334,"column":1},"end":{"line":334,"column":1}},"key":"lGSEgw9O44"}],"key":"w4gypl8FjK"}],"key":"wxGC5SEyNf"},{"type":"block","position":{"start":{"line":338,"column":1},"end":{"line":338,"column":1}},"children":[{"type":"heading","depth":3,"position":{"start":{"line":341,"column":1},"end":{"line":341,"column":1}},"children":[{"type":"text","value":"Importance Sampling","position":{"start":{"line":341,"column":1},"end":{"line":341,"column":1}},"key":"SAkT0VIRXq"}],"label":"importance_sampling","identifier":"importance_sampling","html_id":"importance-sampling","enumerator":"3.2","key":"lcpI4nErHd"},{"type":"paragraph","position":{"start":{"line":343,"column":1},"end":{"line":352,"column":1}},"children":[{"type":"text","value":"There is a general trick called ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"zxNuL0luM7"},{"type":"strong","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"text","value":"importance sampling","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"PXPSgx0cap"}],"key":"v0DCCZRrRc"},{"type":"text","value":" for evaluating difficult expectations.\nSuppose we want to estimate ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"vZdstLLQSN"},{"type":"inlineMath","value":"\\E_{x \\sim p}[f(x)]","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"Exp[f(x)]\\E_{x \\sim p}[f(x)]Exp[f(x)]","key":"M3Bi7eGgOF"},{"type":"text","value":" where ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"B0rf8J1bVI"},{"type":"inlineMath","value":"p","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"ppp","key":"xs4WUjECCW"},{"type":"text","value":" is hard or expensive to sample from,\nbut easy to evaluate the likelihood ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"LiGSOf1Ysd"},{"type":"inlineMath","value":"p(x)","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"p(x)p(x)p(x)","key":"kPhgHcCua4"},{"type":"text","value":" of.\nSuppose that we ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"CdDJuUiPjc"},{"type":"emphasis","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"text","value":"can","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"TDnwtzfm05"}],"key":"YqVtG4GJls"},{"type":"text","value":" easily sample from a different distribution ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"mdJeuli6sH"},{"type":"inlineMath","value":"q","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"qqq","key":"Ecz73ZrFSZ"},{"type":"text","value":".\nSince an expectation is just a weighted average, we can sample 
","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"nlGMWs2LSU"},{"type":"inlineMath","value":"x","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"xxx","key":"lRlNyxVzbY"},{"type":"text","value":" from ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"pJVvpByRJr"},{"type":"inlineMath","value":"q","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"qqq","key":"kQvtexuAt6"},{"type":"text","value":", compute ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"uZoUX2u8F4"},{"type":"inlineMath","value":"f(x)","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"f(x)f(x)f(x)","key":"bQOvXf6PwH"},{"type":"text","value":", and then reweight the results:\nif ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"cJBtH7YLeA"},{"type":"inlineMath","value":"x","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"xxx","key":"H3FFUF6tbs"},{"type":"text","value":" is very likely under ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"ANeuzQBfqE"},{"type":"inlineMath","value":"p","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"ppp","key":"MEk75JpKK1"},{"type":"text","value":" but unlikely under ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"EU2qJIpMie"},{"type":"inlineMath","value":"q","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"qqq","key":"yM840qWwt2"},{"type":"text","value":",\nwe should boost its weighting,\nand if it is common under ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"EhDYnoI2iO"},{"type":"inlineMath","value":"q","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"qqq","key":"r6kWgm8Ath"},{"type":"text","value":" but uncommon under ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"zNPDZFWitr"},{"type":"inlineMath","value":"p","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"ppp","key":"HFlbCNJBI5"},{"type":"text","value":",\nwe should lower its weighting.\nThe reweighting factor is exactly the ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"AuOd68qyDE"},{"type":"strong","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"children":[{"type":"text","value":"likelihood ratio","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"d5d6UUa9Pg"}],"key":"VbUBy7nMKc"},{"type":"text","value":" between the target distribution ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"y5oqhVXlSS"},{"type":"inlineMath","value":"p","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"ppp","key":"xGtH2fcCXK"},{"type":"text","value":" and the sampling distribution ","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"heSYPLrfRp"},{"type":"inlineMath","value":"q","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"html":"qqq","key":"Ukq8FQPIYO"},{"type":"text","value":":","position":{"start":{"line":343,"column":1},"end":{"line":343,"column":1}},"key":"mHchjTthHV"}],"key":"xKbj6BqPbz"},{"type":"math","value":"\\E_{x \\sim p}[f(x)] = \\sum_{x \\in \\mathcal{X}} f(x) p(x) = \\sum_{x \\in 
\\mathcal{X}} f(x) \\frac{p(x)}{q(x)} q(x) = \\E_{x \\sim q} \\left[ \\frac{p(x)}{q(x)} f(x) \\right].","position":{"start":{"line":354,"column":1},"end":{"line":356,"column":1}},"html":"Exp[f(x)]=xXf(x)p(x)=xXf(x)p(x)q(x)q(x)=Exq[p(x)q(x)f(x)].\\E_{x \\sim p}[f(x)] = \\sum_{x \\in \\mathcal{X}} f(x) p(x) = \\sum_{x \\in \\mathcal{X}} f(x) \\frac{p(x)}{q(x)} q(x) = \\E_{x \\sim q} \\left[ \\frac{p(x)}{q(x)} f(x) \\right].Exp[f(x)]=xXf(x)p(x)=xXf(x)q(x)p(x)q(x)=Exq[q(x)p(x)f(x)].","enumerator":"14","key":"ZnLPLI6te3"},{"type":"paragraph","position":{"start":{"line":358,"column":1},"end":{"line":361,"column":1}},"children":[{"type":"text","value":"Doesn’t this seem too good to be true? If there were no drawbacks, we could use this to estimate ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"xJgg8jJTEc"},{"type":"emphasis","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"children":[{"type":"text","value":"any","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"TWj3mH2P2a"}],"key":"RxDGh82wdS"},{"type":"text","value":" expectation of any function on any arbitrary distribution! The drawback is that the variance may be very large due to the likelihood ratio term.\nIf there are values of ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"UwphPDIwyF"},{"type":"inlineMath","value":"x","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"html":"xxx","key":"H77AeiSQHr"},{"type":"text","value":" that are very rare in the sampling distribution ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"JOkqzTf5ao"},{"type":"inlineMath","value":"q","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"html":"qqq","key":"k9PsUzyDZ5"},{"type":"text","value":",\nbut common under ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"j790eN92ai"},{"type":"inlineMath","value":"p","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"html":"ppp","key":"qwJUKwqXIm"},{"type":"text","value":",\nthen the likelihood ratio ","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"AqANji6oyS"},{"type":"inlineMath","value":"p(x)/q(x)","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"html":"p(x)/q(x)p(x)/q(x)p(x)/q(x)","key":"RU2dHGHbOJ"},{"type":"text","value":" will cause the variance to blow up.","position":{"start":{"line":358,"column":1},"end":{"line":358,"column":1}},"key":"rj2BJWUflm"}],"key":"xBgw4edUNF"},{"type":"heading","depth":2,"position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"children":[{"type":"text","value":"The REINFORCE policy gradient","position":{"start":{"line":363,"column":1},"end":{"line":363,"column":1}},"key":"yJy2orJoqC"}],"identifier":"the-reinforce-policy-gradient","label":"The REINFORCE policy gradient","html_id":"the-reinforce-policy-gradient","implicit":true,"enumerator":"4","key":"ToAfY2QbsU"},{"type":"paragraph","position":{"start":{"line":365,"column":1},"end":{"line":367,"column":1}},"children":[{"type":"text","value":"Returning to RL, suppose there is some trajectory distribution 
","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"whC1Z95gbZ"},{"type":"inlineMath","value":"\\rho(\\tau)","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"html":"ρ(τ)\\rho(\\tau)ρ(τ)","key":"WxjpkBAn2S"},{"type":"text","value":" that is ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"ila5UnvpoI"},{"type":"strong","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"children":[{"type":"text","value":"easy to sample from,","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"TcXbz2quPm"}],"key":"F7FO4TEktx"},{"type":"text","value":" such as a database of existing trajectories.\nWe can then rewrite ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"xONo1vah0z"},{"type":"inlineMath","value":"\\nabla J(\\theta)","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"html":"J(θ)\\nabla J(\\theta)J(θ)","key":"RQ2ZqEnL7l"},{"type":"text","value":", a.k.a. the ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"ET6YHLwb9y"},{"type":"emphasis","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"children":[{"type":"text","value":"policy gradient","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"YWuguaNoKU"}],"key":"nko7rsXJz7"},{"type":"text","value":", as follows.\nAll gradients are being taken with respect to ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"MQLdE1e7OJ"},{"type":"text","value":"θ","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"ycoIrYyxbr"},{"type":"text","value":".","position":{"start":{"line":365,"column":1},"end":{"line":365,"column":1}},"key":"Lry2Kxbn5y"}],"key":"q7reJRA3nv"},{"type":"math","value":"\\begin{aligned}\n \\nabla J(\\theta) & = \\nabla \\E_{\\tau \\sim \\rho_\\theta} [ R(\\tau) ] \\\\\n & = \\nabla \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{likelihood ratio trick} \\\\\n & = \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\nabla \\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{switching gradient and expectation}\n\\end{aligned}","position":{"start":{"line":369,"column":1},"end":{"line":375,"column":1}},"html":"J(θ)=Eτρθ[R(τ)]=Eτρ[ρθ(τ)ρ(τ)R(τ)]likelihood ratio trick=Eτρ[ρθ(τ)ρ(τ)R(τ)]switching gradient and expectation\\begin{aligned}\n \\nabla J(\\theta) & = \\nabla \\E_{\\tau \\sim \\rho_\\theta} [ R(\\tau) ] \\\\\n & = \\nabla \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{likelihood ratio trick} \\\\\n & = \\E_{\\tau \\sim \\rho} \\left[ \\frac{\\nabla \\rho_\\theta(\\tau)}{\\rho(\\tau)} R(\\tau) \\right] & & \\text{switching gradient and expectation}\n\\end{aligned}J(θ)=Eτρθ[R(τ)]=Eτρ[ρ(τ)ρθ(τ)R(τ)]=Eτρ[ρ(τ)ρθ(τ)R(τ)]likelihood ratio trickswitching gradient and expectation","enumerator":"15","key":"SwT6665xqy"},{"type":"paragraph","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"children":[{"type":"text","value":"Note that for ","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"key":"EFXErejeYS"},{"type":"inlineMath","value":"\\rho = \\rho_\\theta","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"html":"ρ=ρθ\\rho = \\rho_\\thetaρ=ρθ","key":"v6MbCcapdb"},{"type":"text","value":", the inside term 
becomes","position":{"start":{"line":377,"column":1},"end":{"line":377,"column":1}},"key":"bQq95e9dPB"}],"key":"h9L0IsjL2H"},{"type":"math","value":"\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} [ \\nabla \\log \\rho_\\theta(\\tau) \\cdot R(\\tau)].","position":{"start":{"line":379,"column":1},"end":{"line":381,"column":1}},"html":"J(θ)=Eτρθ[logρθ(τ)R(τ)].\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} [ \\nabla \\log \\rho_\\theta(\\tau) \\cdot R(\\tau)].J(θ)=Eτρθ[logρθ(τ)R(τ)].","enumerator":"16","key":"FxHomtblJ3"},{"type":"paragraph","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"children":[{"type":"text","value":"(The order of operations is ","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"key":"r3PomIPTpo"},{"type":"inlineMath","value":"\\nabla (\\log \\rho_\\theta)(\\tau)","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"html":"(logρθ)(τ)\\nabla (\\log \\rho_\\theta)(\\tau)(logρθ)(τ)","key":"QNvs1C4bDe"},{"type":"text","value":".)","position":{"start":{"line":383,"column":1},"end":{"line":383,"column":1}},"key":"UR95ez0yma"}],"key":"SvjpObXuVN"},{"type":"paragraph","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"children":[{"type":"text","value":"Recall that when the state transitions are Markov (i.e. ","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"Bt1NIPpMcW"},{"type":"inlineMath","value":"s_{t}","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"html":"sts_{t}st","key":"JrRLHDiBeU"},{"type":"text","value":" only depends on ","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"UJ6U0yeKlG"},{"type":"inlineMath","value":"s_{t-1}, a_{t-1}","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"html":"st1,at1s_{t-1}, a_{t-1}st1,at1","key":"tLN6x8dy0O"},{"type":"text","value":") and the policy is time-homogeneous (i.e. 
","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"bcNUwfRUnH"},{"type":"inlineMath","value":"a_\\hi \\sim \\pi_\\theta (s_\\hi)","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"html":"ahπθ(sh)a_\\hi \\sim \\pi_\\theta (s_\\hi)ahπθ(sh)","key":"bWQhCRgAed"},{"type":"text","value":"), we can write out the ","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"yClgSdPm79"},{"type":"emphasis","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"children":[{"type":"text","value":"likelihood of a trajectory","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"aN4zghv8sW"}],"key":"c3s94TRU9g"},{"type":"text","value":" under the policy ","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"N8JfwLN3Qz"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"pOZnxwBJNt"},{"type":"text","value":" autoregressively, as in ","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"CS0U0mwNCA"},{"type":"crossReference","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"children":[{"type":"text","value":"Definition ","key":"H9L92yAcag"},{"type":"text","value":"1.5","key":"gVUFIRv1HC"}],"identifier":"autoregressive_trajectories","label":"autoregressive_trajectories","kind":"proof:definition","template":"Definition %s","enumerator":"1.5","resolved":true,"html_id":"autoregressive-trajectories","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"GuUSY7ZSnp"},{"type":"text","value":". Taking the log of the trajectory likelihood turns it into a sum of terms:","position":{"start":{"line":386,"column":1},"end":{"line":386,"column":1}},"key":"KquV1BrAa1"}],"key":"mrbYGQ1eRl"},{"type":"math","value":"\\log \\rho_\\theta(\\tau) = \\log \\mu(s_0) + \\sum_{\\hi=0}^{\\hor-1} \\log \\pi_\\theta(a_\\hi \\mid s_\\hi) + \\log P(s_{\\hi+1} \\mid s_\\hi, a_\\hi)","position":{"start":{"line":388,"column":1},"end":{"line":390,"column":1}},"html":"logρθ(τ)=logμ(s0)+h=0H1logπθ(ahsh)+logP(sh+1sh,ah)\\log \\rho_\\theta(\\tau) = \\log \\mu(s_0) + \\sum_{\\hi=0}^{\\hor-1} \\log \\pi_\\theta(a_\\hi \\mid s_\\hi) + \\log P(s_{\\hi+1} \\mid s_\\hi, a_\\hi)logρθ(τ)=logμ(s0)+h=0H1logπθ(ahsh)+logP(sh+1sh,ah)","enumerator":"17","key":"CUxN7Kd8Ve"},{"type":"paragraph","position":{"start":{"line":392,"column":1},"end":{"line":394,"column":1}},"children":[{"type":"text","value":"When we take the gradient with respect to the parameters ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"G6o5wJUmme"},{"type":"text","value":"θ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"So2gxYyqn7"},{"type":"text","value":",\nonly the ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"QHWOpdLB8h"},{"type":"inlineMath","value":"\\pi_\\theta(a_\\hi | s_\\hi)","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"html":"πθ(ahsh)\\pi_\\theta(a_\\hi | s_\\hi)πθ(ahsh)","key":"mC9Yl6FkKa"},{"type":"text","value":" terms depend on ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"nESayRelbN"},{"type":"text","value":"θ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"abv3GSj32h"},{"type":"text","value":".\nThis gives the following expression for the policy 
gradient, known as the “REINFORCE” policy gradient ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"PQgg5aLwwJ"},{"type":"cite","kind":"narrative","label":"williams_simple_1992","identifier":"williams_simple_1992","children":[{"type":"text","value":"Williams (1992)","key":"ufuHR0AYg0"}],"enumerator":"2","key":"lhF35750Lk"},{"type":"text","value":":","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"yUlYG1W2ht"}],"key":"ZC94qvEDtl"},{"type":"math","value":"\\begin{aligned}\n \\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) R(\\tau) \\right]\n\\end{aligned}","label":"reinforce_pg","identifier":"reinforce_pg","html":"J(θ)=Eτρθ[h=0H1θlogπθ(ahsh)R(τ)]\\begin{aligned}\n \\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) R(\\tau) \\right]\n\\end{aligned}J(θ)=Eτρθ[h=0H1θlogπθ(ahsh)R(τ)]","enumerator":"18","html_id":"reinforce-pg","key":"HFGqSYm1R5"},{"type":"paragraph","position":{"start":{"line":404,"column":1},"end":{"line":407,"column":1}},"children":[{"type":"text","value":"This expression allows us to estimate the gradient by sampling a few sample trajectories from ","position":{"start":{"line":404,"column":1},"end":{"line":404,"column":1}},"key":"GyiF8cQMIP"},{"type":"inlineMath","value":"\\pi_\\theta,","position":{"start":{"line":404,"column":1},"end":{"line":404,"column":1}},"html":"πθ,\\pi_\\theta,πθ,","key":"u6ANRfa1DR"},{"type":"text","value":"\ncalculating the likelihoods of the chosen actions,\nand substituting these into the expression inside the brackets of ","position":{"start":{"line":404,"column":1},"end":{"line":404,"column":1}},"key":"v9c7a0wJ6F"},{"type":"crossReference","position":{"start":{"line":404,"column":1},"end":{"line":404,"column":1}},"children":[{"type":"text","value":"(","key":"hOHlP7uC9Z"},{"type":"text","value":"18","key":"bKLPVxBe3P"},{"type":"text","value":")","key":"cDmy481IcC"}],"identifier":"reinforce_pg","label":"reinforce_pg","kind":"equation","template":"(%s)","enumerator":"18","resolved":true,"html_id":"reinforce-pg","key":"zFqG3tmZRN"},{"type":"text","value":".\nThen we can update the parameters ","position":{"start":{"line":404,"column":1},"end":{"line":404,"column":1}},"key":"Kx2mqXuozP"},{"type":"text","value":"θ","position":{"start":{"line":404,"column":1},"end":{"line":404,"column":1}},"key":"VNuG7NyN0Y"},{"type":"text","value":" in this direction to perform stochastic gradient ascent.","position":{"start":{"line":404,"column":1},"end":{"line":404,"column":1}},"key":"OWZeR39sgm"}],"key":"ApTlvJPrpd"},{"type":"paragraph","position":{"start":{"line":409,"column":1},"end":{"line":409,"column":1}},"children":[{"type":"text","value":"The rest of this chapter investigates ways to ","position":{"start":{"line":409,"column":1},"end":{"line":409,"column":1}},"key":"fJhH6H2LVK"},{"type":"emphasis","position":{"start":{"line":409,"column":1},"end":{"line":409,"column":1}},"children":[{"type":"text","value":"reduce the variance","position":{"start":{"line":409,"column":1},"end":{"line":409,"column":1}},"key":"KtuGMBwLEg"}],"key":"SurwrbyUAu"},{"type":"text","value":" of this estimator by subtracting off certain correlated 
quantities.","position":{"start":{"line":409,"column":1},"end":{"line":409,"column":1}},"key":"UTGw7iKOpT"}],"key":"SgbwwcSpZA"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Note","key":"vEnhMBbDoH"}],"key":"FXaY3NaoZx"},{"type":"paragraph","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"children":[{"type":"text","value":"Here is an alternative, intuitive presentation of ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"Zvk2NX3iqh"},{"type":"crossReference","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"children":[{"type":"text","value":"(","key":"R53Y8JMBIt"},{"type":"text","value":"18","key":"qKQHezRIHN"},{"type":"text","value":")","key":"UgRK0MIBML"}],"identifier":"reinforce_pg","label":"reinforce_pg","kind":"equation","template":"(%s)","enumerator":"18","resolved":true,"html_id":"reinforce-pg","key":"KndXprhTJA"},{"type":"text","value":".","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"ctS81MSzA8"}],"key":"Ltm6kWyNCm"},{"type":"paragraph","position":{"start":{"line":415,"column":1},"end":{"line":419,"column":1}},"children":[{"type":"text","value":"Intuitively speaking,\nwe want to update the policy parameters to maximize the probability of taking ","position":{"start":{"line":415,"column":1},"end":{"line":415,"column":1}},"key":"TNVb6w01aj"},{"type":"emphasis","position":{"start":{"line":415,"column":1},"end":{"line":415,"column":1}},"children":[{"type":"text","value":"optimal actions","position":{"start":{"line":415,"column":1},"end":{"line":415,"column":1}},"key":"RenUVx1Qqu"}],"key":"O2x0rakrXr"},{"type":"text","value":".\nThat is, suppose we are in state ","position":{"start":{"line":415,"column":1},"end":{"line":415,"column":1}},"key":"A8QVlBWSFn"},{"type":"inlineMath","value":"s","position":{"start":{"line":415,"column":1},"end":{"line":415,"column":1}},"html":"sss","key":"cloQZ5HPlC"},{"type":"text","value":", and ","position":{"start":{"line":415,"column":1},"end":{"line":415,"column":1}},"key":"nCFF5N6mh6"},{"type":"inlineMath","value":"a^\\star","position":{"start":{"line":415,"column":1},"end":{"line":415,"column":1}},"html":"aa^\\stara","key":"Nalb1b2ths"},{"type":"text","value":" is an optimal action to take.\nThen we want to solve ","position":{"start":{"line":415,"column":1},"end":{"line":415,"column":1}},"key":"uP05WrqjSL"},{"type":"inlineMath","value":"\\theta = \\arg\\max_{\\theta'} \\pi_{\\theta'}(a^\\star \\mid s)","position":{"start":{"line":415,"column":1},"end":{"line":415,"column":1}},"html":"θ=argmaxθπθ(as)\\theta = \\arg\\max_{\\theta'} \\pi_{\\theta'}(a^\\star \\mid s)θ=argmaxθπθ(as)","key":"vkSvDKlCLj"},{"type":"text","value":",\nwhich would lead to the gradient ascent expression","position":{"start":{"line":415,"column":1},"end":{"line":415,"column":1}},"key":"Fw8z7YmAK7"}],"key":"zaigpCmxBJ"},{"type":"math","value":"\\theta \\gets \\theta + \\nabla \\pi_{\\theta}(a^\\star \\mid s).","position":{"start":{"line":421,"column":1},"end":{"line":423,"column":1}},"html":"θθ+πθ(as).\\theta \\gets \\theta + \\nabla \\pi_{\\theta}(a^\\star \\mid s).θθ+πθ(as).","enumerator":"19","key":"UPcUphctwz"},{"type":"paragraph","position":{"start":{"line":425,"column":1},"end":{"line":430,"column":1}},"children":[{"type":"text","value":"However, we don’t know the optimal action 
","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"key":"XczGsTx8uP"},{"type":"inlineMath","value":"a^\\star","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"html":"aa^\\stara","key":"ZPMRp2ePUh"},{"type":"text","value":" in practice.\nSo instead, we must try many actions,\nand ","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"key":"ZlyU8QC5cR"},{"type":"emphasis","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"children":[{"type":"text","value":"increase","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"key":"C4sW0IxmpQ"}],"key":"l25bcvEgrx"},{"type":"text","value":" the probability of the “good” ones\nand ","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"key":"stZ1aQn3ZX"},{"type":"emphasis","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"children":[{"type":"text","value":"decrease","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"key":"U17x4BOrty"}],"key":"cFa0JDODYJ"},{"type":"text","value":" the probability of the “bad” ones.\nSuppose ","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"key":"uBFhjl59e4"},{"type":"inlineMath","value":"A(s, a)","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"html":"A(s,a)A(s, a)A(s,a)","key":"jXNUJDIgbr"},{"type":"text","value":" is a measure of how good action ","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"key":"Fsq4VWRh3e"},{"type":"inlineMath","value":"a","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"html":"aaa","key":"OtPg1QlsDA"},{"type":"text","value":" is in state ","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"key":"dl0ruDYDwi"},{"type":"inlineMath","value":"s","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"html":"sss","key":"UNZM6hU66w"},{"type":"text","value":".\nThen we could write","position":{"start":{"line":425,"column":1},"end":{"line":425,"column":1}},"key":"QSMTyUbcKC"}],"key":"uNjDz44RxG"},{"type":"math","value":"\\theta \\gets \\theta + \\sum_a \\pi_{\\theta}(a \\mid s) A(s, a) \\nabla \\pi_{\\theta}(a \\mid s).","position":{"start":{"line":432,"column":1},"end":{"line":434,"column":1}},"html":"θθ+aπθ(as)A(s,a)πθ(as).\\theta \\gets \\theta + \\sum_a \\pi_{\\theta}(a \\mid s) A(s, a) \\nabla \\pi_{\\theta}(a \\mid s).θθ+aπθ(as)A(s,a)πθ(as).","enumerator":"20","key":"Pz8gV8y5Rj"},{"type":"paragraph","position":{"start":{"line":436,"column":1},"end":{"line":440,"column":1}},"children":[{"type":"text","value":"But this has an issue: the size of each step doesn’t just depend on how good it is,\nbut also how ","position":{"start":{"line":436,"column":1},"end":{"line":436,"column":1}},"key":"TnW7RBZglg"},{"type":"emphasis","position":{"start":{"line":436,"column":1},"end":{"line":436,"column":1}},"children":[{"type":"text","value":"often","position":{"start":{"line":436,"column":1},"end":{"line":436,"column":1}},"key":"uJdn5zvooJ"}],"key":"itph8o692s"},{"type":"text","value":" the policy takes it already.\nThis could lead to a positive feedback loop where likely actions become more and more likely,\nwithout respect to the quality of the action.\nSo we divide by the likelihood to cancel out this factor:","position":{"start":{"line":436,"column":1},"end":{"line":436,"column":1}},"key":"guVKK3goW2"}],"key":"NA9EiH6PnO"},{"type":"math","value":"\\theta \\gets 
\\theta + \\sum_a \\pi_{\\theta}(a \\mid s) A(s, a) \\frac{\\nabla \\pi_{\\theta}(a \\mid s)}{\\pi_{\\theta}(a \\mid s)}.","position":{"start":{"line":442,"column":1},"end":{"line":444,"column":1}},"html":"θθ+aπθ(as)A(s,a)πθ(as)πθ(as).\\theta \\gets \\theta + \\sum_a \\pi_{\\theta}(a \\mid s) A(s, a) \\frac{\\nabla \\pi_{\\theta}(a \\mid s)}{\\pi_{\\theta}(a \\mid s)}.θθ+aπθ(as)A(s,a)πθ(as)πθ(as).","enumerator":"21","key":"GKopYptNzK"},{"type":"paragraph","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"children":[{"type":"text","value":"But once we simplify, and sum across timesteps, this becomes ","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"K1rnvg7rHi"},{"type":"emphasis","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"children":[{"type":"text","value":"almost","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"daukrfM1Pi"}],"key":"RGvKnNMt4F"},{"type":"text","value":" exactly the gradient written above!","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"L31hp65lVC"}],"key":"y62ZhkeioB"},{"type":"math","value":"\\theta \\gets \\theta + \\mathbb{E}_{a \\sim \\pi_{\\theta}(\\cdot \\mid s)} [\\sum_{\\hi=0}^{\\hor-1} A(s_\\hi, a_\\hi) \\nabla \\log \\pi_{\\theta}(a_\\hi \\mid s_\\hi) ].","position":{"start":{"line":448,"column":1},"end":{"line":450,"column":1}},"html":"θθ+Eaπθ(s)[h=0H1A(sh,ah)logπθ(ahsh)].\\theta \\gets \\theta + \\mathbb{E}_{a \\sim \\pi_{\\theta}(\\cdot \\mid s)} [\\sum_{\\hi=0}^{\\hor-1} A(s_\\hi, a_\\hi) \\nabla \\log \\pi_{\\theta}(a_\\hi \\mid s_\\hi) ].θθ+Eaπθ(s)[h=0H1A(sh,ah)logπθ(ahsh)].","enumerator":"22","key":"cuy5ransqN"},{"type":"paragraph","position":{"start":{"line":452,"column":1},"end":{"line":452,"column":1}},"children":[{"type":"text","value":"We will see later on what ","position":{"start":{"line":452,"column":1},"end":{"line":452,"column":1}},"key":"LnA2hsU0VB"},{"type":"inlineMath","value":"A","position":{"start":{"line":452,"column":1},"end":{"line":452,"column":1}},"html":"AAA","key":"lF6lC02477"},{"type":"text","value":" concretely corresponds to.","position":{"start":{"line":452,"column":1},"end":{"line":452,"column":1}},"key":"iszcLV4byX"}],"key":"mVRZuCYjtB"}],"label":"intuitive-remark","identifier":"intuitive-remark","enumerator":"1","html_id":"intuitive-remark","key":"PeJRJ0Crx3"},{"type":"code","lang":"python","value":"def estimate_gradient_reinforce_pseudocode(env, π, θ):\n τ = sample_trajectory(env, π(θ))\n gradient_hat = 0\n for s, a, r in τ:\n def policy_log_likelihood(θ):\n return log(π(θ)(s, a))\n gradient_hat += jax.grad(policy_log_likelihood)(θ) * τ.total_reward\n return gradient_hat","position":{"start":{"line":455,"column":1},"end":{"line":464,"column":1}},"key":"p8ycl7fIav"},{"type":"paragraph","position":{"start":{"line":466,"column":1},"end":{"line":466,"column":1}},"children":[{"type":"text","value":"For some intuition into how this method works, recall that we update our parameters according to","position":{"start":{"line":466,"column":1},"end":{"line":466,"column":1}},"key":"reuFa3Z9oC"}],"key":"QQK49yduXU"},{"type":"math","value":"\\begin{aligned}\n \\theta_{t+1} &= \\theta_t + \\eta \\nabla J(\\theta_t) \\\\\n &= \\theta_t + \\eta \\E_{\\tau \\sim \\rho_{\\theta_t}} [\\nabla \\log \\rho_{\\theta_t}(\\tau) \\cdot 
R(\\tau)].\n\\end{aligned}","position":{"start":{"line":468,"column":1},"end":{"line":473,"column":1}},"html":"θt+1=θt+ηJ(θt)=θt+ηEτρθt[logρθt(τ)R(τ)].\\begin{aligned}\n \\theta_{t+1} &= \\theta_t + \\eta \\nabla J(\\theta_t) \\\\\n &= \\theta_t + \\eta \\E_{\\tau \\sim \\rho_{\\theta_t}} [\\nabla \\log \\rho_{\\theta_t}(\\tau) \\cdot R(\\tau)].\n\\end{aligned}θt+1=θt+ηJ(θt)=θt+ηEτρθt[logρθt(τ)R(τ)].","enumerator":"23","key":"j4ok9unqip"},{"type":"paragraph","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"children":[{"type":"text","value":"Consider the “good” trajectories where ","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"key":"hXx7ZXVsrl"},{"type":"inlineMath","value":"R(\\tau)","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"html":"R(τ)R(\\tau)R(τ)","key":"qwwJh93zFK"},{"type":"text","value":" is large. Then ","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"key":"L29w3cO7br"},{"type":"text","value":"θ","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"key":"SR570R21eu"},{"type":"text","value":" gets updated so that these trajectories become more likely. To see why, recall that ","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"key":"Pq1YZhyqoa"},{"type":"inlineMath","value":"\\rho_{\\theta}(\\tau)","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"html":"ρθ(τ)\\rho_{\\theta}(\\tau)ρθ(τ)","key":"mjDnG8Nh7e"},{"type":"text","value":" is the likelihood of the trajectory ","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"key":"SRKYlXmrFV"},{"type":"text","value":"τ","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"key":"qZmH30mJYx"},{"type":"text","value":" under the policy ","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"key":"WjwsNxd4Dp"},{"type":"inlineMath","value":"\\pi_\\theta,","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"html":"πθ,\\pi_\\theta,πθ,","key":"yngXj3t2HN"},{"type":"text","value":" so the gradient points in the direction that makes ","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"key":"IWmQu39uCk"},{"type":"text","value":"τ","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"key":"BnBBf8ZCdo"},{"type":"text","value":" more likely.","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"key":"VhoMMI91qJ"}],"key":"PnKsDZ9MX2"}],"key":"hPrCpGHSwG"},{"type":"block","position":{"start":{"line":477,"column":1},"end":{"line":477,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"children":[{"type":"text","value":"Baselines and advantages","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"opvCci7keF"}],"identifier":"baselines-and-advantages","label":"Baselines and advantages","html_id":"baselines-and-advantages","implicit":true,"enumerator":"5","key":"mDdwBOIbOf"},{"type":"paragraph","position":{"start":{"line":481,"column":1},"end":{"line":484,"column":1}},"children":[{"type":"text","value":"A central idea from supervised learning is the ","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"uqX79jYoJO"},{"type":"strong","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"bias-variance 
decomposition","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"BmPBd8iR7w"}],"key":"eD0ooSYELa"},{"type":"text","value":",\nwhich shows that the mean squared error of an estimator is the sum of its squared bias and its variance.\nThe REINFORCE gradient estimator ","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"DmpSGDEUaG"},{"type":"crossReference","kind":"equation","identifier":"reinforce_pg","label":"reinforce_pg","children":[{"type":"text","value":"(","key":"q084BDO5ii"},{"type":"text","value":"18","key":"mo0BInWt3G"},{"type":"text","value":")","key":"my588UyyRq"}],"template":"(%s)","enumerator":"18","resolved":true,"html_id":"reinforce-pg","key":"eHm2DMebsv"},{"type":"text","value":" is already ","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"Bl3vpGzKrZ"},{"type":"emphasis","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"unbiased,","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"d23c7C4qHF"}],"key":"p1YuCPih9q"},{"type":"text","value":" meaning that its expectation over trajectories is the true policy gradient.\nCan we find ways to reduce its ","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"v5M4cutJ6w"},{"type":"emphasis","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"children":[{"type":"text","value":"variance","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"bU01F2iUfR"}],"key":"NyZdgutT8L"},{"type":"text","value":" as well?","position":{"start":{"line":481,"column":1},"end":{"line":481,"column":1}},"key":"xbWokpLVIG"}],"key":"IKW3BlFE6f"},{"type":"paragraph","position":{"start":{"line":486,"column":1},"end":{"line":489,"column":1}},"children":[{"type":"text","value":"As a first step,\nconsider that the action taken at step ","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"key":"PrV1O9thdA"},{"type":"inlineMath","value":"t","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"html":"ttt","key":"ysqypw0Hrn"},{"type":"text","value":" does not affect the reward from previous timesteps, since they’re already in the past.\nYou can also show rigorously that this is the case,\nand that we only need to consider the present and future rewards to calculate the policy gradient:","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"key":"JhZfQsdsRP"}],"key":"vTIOeYYVre"},{"type":"math","value":"\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) \\sum_{\\hi' = \\hi}^{\\hor-1} r(s_{\\hi'}, a_{\\hi'}) \\right]","position":{"start":{"line":491,"column":1},"end":{"line":493,"column":1}},"html":"J(θ)=Eτρθ[h=0H1θlogπθ(ahsh)h=hH1r(sh,ah)]\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) \\sum_{\\hi' = \\hi}^{\\hor-1} r(s_{\\hi'}, a_{\\hi'}) \\right]J(θ)=Eτρθ[h=0H1θlogπθ(ahsh)h=hH1r(sh,ah)]","enumerator":"24","key":"cftBBnp5ID"},{"type":"paragraph","position":{"start":{"line":495,"column":1},"end":{"line":496,"column":1}},"children":[{"type":"text","value":"Furthermore, by a conditioning argument, we can replace the inner sum over remaining rewards with the policy’s Q-function,\nevaluated at the current 
state:","position":{"start":{"line":495,"column":1},"end":{"line":495,"column":1}},"key":"RGD28fwwiK"}],"key":"XIYF1qn7q7"},{"type":"math","value":"\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) Q^{\\pi_\\theta}(s_{\\hi}, a_{\\hi}) \\right]","label":"pg_with_q","identifier":"pg_with_q","html":"J(θ)=Eτρθ[h=0H1θlogπθ(ahsh)Qπθ(sh,ah)]\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\nabla_\\theta \\log \\pi_{\\theta}(a_\\hi | s_\\hi) Q^{\\pi_\\theta}(s_{\\hi}, a_{\\hi}) \\right]J(θ)=Eτρθ[h=0H1θlogπθ(ahsh)Qπθ(sh,ah)]","enumerator":"25","html_id":"pg-with-q","key":"OTmNzBM6aq"},{"type":"paragraph","position":{"start":{"line":504,"column":1},"end":{"line":504,"column":1}},"children":[{"type":"strong","position":{"start":{"line":504,"column":1},"end":{"line":504,"column":1}},"children":[{"type":"text","value":"Exercise:","position":{"start":{"line":504,"column":1},"end":{"line":504,"column":1}},"key":"gQGQGuIqln"}],"key":"y1JVcZvNDh"},{"type":"text","value":" Prove that this is equivalent to the previous definitions. What modification to the expression must be made for the discounted, infinite-horizon setting?","position":{"start":{"line":504,"column":1},"end":{"line":504,"column":1}},"key":"QHe8aYSwlE"}],"key":"jFNLfI4cgt"},{"type":"paragraph","position":{"start":{"line":506,"column":1},"end":{"line":507,"column":1}},"children":[{"type":"text","value":"We can further reduce variance by subtracting a ","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"lIuCs69bPp"},{"type":"strong","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"children":[{"type":"text","value":"baseline function","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"owvoRnIGrk"}],"key":"sXRNuMPfLF"},{"type":"text","value":" ","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"T6ntqoy570"},{"type":"inlineMath","value":"b_\\hi : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"html":"bh:SRb_\\hi : \\mathcal{S} \\to \\mathbb{R}bh:SR","key":"AUjrG8jveA"},{"type":"text","value":" at each timestep ","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"Xa2fWKFjLe"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"html":"h\\hih","key":"fdeYim8hDK"},{"type":"text","value":".\nThis modifies the policy gradient as follows:","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"ZvKtDRemxh"}],"key":"Zasu19QBV6"},{"type":"math","value":"\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{\\hi=0}^{H-1} \\nabla \\log \\pi_\\theta (a_\\hi | s_\\hi) \\left(\n Q^{\\pi_\\theta}(s_\\hi, a_\\hi)\n - b_\\hi(s_\\hi)\n \\right)\n \\right].","position":{"start":{"line":509,"column":1},"end":{"line":517,"column":1}},"identifier":"eq:pg_baseline","label":"eq:pg_baseline","html_id":"eq-pg-baseline","html":"J(θ)=Eτρθ[h=0H1logπθ(ahsh)(Qπθ(sh,ah)bh(sh))].\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{\\hi=0}^{H-1} \\nabla \\log \\pi_\\theta (a_\\hi | s_\\hi) \\left(\n Q^{\\pi_\\theta}(s_\\hi, a_\\hi)\n - b_\\hi(s_\\hi)\n \\right)\n 
\\right].J(θ)=Eτρθ[h=0H1logπθ(ahsh)(Qπθ(sh,ah)bh(sh))].","enumerator":"26","key":"BnmKJziCNA"},{"type":"paragraph","position":{"start":{"line":519,"column":1},"end":{"line":520,"column":1}},"children":[{"type":"text","value":"(Again, you should try to prove that this equality still holds.)\nFor example, we might want ","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"key":"ThmSwl5OkO"},{"type":"inlineMath","value":"b_\\hi","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"html":"bhb_\\hibh","key":"ybQnW0HOsv"},{"type":"text","value":" to estimate the average reward-to-go at a given timestep:","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"key":"Wvwu8MESej"}],"key":"i89zZGaaH9"},{"type":"math","value":"b_\\hi^\\theta = \\E_{\\tau \\sim \\rho_\\theta} R_\\hi(\\tau).","position":{"start":{"line":522,"column":1},"end":{"line":522,"column":1}},"html":"bhθ=EτρθRh(τ).b_\\hi^\\theta = \\E_{\\tau \\sim \\rho_\\theta} R_\\hi(\\tau).bhθ=EτρθRh(τ).","enumerator":"27","key":"gVs5d4sfq5"},{"type":"paragraph","position":{"start":{"line":524,"column":1},"end":{"line":531,"column":1}},"children":[{"type":"text","value":"As a better baseline, we could instead choose the ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"BhZJ9XNqQJ"},{"type":"emphasis","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"text","value":"value function.","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"xFRIqvGPHW"}],"key":"goLhQsc90u"},{"type":"text","value":"\nNote that the random variable ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"RiYD4mWJ95"},{"type":"inlineMath","value":"Q^\\pi_\\hi(s, a) - V^\\pi_\\hi(s),","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"html":"Qhπ(s,a)Vhπ(s),Q^\\pi_\\hi(s, a) - V^\\pi_\\hi(s),Qhπ(s,a)Vhπ(s),","key":"Ow2HXbKrFS"},{"type":"text","value":"\nwhere the randomness is taken over the actions, is centered around zero.\n(Recall ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"KcJz7RdkTG"},{"type":"inlineMath","value":"V^\\pi_\\hi(s) = \\E_{a \\sim \\pi} Q^\\pi_\\hi(s, a).","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"html":"Vhπ(s)=EaπQhπ(s,a).V^\\pi_\\hi(s) = \\E_{a \\sim \\pi} Q^\\pi_\\hi(s, a).Vhπ(s)=EaπQhπ(s,a).","key":"ECIUg4ZYBg"},{"type":"text","value":")\nThis quantity matches the intuition given in ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"ZT0gMrOrsj"},{"type":"crossReference","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"text","value":"Note ","key":"sjMWtyuA5v"},{"type":"text","value":"1","key":"Iz53AUQjP6"}],"identifier":"intuitive-remark","label":"intuitive-remark","kind":"admonition:note","template":"Note %s","enumerator":"1","resolved":true,"html_id":"intuitive-remark","key":"arFVnTFdAG"},{"type":"text","value":":\nit is ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"FFztOPORGj"},{"type":"emphasis","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"text","value":"positive","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"UupesYk4Zo"}],"key":"Wj5um8jf4N"},{"type":"text","value":" for actions that are better than average (in state 
","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"cCe989ubnt"},{"type":"inlineMath","value":"s","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"html":"sss","key":"W8iiJ1oehi"},{"type":"text","value":"),\nand ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"xkCWlPmYcJ"},{"type":"emphasis","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"text","value":"negative","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"n4zhZbH9nn"}],"key":"IIrffS8NRS"},{"type":"text","value":" for actions that are worse than average.\nIn fact, this quantity has a particular name: the ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"BZsMrkYCaG"},{"type":"strong","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"text","value":"advantage function.","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"pwoYLZG2UN"}],"key":"Nr1gwC2krk"}],"key":"rXcf7Npba6"},{"type":"proof","kind":"definition","label":"advantage","identifier":"advantage","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Advantage function","position":{"start":{"line":533,"column":1},"end":{"line":533,"column":1}},"key":"Lrmm2FFNtG"}],"key":"aJ9tL8Z1Pf"},{"type":"math","value":"A^\\pi_\\hi(s) = Q^\\pi_\\hi(s, a) - V^\\pi_\\hi(s)","position":{"start":{"line":536,"column":1},"end":{"line":538,"column":1}},"html":"Ahπ(s)=Qhπ(s,a)Vhπ(s)A^\\pi_\\hi(s) = Q^\\pi_\\hi(s, a) - V^\\pi_\\hi(s)Ahπ(s)=Qhπ(s,a)Vhπ(s)","enumerator":"28","key":"ZYuuIQJFg8"}],"enumerator":"2","html_id":"advantage","key":"BYP0ZpadpB"},{"type":"paragraph","position":{"start":{"line":541,"column":1},"end":{"line":542,"column":1}},"children":[{"type":"text","value":"This measures how much better this action does than the average for that policy.\n(Note that for an optimal policy ","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"GZYU2xo8tn"},{"type":"inlineMath","value":"\\pi^\\star,","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"html":"π,\\pi^\\star,π,","key":"vigLNyakLo"},{"type":"text","value":" the advantage of a given state-action pair is always zero or negative.)","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"key":"sKmwKAOuDi"}],"key":"xkK0GEoyNc"},{"type":"paragraph","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"children":[{"type":"text","value":"We can now express the policy gradient as follows. 
Note that the advantage function effectively replaces the ","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"key":"RG4AgA7x0G"},{"type":"inlineMath","value":"Q","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"html":"QQQ","key":"cl9ro7htR6"},{"type":"text","value":"-function from ","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"key":"ZEEFBl5Sg5"},{"type":"crossReference","kind":"equation","identifier":"pg_with_q","label":"pg_with_q","children":[{"type":"text","value":"(","key":"CnrtL0Xh57"},{"type":"text","value":"25","key":"IKjQ4IgVpA"},{"type":"text","value":")","key":"IkEFespTBl"}],"template":"(%s)","enumerator":"25","resolved":true,"html_id":"pg-with-q","key":"XSzGE6cUdv"},{"type":"text","value":":","position":{"start":{"line":544,"column":1},"end":{"line":544,"column":1}},"key":"ON6mFpxf6T"}],"key":"ZsGePZTOxS"},{"type":"math","value":"\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{\\hi=0}^{\\hor-1} \\nabla \\log \\pi_\\theta(a_\\hi | s_\\hi) A^{\\pi_\\theta}_\\hi (s_\\hi, a_\\hi)\n\\right].","label":"pg_advantage","identifier":"pg_advantage","html":"J(θ)=Eτρθ[h=0H1logπθ(ahsh)Ahπθ(sh,ah)].\\nabla J(\\theta) = \\E_{\\tau \\sim \\rho_\\theta} \\left[\n \\sum_{\\hi=0}^{\\hor-1} \\nabla \\log \\pi_\\theta(a_\\hi | s_\\hi) A^{\\pi_\\theta}_\\hi (s_\\hi, a_\\hi)\n\\right].J(θ)=Eτρθ[h=0H1logπθ(ahsh)Ahπθ(sh,ah)].","enumerator":"29","html_id":"pg-advantage","key":"z5HsYgULGr"},{"type":"paragraph","position":{"start":{"line":554,"column":1},"end":{"line":554,"column":1}},"children":[{"type":"text","value":"Note that to avoid correlations between the gradient estimator and the value estimator (i.e. baseline), we must estimate them with independently sampled trajectories:","position":{"start":{"line":554,"column":1},"end":{"line":554,"column":1}},"key":"DZVQAJuLGF"}],"key":"Xv2LXgW0O3"},{"type":"comment","value":" TODO could use more explanation _why_ we want to avoid correlations ","key":"T8h5cb6ef2"},{"type":"comment","value":" Policy gradient with a learned baseline ","key":"WsOzaxPzis"}],"key":"Xc0nJvp17x"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def pg_with_learned_baseline_pseudocode(env, π, η, θ_init, K, N):\n θ = θ_init\n for k in range(K):\n trajectories = sample_trajectories(env, π(θ), N)\n V_hat = fit(trajectories) # estimates the value function of π(θ)\n τ = sample_trajectories(env, π(θ), 1)\n g = jnp.zeros_like(θ) # gradient estimator\n\n for h, (s, a) in enumerate(τ):\n def log_likelihood(θ_):\n return jnp.log(π(θ_)(s, a))\n g = g + jax.grad(log_likelihood)(θ) * (return_to_go(τ, h) - V_hat(s))\n \n θ = θ + η * g\n return θ","key":"b5KlQxcui2"},{"type":"output","id":"M02g4ZIXz70sRRe8XX91w","data":[],"key":"isr4MkLZVl"}],"data":{},"key":"o8HJy7gmAr"},{"type":"block","children":[{"type":"paragraph","position":{"start":{"line":578,"column":1},"end":{"line":579,"column":1}},"children":[{"type":"text","value":"Note that you could also generalize this by allowing the learning rate ","position":{"start":{"line":578,"column":1},"end":{"line":578,"column":1}},"key":"cbVjsMG3HE"},{"type":"text","value":"η","position":{"start":{"line":578,"column":1},"end":{"line":578,"column":1}},"key":"Uw5TqcDtDf"},{"type":"text","value":" to vary across steps,\nor take multiple trajectories 
","position":{"start":{"line":578,"column":1},"end":{"line":578,"column":1}},"key":"VaXlbSfmAr"},{"type":"text","value":"τ","position":{"start":{"line":578,"column":1},"end":{"line":578,"column":1}},"key":"LOdttJD3fq"},{"type":"text","value":" and compute the sample average of the gradient estimates.","position":{"start":{"line":578,"column":1},"end":{"line":578,"column":1}},"key":"pn1qCowF28"}],"key":"AcwXOGP2ZD"},{"type":"paragraph","position":{"start":{"line":581,"column":1},"end":{"line":582,"column":1}},"children":[{"type":"text","value":"The baseline estimation step ","position":{"start":{"line":581,"column":1},"end":{"line":581,"column":1}},"key":"f7IpNeq4DN"},{"type":"inlineCode","value":"fit","position":{"start":{"line":581,"column":1},"end":{"line":581,"column":1}},"key":"a6a9bpGI26"},{"type":"text","value":" can be done using any appropriate supervised learning algorithm.\nNote that the gradient estimator will be unbiased regardless of the baseline.","position":{"start":{"line":581,"column":1},"end":{"line":581,"column":1}},"key":"FIDGSoiUBL"}],"key":"IqJpEZZ6jf"}],"key":"rP0lWbM6C8"},{"type":"block","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":586,"column":1},"end":{"line":586,"column":1}},"children":[{"type":"text","value":"Comparing policy gradient algorithms to policy iteration","position":{"start":{"line":586,"column":1},"end":{"line":586,"column":1}},"key":"ugU6Meswfq"}],"identifier":"comparing-policy-gradient-algorithms-to-policy-iteration","label":"Comparing policy gradient algorithms to policy iteration","html_id":"comparing-policy-gradient-algorithms-to-policy-iteration","implicit":true,"enumerator":"6","key":"PlEwLRd1vr"},{"type":"comment","value":" TODO maybe restructure this part ","key":"DCIEDL3E3C"},{"type":"paragraph","position":{"start":{"line":590,"column":1},"end":{"line":590,"column":1}},"children":[{"type":"text","value":"What advantages does the policy gradient algorithm have over the policy iteration algorithms covered in ","position":{"start":{"line":590,"column":1},"end":{"line":590,"column":1}},"key":"P6TOTTzK6Q"},{"type":"crossReference","position":{"start":{"line":590,"column":1},"end":{"line":590,"column":1}},"children":[{"type":"text","value":"Section ","key":"ihS3vCrmkn"},{"type":"text","value":"1.5.3.2","key":"c2FzQ505zV"}],"identifier":"policy_iteration","label":"policy_iteration","kind":"heading","template":"Section %s","enumerator":"1.5.3.2","resolved":true,"html_id":"policy-iteration","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"D8eNEQ2Bgx"},{"type":"text","value":"?","position":{"start":{"line":590,"column":1},"end":{"line":590,"column":1}},"key":"alS8APr0Dm"}],"key":"qw8El0zipM"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Policy iteration recap","position":{"start":{"line":592,"column":1},"end":{"line":592,"column":1}},"key":"c0CSYzXjd9"}],"key":"pG2zUazg1M"},{"type":"paragraph","position":{"start":{"line":593,"column":1},"end":{"line":593,"column":1}},"children":[{"type":"text","value":"Recall that policy iteration is an algorithm for MDPs with unknown state transitions where we alternate between these two 
steps:","position":{"start":{"line":593,"column":1},"end":{"line":593,"column":1}},"key":"bIC5h7rwbO"}],"key":"YwfaQZf3OS"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":595,"column":1},"end":{"line":596,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"children":[{"type":"text","value":"Estimating the ","position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"key":"fAS4VdivLC"},{"type":"inlineMath","value":"Q","position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"html":"QQQ","key":"oiUvxVhkR4"},{"type":"text","value":"-function (or advantage function) of the current policy;","position":{"start":{"line":595,"column":1},"end":{"line":595,"column":1}},"key":"WglaQLi6sd"}],"key":"GxihYKXx4W"},{"type":"listItem","spread":true,"position":{"start":{"line":596,"column":1},"end":{"line":596,"column":1}},"children":[{"type":"text","value":"Updating the policy to be greedy with respect to this approximate ","position":{"start":{"line":596,"column":1},"end":{"line":596,"column":1}},"key":"AGCsrKAQ3s"},{"type":"inlineMath","value":"Q","position":{"start":{"line":596,"column":1},"end":{"line":596,"column":1}},"html":"QQQ","key":"z1Y50JFBrq"},{"type":"text","value":"-function (or advantage function).","position":{"start":{"line":596,"column":1},"end":{"line":596,"column":1}},"key":"OKXFpOWLds"}],"key":"MCz82rvWHS"}],"key":"fxv2weNucA"}],"key":"EfZUPYzyLX"},{"type":"paragraph","position":{"start":{"line":599,"column":1},"end":{"line":599,"column":1}},"children":[{"type":"text","value":"To analyze the difference between them, we’ll make use of the ","position":{"start":{"line":599,"column":1},"end":{"line":599,"column":1}},"key":"hYfxhnRzcB"},{"type":"strong","position":{"start":{"line":599,"column":1},"end":{"line":599,"column":1}},"children":[{"type":"text","value":"performance difference lemma","position":{"start":{"line":599,"column":1},"end":{"line":599,"column":1}},"key":"Dnos8j89Ao"}],"key":"tPUKqtneWm"},{"type":"text","value":", which provides an expression for comparing the difference between two value functions.","position":{"start":{"line":599,"column":1},"end":{"line":599,"column":1}},"key":"JGruxsxYg5"}],"key":"Y5xkIVuoiN"},{"type":"proof","kind":"theorem","label":"pdl","identifier":"pdl","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Performance difference lemma","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"ZZyycdD2ko"}],"key":"CbNNDxTpv0"},{"type":"paragraph","position":{"start":{"line":604,"column":1},"end":{"line":607,"column":1}},"children":[{"type":"text","value":"Suppose Alice is playing a game (an MDP).\nBob is spectating, and can evaluate how good an action is compared to his own strategy.\n(That is, Bob can compute his ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"V56XgZ5q4N"},{"type":"emphasis","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"children":[{"type":"text","value":"advantage function","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"uOXXPvwHaf"}],"key":"NcqLhxHePS"},{"type":"text","value":" ","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"hDOJTNoUq9"},{"type":"inlineMath","value":"A_\\hi^{\\text{Bob}}(s_\\hi, 
a_\\hi)","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"html":"AhBob(sh,ah)A_\\hi^{\\text{Bob}}(s_\\hi, a_\\hi)AhBob(sh,ah)","key":"enAp3mtVmq"},{"type":"text","value":").\nThe performance difference lemma says that Bob can now calculate exactly how much better or worse he is than Alice as follows:","position":{"start":{"line":604,"column":1},"end":{"line":604,"column":1}},"key":"s69bVJ2cQT"}],"key":"WnFm3OZwpD"},{"type":"math","value":"V_0^{\\text{Alice}}(s) - V_0^{\\text{Bob}}(s) = \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{h=0}^{H-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right]","label":"pdl_eq","identifier":"pdl_eq","html":"V0Alice(s)V0Bob(s)=EτρAlice,s[h=0H1AhBob(sh,ah)]V_0^{\\text{Alice}}(s) - V_0^{\\text{Bob}}(s) = \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{h=0}^{H-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right]V0Alice(s)V0Bob(s)=EτρAlice,s[h=0H1AhBob(sh,ah)]","enumerator":"30","html_id":"pdl-eq","key":"CbdxXv5pzV"},{"type":"paragraph","position":{"start":{"line":614,"column":1},"end":{"line":614,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":614,"column":1},"end":{"line":614,"column":1}},"key":"rDyJDCmhYI"},{"type":"inlineMath","value":"\\rho_{\\text{Alice}, s}","position":{"start":{"line":614,"column":1},"end":{"line":614,"column":1}},"html":"ρAlice,s\\rho_{\\text{Alice}, s}ρAlice,s","key":"bN9HljE07G"},{"type":"text","value":" denotes the distribution over trajectories starting in state ","position":{"start":{"line":614,"column":1},"end":{"line":614,"column":1}},"key":"dAWj7DFY2V"},{"type":"inlineMath","value":"s","position":{"start":{"line":614,"column":1},"end":{"line":614,"column":1}},"html":"sss","key":"UDraNkJRtN"},{"type":"text","value":" when Alice is playing.","position":{"start":{"line":614,"column":1},"end":{"line":614,"column":1}},"key":"iQaj4SKjeP"}],"key":"yqww23oE1j"},{"type":"paragraph","position":{"start":{"line":616,"column":1},"end":{"line":617,"column":1}},"children":[{"type":"text","value":"To see why, consider a specific step ","position":{"start":{"line":616,"column":1},"end":{"line":616,"column":1}},"key":"l1tT0359B2"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":616,"column":1},"end":{"line":616,"column":1}},"html":"h\\hih","key":"AqXtCEmmnO"},{"type":"text","value":" in the trajectory. We compute how much better actions from Bob are than the actions from Alice, on average.\nBut this is exactly the average Bob-advantage across actions from Alice, as described in the PDL!","position":{"start":{"line":616,"column":1},"end":{"line":616,"column":1}},"key":"uzXYwsJkDF"}],"key":"apz0kpzPRc"},{"type":"paragraph","position":{"start":{"line":619,"column":1},"end":{"line":619,"column":1}},"children":[{"type":"text","value":"Formally, this corresponds to a nice telescoping simplification when we expand out the definition of the advantage function. 
Note that","position":{"start":{"line":619,"column":1},"end":{"line":619,"column":1}},"key":"SSkNhvyi44"}],"key":"qwanYVun84"},{"type":"math","value":"\\begin{aligned}\nA^\\pi_\\hi(s_\\hi, a_\\hi) &= Q^\\pi_\\hi(s_\\hi, a_\\hi) - V^\\pi_\\hi(s_\\hi) \\\\\n&= r_\\hi(s_\\hi, a_\\hi) + \\E_{s_{\\hi+1} \\sim P(s_\\hi, a_\\hi)} [V^\\pi_{\\hi+1}(s_{\\hi+1})] - V^\\pi_\\hi(s_\\hi)\n\\end{aligned}","position":{"start":{"line":621,"column":1},"end":{"line":626,"column":1}},"html":"Ahπ(sh,ah)=Qhπ(sh,ah)Vhπ(sh)=rh(sh,ah)+Esh+1P(sh,ah)[Vh+1π(sh+1)]Vhπ(sh)\\begin{aligned}\nA^\\pi_\\hi(s_\\hi, a_\\hi) &= Q^\\pi_\\hi(s_\\hi, a_\\hi) - V^\\pi_\\hi(s_\\hi) \\\\\n&= r_\\hi(s_\\hi, a_\\hi) + \\E_{s_{\\hi+1} \\sim P(s_\\hi, a_\\hi)} [V^\\pi_{\\hi+1}(s_{\\hi+1})] - V^\\pi_\\hi(s_\\hi)\n\\end{aligned}Ahπ(sh,ah)=Qhπ(sh,ah)Vhπ(sh)=rh(sh,ah)+Esh+1P(sh,ah)[Vh+1π(sh+1)]Vhπ(sh)","enumerator":"31","key":"BSca0O4BT6"},{"type":"paragraph","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"children":[{"type":"text","value":"so expanding out the r.h.s. expression of ","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"OMVdaIHlzI"},{"type":"crossReference","kind":"equation","identifier":"pdl_eq","label":"pdl_eq","children":[{"type":"text","value":"(","key":"TiUYw139S9"},{"type":"text","value":"30","key":"zMbNYaNUwY"},{"type":"text","value":")","key":"S3CEpz91A8"}],"template":"(%s)","enumerator":"30","resolved":true,"html_id":"pdl-eq","key":"NYH5AzyLr1"},{"type":"text","value":" and grouping terms together gives","position":{"start":{"line":628,"column":1},"end":{"line":628,"column":1}},"key":"bhkLkbwwhZ"}],"key":"gSxVwZc6aI"},{"type":"math","value":"\\begin{aligned}\n\\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right] &= \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} r_\\hi(s_\\hi, a_\\hi) \\right) + \\left( V^{\\text{Bob}}_1(s_1) + \\cdots + V^{\\text{Bob}}_\\hor(s_\\hor) \\right) - \\left( V^{\\text{Bob}_0}(s_0) + \\cdots + V^{\\text{Bob}}_{\\hor-1}(s_{\\hor-1}) \\right) \\right] \\\\\n&= V^{\\text{Alice}}_0(s) - V^{\\text{Bob}}_0(s)\n\\end{aligned}","position":{"start":{"line":630,"column":1},"end":{"line":635,"column":1}},"html":"EτρAlice,s[h=0H1AhBob(sh,ah)]=EτρAlice,s[(h=0H1rh(sh,ah))+(V1Bob(s1)++VHBob(sH))(VBob0(s0)++VH1Bob(sH1))]=V0Alice(s)V0Bob(s)\\begin{aligned}\n\\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\text{Bob}} (s_\\hi, a_\\hi) \\right] &= \\E_{\\tau \\sim \\rho_{\\text{Alice}, s}} \\left[ \\left( \\sum_{\\hi=0}^{\\hor-1} r_\\hi(s_\\hi, a_\\hi) \\right) + \\left( V^{\\text{Bob}}_1(s_1) + \\cdots + V^{\\text{Bob}}_\\hor(s_\\hor) \\right) - \\left( V^{\\text{Bob}_0}(s_0) + \\cdots + V^{\\text{Bob}}_{\\hor-1}(s_{\\hor-1}) \\right) \\right] \\\\\n&= V^{\\text{Alice}}_0(s) - V^{\\text{Bob}}_0(s)\n\\end{aligned}EτρAlice,s[h=0H1AhBob(sh,ah)]=EτρAlice,s[(h=0H1rh(sh,ah))+(V1Bob(s1)++VHBob(sH))(VBob0(s0)++VH1Bob(sH1))]=V0Alice(s)V0Bob(s)","enumerator":"32","key":"IeSU434zo0"},{"type":"paragraph","position":{"start":{"line":637,"column":1},"end":{"line":637,"column":1}},"children":[{"type":"text","value":"as desired. 
(Note that the “inner” expectation from expanding the advantage function has the same distribution as the outer one, so omitting it here is valid.)","position":{"start":{"line":637,"column":1},"end":{"line":637,"column":1}},"key":"jsgiCnFC7m"}],"key":"zbAUKwGiKD"}],"enumerator":"1","html_id":"pdl","key":"NEOtn1T1G7"},{"type":"paragraph","position":{"start":{"line":640,"column":1},"end":{"line":645,"column":1}},"children":[{"type":"text","value":"The PDL gives insight into why fitted approaches such as PI don’t work as well in the “full” RL setting.\nTo see why, let’s consider a single iteration of policy iteration, where policy ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"hwCVkuyinR"},{"type":"text","value":"π","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"SSfwMW34gj"},{"type":"text","value":" gets updated to ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"uO13bHR8N4"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"html":"π~\\tilde \\piπ~","key":"yIq7pqooDl"},{"type":"text","value":". We’ll assume these policies are deterministic.\nSuppose the new policy ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"llRxsUleYz"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"html":"π~\\tilde \\piπ~","key":"kNy5dsIRf1"},{"type":"text","value":" chooses some action with a negative advantage with respect to ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"eHdpGumJJa"},{"type":"text","value":"π","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"N3OBO5FAMd"},{"type":"text","value":".\nThat is, when acting according to ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"XYJSXWArRt"},{"type":"text","value":"π","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"dcrsDNRhMh"},{"type":"text","value":", taking the action from ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"WYledyhAH3"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"html":"π~\\tilde \\piπ~","key":"Z5FFDU17u6"},{"type":"text","value":" would perform worse than expected.\nDefine ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"cinxNqAy36"},{"type":"inlineMath","value":"\\Delta_\\infty","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"html":"Δ\\Delta_\\inftyΔ","key":"SGdBZyT0y8"},{"type":"text","value":" to be the most negative advantage, that is, ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"oUjcxEOQ88"},{"type":"inlineMath","value":"\\Delta_\\infty = \\min_{s \\in \\mathcal{S}} A^{\\pi}_\\hi(s, \\tilde \\pi(s))","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"html":"Δ=minsSAhπ(s,π~(s))\\Delta_\\infty = \\min_{s \\in \\mathcal{S}} A^{\\pi}_\\hi(s, \\tilde \\pi(s))Δ=minsSAhπ(s,π~(s))","key":"Y1HSaBHsuB"},{"type":"text","value":".\nPlugging this into the ","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"hmAFJgGRWO"},{"type":"crossReference","kind":"proof:theorem","identifier":"pdl","label":"pdl","children":[{"type":"text","value":"Theorem 
","key":"DVT71eHUJ2"},{"type":"text","value":"1","key":"zV95WVXlPV"}],"template":"Theorem %s","enumerator":"1","resolved":true,"html_id":"pdl","key":"LNIRhANBbb"},{"type":"text","value":" gives","position":{"start":{"line":640,"column":1},"end":{"line":640,"column":1}},"key":"i0HNs0OdWR"}],"key":"PMKspaDfEZ"},{"type":"math","value":"\\begin{aligned}\nV_0^{\\tilde \\pi}(s) - V_0^{\\pi}(s) &= \\E_{\\tau \\sim \\rho_{\\tilde \\pi, s}} \\left[\n\\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\pi}(s_\\hi, a_\\hi)\n\\right] \\\\\n&\\ge H \\Delta_\\infty \\\\\nV_0^{\\tilde \\pi}(s) &\\ge V_0^{\\pi}(s) - H|\\Delta_\\infty|.\n\\end{aligned}","position":{"start":{"line":647,"column":1},"end":{"line":655,"column":1}},"html":"V0π~(s)V0π(s)=Eτρπ~,s[h=0H1Ahπ(sh,ah)]HΔV0π~(s)V0π(s)HΔ.\\begin{aligned}\nV_0^{\\tilde \\pi}(s) - V_0^{\\pi}(s) &= \\E_{\\tau \\sim \\rho_{\\tilde \\pi, s}} \\left[\n\\sum_{\\hi=0}^{\\hor-1} A_\\hi^{\\pi}(s_\\hi, a_\\hi)\n\\right] \\\\\n&\\ge H \\Delta_\\infty \\\\\nV_0^{\\tilde \\pi}(s) &\\ge V_0^{\\pi}(s) - H|\\Delta_\\infty|.\n\\end{aligned}V0π~(s)V0π(s)V0π~(s)=Eτρπ~,s[h=0H1Ahπ(sh,ah)]HΔV0π(s)HΔ∣.","enumerator":"33","key":"c5dyAP1EbF"},{"type":"paragraph","position":{"start":{"line":657,"column":1},"end":{"line":663,"column":1}},"children":[{"type":"text","value":"That is, for some state ","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"phBCn8kdEL"},{"type":"inlineMath","value":"s","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"html":"sss","key":"HbRQYCvX0n"},{"type":"text","value":", the lower bound on the performance of ","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"iWmaSHAJ0w"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"html":"π~\\tilde \\piπ~","key":"ttlXH5scE9"},{"type":"text","value":" is ","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"xHLadPp6Ci"},{"type":"emphasis","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"children":[{"type":"text","value":"lower","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"rDdyeHkODj"}],"key":"lmGB7RvuNv"},{"type":"text","value":" than the performance of ","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"KCfQqXijZy"},{"type":"text","value":"π","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"uQiC7Hyo6Z"},{"type":"text","value":".\nThis doesn’t state that ","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"i9MKhK9kJt"},{"type":"inlineMath","value":"\\tilde \\pi","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"html":"π~\\tilde \\piπ~","key":"lujhSqmORr"},{"type":"text","value":" ","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"VAdgF73eVq"},{"type":"emphasis","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"children":[{"type":"text","value":"will","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"oqqcVIuzut"}],"key":"rKWnGK5sYu"},{"type":"text","value":" necessarily perform worse than ","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"eVX1RF80sj"},{"type":"text","value":"π","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"gaESpcdRGc"},{"type":"text","value":",\nonly suggests that it might be possible.\nIf 
these worst-case states do exist, though,\nPI does not avoid situations where the new policy often visits them;\nit does not enforce that the trajectory distributions ","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"M5yGpWXT3z"},{"type":"inlineMath","value":"\\rho_\\pi","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"html":"ρπ\\rho_\\piρπ","key":"l02rqygX3n"},{"type":"text","value":" and ","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"LHLdkCuGye"},{"type":"inlineMath","value":"\\rho_{\\tilde \\pi}","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"html":"ρπ~\\rho_{\\tilde \\pi}ρπ~","key":"YfHIv4D3L9"},{"type":"text","value":" be close to each other.\nIn other words, the “training distribution” that our prediction rule is fitted on, ","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"O2pGuu8fgA"},{"type":"inlineMath","value":"\\rho_\\pi","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"html":"ρπ\\rho_\\piρπ","key":"czuvrz2HgN"},{"type":"text","value":", may differ significantly from the “evaluation distribution” ","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"epLxwVELpN"},{"type":"inlineMath","value":"\\rho_{\\tilde \\pi}","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"html":"ρπ~\\rho_{\\tilde \\pi}ρπ~","key":"C6u0xTb9ae"},{"type":"text","value":".","position":{"start":{"line":657,"column":1},"end":{"line":657,"column":1}},"key":"lUqy2PeDgC"}],"key":"Wq5Q6N8iSj"},{"type":"comment","value":" \nThis is an instance of *distributional shift*.\nTo begin, let's ask, where *do* fitted approaches work well?\nThey are commonly seen in SL,\nwhere a prediction rule is fit using some labelled training set,\nand then assessed on a test set from the same distribution.\nBut policy iteration isn't performed in the same scenario:\nthere is now _distributional shift_ between the different iterations of the policy. 
","key":"T3FzeOJxvt"},{"type":"paragraph","position":{"start":{"line":674,"column":1},"end":{"line":680,"column":1}},"children":[{"type":"text","value":"On the other hand, policy gradient methods ","position":{"start":{"line":674,"column":1},"end":{"line":674,"column":1}},"key":"s1O730NOKa"},{"type":"emphasis","position":{"start":{"line":674,"column":1},"end":{"line":674,"column":1}},"children":[{"type":"text","value":"do","position":{"start":{"line":674,"column":1},"end":{"line":674,"column":1}},"key":"qic4O9SI2z"}],"key":"iGrYn934Tn"},{"type":"text","value":", albeit implicitly,\nencourage ","position":{"start":{"line":674,"column":1},"end":{"line":674,"column":1}},"key":"XlBqXF3pth"},{"type":"inlineMath","value":"\\rho_\\pi","position":{"start":{"line":674,"column":1},"end":{"line":674,"column":1}},"html":"ρπ\\rho_\\piρπ","key":"oEEqpwFwGO"},{"type":"text","value":" and ","position":{"start":{"line":674,"column":1},"end":{"line":674,"column":1}},"key":"YP3T1xATmj"},{"type":"inlineMath","value":"\\rho_{\\tilde \\pi}","position":{"start":{"line":674,"column":1},"end":{"line":674,"column":1}},"html":"ρπ~\\rho_{\\tilde \\pi}ρπ~","key":"XIavpj4gro"},{"type":"text","value":" to be similar.\nSuppose that the mapping from policy parameters to trajectory distributions is relatively smooth.\nThen, by adjusting the parameters only a small distance,\nthe new policy will also have a similar trajectory distribution.\nBut this is not very rigorous, and in practice the parameter-to-distribution mapping may not be so smooth.\nCan we constrain the distance between the resulting distributions more ","position":{"start":{"line":674,"column":1},"end":{"line":674,"column":1}},"key":"c3nKCS8eH1"},{"type":"emphasis","position":{"start":{"line":674,"column":1},"end":{"line":674,"column":1}},"children":[{"type":"text","value":"explicitly","position":{"start":{"line":674,"column":1},"end":{"line":674,"column":1}},"key":"DphHdJcj3z"}],"key":"U9U4TlWZg5"},{"type":"text","value":"?","position":{"start":{"line":674,"column":1},"end":{"line":674,"column":1}},"key":"YtYjOJGVU0"}],"key":"Sh8yCrE2EM"},{"type":"paragraph","position":{"start":{"line":682,"column":1},"end":{"line":682,"column":1}},"children":[{"type":"text","value":"This brings us to the next three methods:","position":{"start":{"line":682,"column":1},"end":{"line":682,"column":1}},"key":"KDQxcGZggI"}],"key":"gugihjjzf9"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":683,"column":1},"end":{"line":686,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":683,"column":1},"end":{"line":683,"column":1}},"children":[{"type":"strong","position":{"start":{"line":683,"column":1},"end":{"line":683,"column":1}},"children":[{"type":"text","value":"trust region policy optimization","position":{"start":{"line":683,"column":1},"end":{"line":683,"column":1}},"key":"EdC0wVEgAb"}],"key":"xfxumnNp8D"},{"type":"text","value":" (TRPO), which explicitly constrains the difference between the distributions before and after each step;","position":{"start":{"line":683,"column":1},"end":{"line":683,"column":1}},"key":"MVAYIdgjUx"}],"key":"FYXzQSoT3F"},{"type":"listItem","spread":true,"position":{"start":{"line":684,"column":1},"end":{"line":684,"column":1}},"children":[{"type":"text","value":"the 
","position":{"start":{"line":684,"column":1},"end":{"line":684,"column":1}},"key":"VT30QfkCQA"},{"type":"strong","position":{"start":{"line":684,"column":1},"end":{"line":684,"column":1}},"children":[{"type":"text","value":"natural policy gradient","position":{"start":{"line":684,"column":1},"end":{"line":684,"column":1}},"key":"ouVUASDp5T"}],"key":"GS5lhMSDav"},{"type":"text","value":" (NPG), a first-order approximation of TRPO;","position":{"start":{"line":684,"column":1},"end":{"line":684,"column":1}},"key":"uSQwGpk02r"}],"key":"aGecjm5utY"},{"type":"listItem","spread":true,"position":{"start":{"line":685,"column":1},"end":{"line":686,"column":1}},"children":[{"type":"strong","position":{"start":{"line":685,"column":1},"end":{"line":685,"column":1}},"children":[{"type":"text","value":"proximal policy optimization","position":{"start":{"line":685,"column":1},"end":{"line":685,"column":1}},"key":"MRDBbRxJFp"}],"key":"y30oMqf4Q0"},{"type":"text","value":" (PPO), a “soft relaxation” of TRPO.","position":{"start":{"line":685,"column":1},"end":{"line":685,"column":1}},"key":"XfrgtdKAcc"}],"key":"eEoSgI76TY"}],"key":"Bly4cFcSrj"}],"key":"McKqcf3vqc"},{"type":"block","position":{"start":{"line":687,"column":1},"end":{"line":687,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":690,"column":1},"end":{"line":690,"column":1}},"children":[{"type":"text","value":"Trust region policy optimization","position":{"start":{"line":690,"column":1},"end":{"line":690,"column":1}},"key":"rZX12mlmr9"}],"identifier":"trust-region-policy-optimization","label":"Trust region policy optimization","html_id":"trust-region-policy-optimization","implicit":true,"enumerator":"7","key":"GxdkuGoTg5"},{"type":"paragraph","position":{"start":{"line":692,"column":1},"end":{"line":696,"column":1}},"children":[{"type":"text","value":"We saw above that policy gradient methods are effective because they implicitly constrain how much the policy changes at each iteration.\nCan we design an algorithm that ","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"key":"aHOnWzsWSW"},{"type":"emphasis","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"children":[{"type":"text","value":"explicitly","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"key":"V6Ty7VU81v"}],"key":"nXNCtG9Cnn"},{"type":"text","value":" constrains the “step size”?\nThat is, we want to ","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"key":"bRoMpq9LNa"},{"type":"emphasis","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"children":[{"type":"text","value":"improve","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"key":"RZWy7AFlzK"}],"key":"z4VR1PtK8g"},{"type":"text","value":" the policy as much as possible,\nmeasured in terms of the r.h.s. 
of the ","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"key":"NSnTai4CUC"},{"type":"crossReference","kind":"proof:theorem","identifier":"pdl","label":"pdl","children":[{"type":"text","value":"Theorem ","key":"uWHuBWsa4H"},{"type":"text","value":"1","key":"ygTyKSshqf"}],"template":"Theorem %s","enumerator":"1","resolved":true,"html_id":"pdl","key":"ILjeRzbT7b"},{"type":"text","value":",\nwhile ensuring that its trajectory distribution does not change too much:","position":{"start":{"line":692,"column":1},"end":{"line":692,"column":1}},"key":"K9rm64bdgr"}],"key":"k7C6aa9S2Z"},{"type":"math","value":"\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta^{\\text{opt}}} \\E_{s_0, \\dots, s_{H-1} \\sim \\pi^{k}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi^{\\theta^\\text{opt}}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] \\\\\n& \\text{where } \\text{distance}(\\rho_{\\theta^{\\text{opt}}}, \\rho_{\\theta^k}) < \\delta\n\\end{aligned}","position":{"start":{"line":698,"column":1},"end":{"line":703,"column":1}},"html":"θk+1argmaxθoptEs0,,sH1πk[h=0H1Eahπθopt(sh)Aπk(sh,ah)]where distance(ρθopt,ρθk)<δ\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta^{\\text{opt}}} \\E_{s_0, \\dots, s_{H-1} \\sim \\pi^{k}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi^{\\theta^\\text{opt}}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] \\\\\n& \\text{where } \\text{distance}(\\rho_{\\theta^{\\text{opt}}}, \\rho_{\\theta^k}) < \\delta\n\\end{aligned}θk+1argθoptmaxEs0,,sH1πk[h=0H1Eahπθopt(sh)Aπk(sh,ah)]where distance(ρθopt,ρθk)<δ","enumerator":"34","key":"LXszJbaQJH"},{"type":"paragraph","position":{"start":{"line":705,"column":1},"end":{"line":711,"column":1}},"children":[{"type":"text","value":"Note that we have made a small change to the r.h.s. 
expression:\nwe use the ","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"MtuX1LS6Xc"},{"type":"emphasis","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"children":[{"type":"text","value":"states","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"hjRX8LhZB3"}],"key":"fPyrYXxDsY"},{"type":"text","value":" sampled from the old policy, and only use the ","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"KFcqjauFOm"},{"type":"emphasis","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"children":[{"type":"text","value":"actions","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"JSUIBxEyge"}],"key":"nIX9WUTMeI"},{"type":"text","value":" from the new policy.\nIt would be computationally infeasible to sample entire trajectories from ","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"pKBi82xh4l"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"MiYETNfVij"},{"type":"text","value":" as we are optimizing over ","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"gNcR8IIm1Z"},{"type":"text","value":"θ","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"IvIEDKbGT7"},{"type":"text","value":".\nOn the other hand, if ","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"gS1frsFqxg"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"qTZHRHQ1qD"},{"type":"text","value":" returns a vector representing a probability distribution over actions,\nthen evaluating the expected advantage with respect to this distribution only requires taking a dot product.\nThis approximation also matches the r.h.s. 
of the PDL to first order in ","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"JJQkGCqvhb"},{"type":"text","value":"θ","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"y6J1u6hjA2"},{"type":"text","value":".\n(We will elaborate more on this later.)","position":{"start":{"line":705,"column":1},"end":{"line":705,"column":1}},"key":"jVDwXXVqOM"}],"key":"XET7AH1IbQ"},{"type":"paragraph","position":{"start":{"line":713,"column":1},"end":{"line":714,"column":1}},"children":[{"type":"text","value":"How do we describe the distance between ","position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"key":"RtbWbiF5YZ"},{"type":"inlineMath","value":"\\rho_{\\theta^{\\text{opt}}}","position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"html":"ρθopt\\rho_{\\theta^{\\text{opt}}}ρθopt","key":"QhVuvHfPOh"},{"type":"text","value":" and ","position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"key":"pE6VRCDlxJ"},{"type":"inlineMath","value":"\\rho_{\\theta^k}","position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"html":"ρθk\\rho_{\\theta^k}ρθk","key":"xMsxFKsjvM"},{"type":"text","value":"?\nWe’ll use the ","position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"key":"ulz4HaP9lT"},{"type":"strong","position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"children":[{"type":"text","value":"Kullback-Leibler divergence (KLD)","position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"key":"jJaskJ3vVJ"}],"key":"cNNNfHNTlP"},{"type":"text","value":":","position":{"start":{"line":713,"column":1},"end":{"line":713,"column":1}},"key":"pESNviFHlc"}],"key":"Hop8ZJ2xbo"},{"type":"proof","kind":"definition","label":"kld","identifier":"kld","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Kullback-Leibler divergence","position":{"start":{"line":716,"column":1},"end":{"line":716,"column":1}},"key":"qftulyAVQs"}],"key":"cuaVrgeEFo"},{"type":"paragraph","position":{"start":{"line":719,"column":1},"end":{"line":719,"column":1}},"children":[{"type":"text","value":"For two PDFs ","position":{"start":{"line":719,"column":1},"end":{"line":719,"column":1}},"key":"LrS8j8RGUc"},{"type":"inlineMath","value":"p, q","position":{"start":{"line":719,"column":1},"end":{"line":719,"column":1}},"html":"p,qp, qp,q","key":"PHQ3TGSEp8"},{"type":"text","value":",","position":{"start":{"line":719,"column":1},"end":{"line":719,"column":1}},"key":"PrybjoVd2z"}],"key":"GlJbxjp5tS"},{"type":"math","value":"\\kl{p}{q} := \\E_{x \\sim p} \\left[ \\log \\frac{p(x)}{q(x)} \\right]","position":{"start":{"line":721,"column":1},"end":{"line":721,"column":1}},"html":"KL(pq):=Exp[logp(x)q(x)]\\kl{p}{q} := \\E_{x \\sim p} \\left[ \\log \\frac{p(x)}{q(x)} \\right]KL(pq):=Exp[logq(x)p(x)]","enumerator":"35","key":"kx6EhuFbfK"},{"type":"paragraph","position":{"start":{"line":723,"column":1},"end":{"line":726,"column":1}},"children":[{"type":"text","value":"This can be interpreted in many different ways, many stemming from information theory.\nOne such interpretation is that ","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"xmCCr2SYb3"},{"type":"inlineMath","value":"\\kl{p}{q}","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"html":"KL(pq)\\kl{p}{q}KL(pq)","key":"A0zZXqAdrp"},{"type":"text","value":" describes my average “surprise” if I 
","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"bRvHBggpMc"},{"type":"emphasis","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"children":[{"type":"text","value":"think","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"WneIDPwDda"}],"key":"DWlkQi0VwA"},{"type":"text","value":" data is being generated by ","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"QszSZO1mVp"},{"type":"inlineMath","value":"q","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"html":"qqq","key":"ONUzo1lFX7"},{"type":"text","value":" but it’s actually generated by ","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"Uas9Bt5jny"},{"type":"inlineMath","value":"p","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"html":"ppp","key":"Xc7kGYn3zs"},{"type":"text","value":".\n(The ","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"wVVwxlPBCE"},{"type":"strong","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"children":[{"type":"text","value":"surprise","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"e8lPocOWvr"}],"key":"gzM8edZukp"},{"type":"text","value":" of an event with probability ","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"fHm7mYQVNj"},{"type":"inlineMath","value":"p","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"html":"ppp","key":"u5uSaZdynO"},{"type":"text","value":" is ","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"zYOeZd9c8i"},{"type":"inlineMath","value":"- \\log_2 p","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"html":"log2p- \\log_2 plog2p","key":"ZGLUDzCVJj"},{"type":"text","value":".)\nNote that ","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"gjABmj9b9R"},{"type":"inlineMath","value":"\\kl{p}{q} = 0","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"html":"KL(pq)=0\\kl{p}{q} = 0KL(pq)=0","key":"eaVwJgpeWO"},{"type":"text","value":" if and only if ","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"I4jfvtsQx6"},{"type":"inlineMath","value":"p = q","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"html":"p=qp = qp=q","key":"ZABhUhZVwR"},{"type":"text","value":". 
Also note that it is generally ","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"ay6bH8gVNq"},{"type":"emphasis","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"b7mxnF5Y2z"}],"key":"tcp5cM08kQ"},{"type":"text","value":" symmetric.","position":{"start":{"line":723,"column":1},"end":{"line":723,"column":1}},"key":"onRbtVYTvp"}],"key":"J1B72GRDV0"}],"enumerator":"3","html_id":"kld","key":"xLb7KWWIVo"},{"type":"paragraph","position":{"start":{"line":729,"column":1},"end":{"line":732,"column":1}},"children":[{"type":"text","value":"Both the objective function and the KLD constraint involve a weighted average over the space of all trajectories.\nThis is intractable in general, so we need to estimate the expectation.\nAs before, we can do this by taking an empirical average over samples from the trajectory distribution.\nThis gives us the following pseudocode:","position":{"start":{"line":729,"column":1},"end":{"line":729,"column":1}},"key":"NN4OhuHk1j"}],"key":"RT4TrdQCJU"},{"type":"proof","kind":"definition","label":"trpo","identifier":"trpo","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Trust region policy optimization (exact)","position":{"start":{"line":734,"column":1},"end":{"line":734,"column":1}},"key":"sbseGkea2J"}],"key":"uQWczmIH2c"},{"type":"code","lang":"python","value":"def trpo_pseudocode(env, δ, θ_init, K, M):\n    θ = θ_init\n    for k in range(K):\n        trajectories = sample_trajectories(env, π(θ), M)\n        A_hat = fit(trajectories)  # estimates the advantage function of π(θ)\n\n        def approximate_gain(θ_):\n            # expected advantage of the candidate policy π(θ_) on states sampled from π(θ)\n            total_advantage = 0\n            for τ in trajectories:\n                for s, _a, _r in τ:\n                    for a in env.action_space:\n                        total_advantage += π(θ_)(s, a) * A_hat(s, a)\n            return total_advantage\n\n        def constraint(θ_):\n            # empirical estimate of the KL divergence between the trajectory\n            # distributions of π(θ) and π(θ_), using the sampled trajectories\n            kl_div = 0\n            for τ in trajectories:\n                for s, a, _r in τ:\n                    kl_div += jnp.log(π(θ)(s, a)) - jnp.log(π(θ_)(s, a))\n            return kl_div <= δ\n\n        θ = optimize(approximate_gain, constraint)\n\n    return θ","position":{"start":{"line":738,"column":1},"end":{"line":763,"column":1}},"key":"ogqu9yXB05"}],"enumerator":"4","html_id":"trpo","key":"sBaw6mAPfg"},{"type":"comment","value":"\nApplying importance sampling allows us to estimate the TRPO objective as follows:\n\n::::{prf:definition} Trust region policy optimization (implementation)\n:label: trpo_implement\n\n:::{prf:definitionic} TODO\nInitialize $\\theta^0$\n\nSample $N$ trajectories from $\\rho^k$ to learn a value estimator $\\tilde b_\\hi(s) \\approx V^{\\pi^k}_\\hi(s)$\n\nSample $M$ trajectories $\\tau_0, \\dots, \\tau_{M-1} \\sim \\rho^k$\n\n$$\\begin{gathered}\n \\theta^{k+1} \\gets \\arg\\max_{\\theta} \\frac{1}{M} \\sum_{m=0}^{M-1} \\sum_{h=0}^{H-1} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} [ R_\\hi(\\tau_m) - \\tilde b_\\hi(s_\\hi) ] \\\\\n \\text{where } \\sum_{m=0}^{M-1} \\sum_{h=0}^{H-1} \\log \\frac{\\pi_k(a_\\hi^m \\mid s_\\hi^m)}{\\pi_\\theta(a_\\hi^m \\mid s_\\hi^m)} \\le \\delta\n \n\\end{gathered}$$\n:::\n:::: ","key":"HiBz4JuFGC"},{"type":"paragraph","position":{"start":{"line":787,"column":1},"end":{"line":794,"column":1}},"children":[{"type":"text","value":"The above isn’t entirely complete:\nwe still need to solve the actual optimization problem at each step.\nUnless we know additional properties of the problem,\nthis might be an intractable optimization.\nDo we need to solve it exactly, though?\nInstead, if we assume that 
both the objective function and the constraint are somewhat smooth in terms of the policy parameters,\nwe can use their ","position":{"start":{"line":787,"column":1},"end":{"line":787,"column":1}},"key":"jpGOPL6b3L"},{"type":"emphasis","position":{"start":{"line":787,"column":1},"end":{"line":787,"column":1}},"children":[{"type":"text","value":"Taylor expansions","position":{"start":{"line":787,"column":1},"end":{"line":787,"column":1}},"key":"U8YPc47OA4"}],"key":"rshT0gOdxW"},{"type":"text","value":" to give us a simpler optimization problem with a closed-form solution.\nThis brings us to the ","position":{"start":{"line":787,"column":1},"end":{"line":787,"column":1}},"key":"PFnRc0EDg2"},{"type":"strong","position":{"start":{"line":787,"column":1},"end":{"line":787,"column":1}},"children":[{"type":"text","value":"natural policy gradient","position":{"start":{"line":787,"column":1},"end":{"line":787,"column":1}},"key":"XFuyBJoLZA"}],"key":"qJAEvwYYeu"},{"type":"text","value":" algorithm.","position":{"start":{"line":787,"column":1},"end":{"line":787,"column":1}},"key":"oolWTeSI9E"}],"key":"K1q3U8I9ia"}],"key":"AUIVGTH5cH"},{"type":"block","position":{"start":{"line":796,"column":1},"end":{"line":796,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":798,"column":1},"end":{"line":798,"column":1}},"children":[{"type":"text","value":"Natural policy gradient","position":{"start":{"line":798,"column":1},"end":{"line":798,"column":1}},"key":"pAkpR9uvHc"}],"identifier":"natural-policy-gradient","label":"Natural policy gradient","html_id":"natural-policy-gradient","implicit":true,"enumerator":"8","key":"wasP5ZkCtT"},{"type":"paragraph","position":{"start":{"line":800,"column":1},"end":{"line":801,"column":1}},"children":[{"type":"text","value":"We take a ","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"key":"dR3e8aw2Qw"},{"type":"emphasis","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"children":[{"type":"text","value":"linear","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"key":"rk5lB0V2NT"}],"key":"fUlydO7jxD"},{"type":"text","value":" (first-order) approximation to the objective function and a ","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"key":"lg1RYfOpGm"},{"type":"emphasis","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"children":[{"type":"text","value":"quadratic","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"key":"GKei9iyMWx"}],"key":"dot6wMnjBi"},{"type":"text","value":" (second-order) approximation to the KL divergence constraint about the current estimate ","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"key":"j2zEp9vZXE"},{"type":"inlineMath","value":"\\theta^k","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"html":"θk\\theta^kθk","key":"zuzqW2IPik"},{"type":"text","value":".\nThis results in the optimization problem","position":{"start":{"line":800,"column":1},"end":{"line":800,"column":1}},"key":"FMlpGj6oaE"}],"key":"z3fs6CAiJA"},{"type":"math","value":"\\begin{gathered}\n \\max_\\theta \\nabla_\\theta J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) \\\\\n \\text{where } \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) \\le \\delta\n\\end{gathered}","label":"npg_optimization","identifier":"npg_optimization","html":"maxθθJ(πθk)(θθk)where 12(θθk)Fθk(θθk)δ\\begin{gathered}\n \\max_\\theta 
\\nabla_\\theta J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) \\\\\n \\text{where } \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) \\le \\delta\n\\end{gathered}θmaxθJ(πθk)(θθk)where 21(θθk)Fθk(θθk)δ","enumerator":"36","html_id":"npg-optimization","key":"Othfd9nKv0"},{"type":"paragraph","position":{"start":{"line":812,"column":1},"end":{"line":812,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":812,"column":1},"end":{"line":812,"column":1}},"key":"v9x7oKqf8n"},{"type":"inlineMath","value":"F_{\\theta^k}","position":{"start":{"line":812,"column":1},"end":{"line":812,"column":1}},"html":"FθkF_{\\theta^k}Fθk","key":"aVEeKny1r1"},{"type":"text","value":" is the ","position":{"start":{"line":812,"column":1},"end":{"line":812,"column":1}},"key":"yKYoFwUDcZ"},{"type":"strong","position":{"start":{"line":812,"column":1},"end":{"line":812,"column":1}},"children":[{"type":"text","value":"Fisher information matrix","position":{"start":{"line":812,"column":1},"end":{"line":812,"column":1}},"key":"icM6DuCFlQ"}],"key":"hzrc4bkVha"},{"type":"text","value":" defined below.","position":{"start":{"line":812,"column":1},"end":{"line":812,"column":1}},"key":"BDJZGtCF1c"}],"key":"zHsYS3TTjN"},{"type":"proof","kind":"definition","label":"fisher_matrix","identifier":"fisher_matrix","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Fisher information matrix","position":{"start":{"line":814,"column":1},"end":{"line":814,"column":1}},"key":"w2qkj55qEu"}],"key":"hv09WlSi4S"},{"type":"paragraph","position":{"start":{"line":817,"column":1},"end":{"line":818,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":817,"column":1},"end":{"line":817,"column":1}},"key":"kX7RpOanqG"},{"type":"inlineMath","value":"p_\\theta","position":{"start":{"line":817,"column":1},"end":{"line":817,"column":1}},"html":"pθp_\\thetapθ","key":"GSxOMXHGQP"},{"type":"text","value":" denote a parameterized distribution.\nIts Fisher information matrix ","position":{"start":{"line":817,"column":1},"end":{"line":817,"column":1}},"key":"NgUcNGCWaT"},{"type":"inlineMath","value":"F_\\theta","position":{"start":{"line":817,"column":1},"end":{"line":817,"column":1}},"html":"FθF_\\thetaFθ","key":"kra1RHxYNW"},{"type":"text","value":" can be defined equivalently as:","position":{"start":{"line":817,"column":1},"end":{"line":817,"column":1}},"key":"TU6a9uuUU2"}],"key":"qqe9ahBU54"},{"type":"math","value":"\\begin{aligned}\n F_{\\theta} & = \\E_{x \\sim p_\\theta} \\left[ (\\nabla_\\theta \\log p_\\theta(x)) (\\nabla_\\theta \\log p_\\theta(x))^\\top \\right] & \\text{covariance matrix of the Fisher score} \\\\\n & = \\E_{x \\sim p_{\\theta}} [- \\nabla_\\theta^2 \\log p_\\theta(x)] & \\text{average Hessian of the negative log-likelihood}\n\\end{aligned}","position":{"start":{"line":820,"column":1},"end":{"line":825,"column":1}},"html":"Fθ=Expθ[(θlogpθ(x))(θlogpθ(x))]covariance matrix of the Fisher score=Expθ[θ2logpθ(x)]average Hessian of the negative log-likelihood\\begin{aligned}\n F_{\\theta} & = \\E_{x \\sim p_\\theta} \\left[ (\\nabla_\\theta \\log p_\\theta(x)) (\\nabla_\\theta \\log p_\\theta(x))^\\top \\right] & \\text{covariance matrix of the Fisher score} \\\\\n & = \\E_{x \\sim p_{\\theta}} [- \\nabla_\\theta^2 \\log p_\\theta(x)] & \\text{average Hessian of the negative log-likelihood}\n\\end{aligned}Fθ=Expθ[(θlogpθ(x))(θlogpθ(x))]=Expθ[θ2logpθ(x)]covariance matrix of the Fisher scoreaverage Hessian of the 
negative log-likelihood","enumerator":"37","key":"jMl1AwCB8E"},{"type":"paragraph","position":{"start":{"line":827,"column":1},"end":{"line":830,"column":1}},"children":[{"type":"text","value":"Recall that the Hessian of a function describes its curvature:\nfor a vector ","position":{"start":{"line":827,"column":1},"end":{"line":827,"column":1}},"key":"vKgw2ANuHW"},{"type":"inlineMath","value":"\\delta \\in \\Theta","position":{"start":{"line":827,"column":1},"end":{"line":827,"column":1}},"html":"δΘ\\delta \\in \\ThetaδΘ","key":"AIfOh3jb9C"},{"type":"text","value":",\nthe quantity ","position":{"start":{"line":827,"column":1},"end":{"line":827,"column":1}},"key":"auLqpaGRs4"},{"type":"inlineMath","value":"\\delta^\\top F_\\theta \\delta","position":{"start":{"line":827,"column":1},"end":{"line":827,"column":1}},"html":"δFθδ\\delta^\\top F_\\theta \\deltaδFθδ","key":"yzZpckYI0L"},{"type":"text","value":" describes how rapidly the negative log-likelihood changes if we move by ","position":{"start":{"line":827,"column":1},"end":{"line":827,"column":1}},"key":"LkTeu2i95k"},{"type":"text","value":"δ","position":{"start":{"line":827,"column":1},"end":{"line":827,"column":1}},"key":"kRCVWBlQ2d"},{"type":"text","value":".\nThe Fisher information matrix is precisely the Hessian of the KL divergence (with respect to either one of the parameters).","position":{"start":{"line":827,"column":1},"end":{"line":827,"column":1}},"key":"pEcDfzYdLk"}],"key":"DxiCBZFSnE"},{"type":"paragraph","position":{"start":{"line":832,"column":1},"end":{"line":832,"column":1}},"children":[{"type":"text","value":"In particular, when ","position":{"start":{"line":832,"column":1},"end":{"line":832,"column":1}},"key":"bBoWM36rwg"},{"type":"inlineMath","value":"p_\\theta = \\rho_{\\theta}","position":{"start":{"line":832,"column":1},"end":{"line":832,"column":1}},"html":"pθ=ρθp_\\theta = \\rho_{\\theta}pθ=ρθ","key":"tbneiOgOrg"},{"type":"text","value":" denotes a trajectory distribution, we can further simplify the expression:","position":{"start":{"line":832,"column":1},"end":{"line":832,"column":1}},"key":"B0GgvqRXJt"}],"key":"ox3fNT7IOg"},{"type":"math","value":"F_{\\theta} = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{h=0}^{H-1} (\\nabla \\log \\pi_\\theta (a_\\hi \\mid s_\\hi)) (\\nabla \\log \\pi_\\theta(a_\\hi \\mid s_\\hi))^\\top \\right]","label":"fisher_trajectory","identifier":"fisher_trajectory","html":"Fθ=Eτρθ[h=0H1(logπθ(ahsh))(logπθ(ahsh))]F_{\\theta} = \\E_{\\tau \\sim \\rho_\\theta} \\left[ \\sum_{h=0}^{H-1} (\\nabla \\log \\pi_\\theta (a_\\hi \\mid s_\\hi)) (\\nabla \\log \\pi_\\theta(a_\\hi \\mid s_\\hi))^\\top \\right]Fθ=Eτρθ[h=0H1(logπθ(ahsh))(logπθ(ahsh))]","enumerator":"38","html_id":"fisher-trajectory","key":"MvtMInHyYc"},{"type":"paragraph","position":{"start":{"line":840,"column":1},"end":{"line":840,"column":1}},"children":[{"type":"text","value":"Note that we’ve used the Markov property to cancel out the cross terms corresponding to two different time steps.","position":{"start":{"line":840,"column":1},"end":{"line":840,"column":1}},"key":"lfFgmNKJCu"}],"key":"c6xcVO5JaV"}],"enumerator":"5","html_id":"fisher-matrix","key":"qdehJGGYDL"},{"type":"paragraph","position":{"start":{"line":843,"column":1},"end":{"line":848,"column":1}},"children":[{"type":"text","value":"This is a convex optimization problem with a closed-form solution.\nTo see why, it helps to visualize the case where 
","position":{"start":{"line":843,"column":1},"end":{"line":843,"column":1}},"key":"drnWYUm0OP"},{"type":"text","value":"θ","position":{"start":{"line":843,"column":1},"end":{"line":843,"column":1}},"key":"sLtdTBl2Mr"},{"type":"text","value":" is two-dimensional:\nthe constraint describes the inside of an ellipse,\nand the objective function is linear,\nso we can find the extreme point on the boundary of the ellipse.\nWe recommend ","position":{"start":{"line":843,"column":1},"end":{"line":843,"column":1}},"key":"sYErywypaY"},{"type":"cite","kind":"narrative","label":"boyd_convex_2004","identifier":"boyd_convex_2004","children":[{"type":"text","value":"Boyd & Vandenberghe (2004)","key":"s3k3QsFvXT"}],"enumerator":"3","key":"vnGPisAoc3"},{"type":"text","value":" for a comprehensive treatment of convex optimization.","position":{"start":{"line":843,"column":1},"end":{"line":843,"column":1}},"key":"xvAy4GfIFw"}],"key":"VRfONa3ySV"},{"type":"paragraph","position":{"start":{"line":850,"column":1},"end":{"line":851,"column":1}},"children":[{"type":"text","value":"More generally, for a higher-dimensional ","position":{"start":{"line":850,"column":1},"end":{"line":850,"column":1}},"key":"n2GCI4YQvv"},{"type":"text","value":"θ","position":{"start":{"line":850,"column":1},"end":{"line":850,"column":1}},"key":"l7moUWU90t"},{"type":"text","value":",\nwe can compute the global optima by setting the gradient of the Lagrangian to zero:","position":{"start":{"line":850,"column":1},"end":{"line":850,"column":1}},"key":"vcZRaUeIAG"}],"key":"Yn7XQHFQ17"},{"type":"math","value":"\\begin{aligned}\n \\mathcal{L}(\\theta, \\alpha) & = \\nabla J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) - \\alpha \\left[ \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) - \\delta \\right] \\\\\n \\nabla \\mathcal{L}(\\theta^{k+1}, \\alpha) & := 0 \\\\\n \\implies \\nabla J(\\pi_{\\theta^k}) & = \\alpha F_{\\theta^k} (\\theta^{k+1} - \\theta^k) \\\\\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k}) \\\\\n \\text{where } \\eta & = \\sqrt{\\frac{2 \\delta}{\\nabla J(\\pi_{\\theta^k})^\\top F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k})}}\n\\end{aligned}","position":{"start":{"line":853,"column":1},"end":{"line":861,"column":1}},"html":"L(θ,α)=J(πθk)(θθk)α[12(θθk)Fθk(θθk)δ]L(θk+1,α):=0    J(πθk)=αFθk(θk+1θk)θk+1=θk+ηFθk1J(πθk)where η=2δJ(πθk)Fθk1J(πθk)\\begin{aligned}\n \\mathcal{L}(\\theta, \\alpha) & = \\nabla J(\\pi_{\\theta^k})^\\top (\\theta - \\theta^k) - \\alpha \\left[ \\frac{1}{2} (\\theta - \\theta^k)^\\top F_{\\theta^k} (\\theta - \\theta^k) - \\delta \\right] \\\\\n \\nabla \\mathcal{L}(\\theta^{k+1}, \\alpha) & := 0 \\\\\n \\implies \\nabla J(\\pi_{\\theta^k}) & = \\alpha F_{\\theta^k} (\\theta^{k+1} - \\theta^k) \\\\\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k}) \\\\\n \\text{where } \\eta & = \\sqrt{\\frac{2 \\delta}{\\nabla J(\\pi_{\\theta^k})^\\top F_{\\theta^k}^{-1} \\nabla J(\\pi_{\\theta^k})}}\n\\end{aligned}L(θ,α)L(θk+1,α)J(πθk)θk+1where η=J(πθk)(θθk)α[21(θθk)Fθk(θθk)δ]:=0=αFθk(θk+1θk)=θk+ηFθk1J(πθk)=J(πθk)Fθk1J(πθk)2δ","enumerator":"39","key":"K3FzA5rdDR"},{"type":"paragraph","position":{"start":{"line":863,"column":1},"end":{"line":865,"column":1}},"children":[{"type":"text","value":"This gives us the closed-form update.\nNow the only challenge is to estimate the Fisher information matrix,\nsince, as with the KL divergence constraint, it is an expectation over trajectories, and computing it exactly is 
therefore typically intractable.","position":{"start":{"line":863,"column":1},"end":{"line":863,"column":1}},"key":"Xhus098K5i"}],"key":"d8FCrIVvVW"},{"type":"proof","kind":"definition","label":"npg","identifier":"npg","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Natural policy gradient","position":{"start":{"line":867,"column":1},"end":{"line":867,"column":1}},"key":"RejS4cCOaS"}],"key":"oEJWHrXJo3"},{"type":"paragraph","position":{"start":{"line":870,"column":1},"end":{"line":872,"column":1}},"children":[{"type":"text","value":"How many trajectory samples do we need to accurately estimate the Fisher information matrix?\nAs a rule of thumb, the sample complexity should scale with the dimension of the parameter space.\nThis makes this approach intractable in the deep learning setting where we might have a very large number of parameters.","position":{"start":{"line":870,"column":1},"end":{"line":870,"column":1}},"key":"aH3Ytm1MQI"}],"key":"GucicwzDyn"}],"enumerator":"6","html_id":"npg","key":"vbkgrU9hRB"},{"type":"paragraph","position":{"start":{"line":875,"column":1},"end":{"line":880,"column":1}},"children":[{"type":"text","value":"As you can see, the NPG is the “basic” policy gradient algorithm we saw above,\nbut with the gradient transformed by the inverse Fisher information matrix.\nThis matrix can be understood as accounting for the ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"xWJ3Yft5oW"},{"type":"strong","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"children":[{"type":"text","value":"geometry of the parameter space.","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"aBXvYCbAjn"}],"key":"F3JR3QkkhB"},{"type":"text","value":"\nThe typical gradient descent algorithm implicitly measures distances between parameters using the typical ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"ufp8azaRQJ"},{"type":"emphasis","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"children":[{"type":"text","value":"Euclidean distance","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"IDNjE35mLD"}],"key":"VPcnptQPM7"},{"type":"text","value":".\nHere, where the parameters map to a ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"fIfh2oE7XO"},{"type":"emphasis","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"children":[{"type":"text","value":"distribution","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"UmF2bMLs1q"}],"key":"bjEqFEjqf8"},{"type":"text","value":", using the natural gradient update is equivalent to optimizing over ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"AbFiK4Mmqu"},{"type":"strong","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"children":[{"type":"text","value":"distribution space","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"NUFNmGU3db"}],"key":"Zm366lsUby"},{"type":"text","value":" rather than parameter space,\nwhere distance between distributions is measured by the ","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"Xd54rvhSdg"},{"type":"crossReference","kind":"proof:definition","identifier":"kld","label":"kld","children":[{"type":"text","value":"Definition 
","key":"ok7LYrB2Vs"},{"type":"text","value":"3","key":"RIEFTHha9F"}],"template":"Definition %s","enumerator":"3","resolved":true,"html_id":"kld","key":"qhoid32BkY"},{"type":"text","value":".","position":{"start":{"line":875,"column":1},"end":{"line":875,"column":1}},"key":"i9laOkqMNe"}],"key":"fddvKHX4nc"},{"type":"proof","kind":"example","label":"natural_simple","identifier":"natural_simple","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Natural gradient on a simple problem","position":{"start":{"line":882,"column":1},"end":{"line":882,"column":1}},"key":"w2yNHrO0LD"}],"key":"SV3vzc8AiU"},{"type":"paragraph","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"children":[{"type":"text","value":"Let’s step away from RL and consider the following optimization problem over Bernoulli distributions ","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"key":"HRPVicGmJ1"},{"type":"inlineMath","value":"\\pi \\in \\Delta(\\{ 0, 1 \\})","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"html":"πΔ({0,1})\\pi \\in \\Delta(\\{ 0, 1 \\})πΔ({0,1})","key":"KaRw0dKpwM"},{"type":"text","value":":","position":{"start":{"line":885,"column":1},"end":{"line":885,"column":1}},"key":"sY0uUaf7g3"}],"key":"hwuBueQlue"},{"type":"math","value":"\\begin{aligned}\n J(\\pi) & = 100 \\cdot \\pi(1) + 1 \\cdot \\pi(0)\n\\end{aligned}","position":{"start":{"line":887,"column":1},"end":{"line":891,"column":1}},"html":"J(π)=100π(1)+1π(0)\\begin{aligned}\n J(\\pi) & = 100 \\cdot \\pi(1) + 1 \\cdot \\pi(0)\n\\end{aligned}J(π)=100π(1)+1π(0)","enumerator":"40","key":"yhD3MREiKR"},{"type":"paragraph","position":{"start":{"line":893,"column":1},"end":{"line":893,"column":1}},"children":[{"type":"text","value":"We can think of the space of such distributions as the line between ","position":{"start":{"line":893,"column":1},"end":{"line":893,"column":1}},"key":"h8DBO3siYf"},{"type":"inlineMath","value":"(0, 1)","position":{"start":{"line":893,"column":1},"end":{"line":893,"column":1}},"html":"(0,1)(0, 1)(0,1)","key":"HgHh6czUY8"},{"type":"text","value":" to ","position":{"start":{"line":893,"column":1},"end":{"line":893,"column":1}},"key":"ZCxSOHkh0N"},{"type":"inlineMath","value":"(1, 0)","position":{"start":{"line":893,"column":1},"end":{"line":893,"column":1}},"html":"(1,0)(1, 0)(1,0)","key":"Y2u2Gl2Bwt"},{"type":"text","value":" on the Cartesian plane:","position":{"start":{"line":893,"column":1},"end":{"line":893,"column":1}},"key":"J3NqGLL9Xg"}],"key":"EJjzAb1dC5"},{"type":"image","url":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.png","alt":"a line from (0, 1) to (1, 0)","width":"240px","align":"center","key":"pFrguOd8iS","urlSource":"shared/npg_line.png","urlOptimized":"/build/npg_line-18dfc6d5286c25a94643b5e115d15484.webp"},{"type":"paragraph","position":{"start":{"line":901,"column":1},"end":{"line":903,"column":1}},"children":[{"type":"text","value":"Clearly the optimal distribution is the constant one ","position":{"start":{"line":901,"column":1},"end":{"line":901,"column":1}},"key":"tDDDUuvpLw"},{"type":"inlineMath","value":"\\pi(1) = 1","position":{"start":{"line":901,"column":1},"end":{"line":901,"column":1}},"html":"π(1)=1\\pi(1) = 1π(1)=1","key":"w7ubVOk0d4"},{"type":"text","value":". 
Suppose we optimize over the parameterized family ","position":{"start":{"line":901,"column":1},"end":{"line":901,"column":1}},"key":"DBONgIV0ar"},{"type":"inlineMath","value":"\\pi_\\theta(1) = \\frac{\\exp(\\theta)}{1+\\exp(\\theta)}","position":{"start":{"line":901,"column":1},"end":{"line":901,"column":1}},"html":"πθ(1)=exp(θ)1+exp(θ)\\pi_\\theta(1) = \\frac{\\exp(\\theta)}{1+\\exp(\\theta)}πθ(1)=1+exp(θ)exp(θ)","key":"uaSBc3f2rt"},{"type":"text","value":".\nThen our optimization algorithm should set ","position":{"start":{"line":901,"column":1},"end":{"line":901,"column":1}},"key":"cVhhK15TYQ"},{"type":"text","value":"θ","position":{"start":{"line":901,"column":1},"end":{"line":901,"column":1}},"key":"hvlzBTGKl3"},{"type":"text","value":" to be unboundedly large.\nThen the “vanilla” gradient is","position":{"start":{"line":901,"column":1},"end":{"line":901,"column":1}},"key":"Zlort5AVqR"}],"key":"roy47UkItf"},{"type":"math","value":"\\nabla_\\theta J(\\pi_\\theta) = \\frac{99 \\exp(\\theta)}{(1 + \\exp(\\theta))^2}.","position":{"start":{"line":905,"column":1},"end":{"line":905,"column":1}},"html":"θJ(πθ)=99exp(θ)(1+exp(θ))2.\\nabla_\\theta J(\\pi_\\theta) = \\frac{99 \\exp(\\theta)}{(1 + \\exp(\\theta))^2}.θJ(πθ)=(1+exp(θ))299exp(θ).","enumerator":"41","key":"qSW69Y4Vec"},{"type":"paragraph","position":{"start":{"line":907,"column":1},"end":{"line":908,"column":1}},"children":[{"type":"text","value":"Note that as ","position":{"start":{"line":907,"column":1},"end":{"line":907,"column":1}},"key":"OzxEZBFnfv"},{"type":"inlineMath","value":"\\theta \\to \\infty","position":{"start":{"line":907,"column":1},"end":{"line":907,"column":1}},"html":"θ\\theta \\to \\inftyθ","key":"rV0Y4hLTLM"},{"type":"text","value":" that the increments get closer and closer to ","position":{"start":{"line":907,"column":1},"end":{"line":907,"column":1}},"key":"OYjuYpx8r5"},{"type":"text","value":"0","position":{"start":{"line":907,"column":1},"end":{"line":907,"column":1}},"key":"jUptouXhbg"},{"type":"text","value":";\nthe rate of increase becomes exponentially slow.","position":{"start":{"line":907,"column":1},"end":{"line":907,"column":1}},"key":"Wh3I9U297U"}],"key":"ex3Io0wT4I"},{"type":"paragraph","position":{"start":{"line":911,"column":1},"end":{"line":911,"column":1}},"children":[{"type":"text","value":"However, if we compute the Fisher information “matrix” (which is just a scalar in this case), we can account for the geometry induced by the parameterization.","position":{"start":{"line":911,"column":1},"end":{"line":911,"column":1}},"key":"dQMssKzwjq"}],"key":"t1SMd1NKkR"},{"type":"math","value":"\\begin{aligned}\n F_\\theta & = \\E_{x \\sim \\pi_\\theta} [ (\\nabla_\\theta \\log \\pi_\\theta(x))^2 ] \\\\\n & = \\frac{\\exp(\\theta)}{(1 + \\exp(\\theta))^2}.\n\\end{aligned}","position":{"start":{"line":913,"column":1},"end":{"line":918,"column":1}},"html":"Fθ=Exπθ[(θlogπθ(x))2]=exp(θ)(1+exp(θ))2.\\begin{aligned}\n F_\\theta & = \\E_{x \\sim \\pi_\\theta} [ (\\nabla_\\theta \\log \\pi_\\theta(x))^2 ] \\\\\n & = \\frac{\\exp(\\theta)}{(1 + \\exp(\\theta))^2}.\n\\end{aligned}Fθ=Exπθ[(θlogπθ(x))2]=(1+exp(θ))2exp(θ).","enumerator":"42","key":"E0ztjEv2H8"},{"type":"paragraph","position":{"start":{"line":920,"column":1},"end":{"line":920,"column":1}},"children":[{"type":"text","value":"This gives the natural gradient update","position":{"start":{"line":920,"column":1},"end":{"line":920,"column":1}},"key":"Tu2arZf25e"}],"key":"dc6gfWHfjA"},{"type":"math","value":"\\begin{aligned}\n \\theta^{k+1} & = \\theta^k 
+ \\eta F_{\\theta^k}^{-1} \\nabla_ \\theta J(\\theta^k) \\\\\n & = \\theta^k + 99 \\eta\n\\end{aligned}","position":{"start":{"line":922,"column":1},"end":{"line":927,"column":1}},"html":"θk+1=θk+ηFθk1θJ(θk)=θk+99η\\begin{aligned}\n \\theta^{k+1} & = \\theta^k + \\eta F_{\\theta^k}^{-1} \\nabla_ \\theta J(\\theta^k) \\\\\n & = \\theta^k + 99 \\eta\n\\end{aligned}θk+1=θk+ηFθk1θJ(θk)=θk+99η","enumerator":"43","key":"mUU0utUTkN"},{"type":"paragraph","position":{"start":{"line":929,"column":1},"end":{"line":929,"column":1}},"children":[{"type":"text","value":"which increases at a constant rate, i.e. improves the objective more quickly than “vanilla” gradient ascent.","position":{"start":{"line":929,"column":1},"end":{"line":929,"column":1}},"key":"mhNN7jWjzT"}],"key":"VEvutOFdXz"}],"enumerator":"5","html_id":"natural-simple","key":"NrmNb9n9Lq"},{"type":"paragraph","position":{"start":{"line":932,"column":1},"end":{"line":936,"column":1}},"children":[{"type":"text","value":"Though the NPG now gives a closed-form optimization step,\nit requires computing the inverse Fisher information matrix,\nwhich typically scales as ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"x6gqQvmly1"},{"type":"inlineMath","value":"O((\\dim \\Theta)^3)","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"html":"O((dimΘ)3)O((\\dim \\Theta)^3)O((dimΘ)3)","key":"GiFTB3yiHt"},{"type":"text","value":".\nThis can be expensive if the parameter space is large.\nCan we find an algorithm that works in ","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"gfT8tJ6ytZ"},{"type":"emphasis","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"children":[{"type":"text","value":"linear time","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"lgPRMavYNQ"}],"key":"XZF22Jzdk8"},{"type":"text","value":" with respect to the dimension of the parameter space?","position":{"start":{"line":932,"column":1},"end":{"line":932,"column":1}},"key":"yIO6NugH63"}],"key":"KyinGQajx6"}],"key":"IgIU3URgIe"},{"type":"block","position":{"start":{"line":938,"column":1},"end":{"line":938,"column":1}},"children":[{"type":"heading","depth":2,"position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"children":[{"type":"text","value":"Proximal policy optimization","position":{"start":{"line":940,"column":1},"end":{"line":940,"column":1}},"key":"hyBSrhw4Ey"}],"identifier":"proximal-policy-optimization","label":"Proximal policy optimization","html_id":"proximal-policy-optimization","implicit":true,"enumerator":"9","key":"J57IhCw3c9"},{"type":"paragraph","position":{"start":{"line":942,"column":1},"end":{"line":944,"column":1}},"children":[{"type":"text","value":"We can relax the TRPO optimization problem in a different way:\nRather than imposing a hard constraint on the KL distance,\nwe can instead impose a ","position":{"start":{"line":942,"column":1},"end":{"line":942,"column":1}},"key":"Dbtjnqxzue"},{"type":"emphasis","position":{"start":{"line":942,"column":1},"end":{"line":942,"column":1}},"children":[{"type":"text","value":"soft","position":{"start":{"line":942,"column":1},"end":{"line":942,"column":1}},"key":"CshgFJ37O5"}],"key":"wiB5UPNPw2"},{"type":"text","value":" constraint by incorporating it into the objective and penalizing parameter values that drastically change the trajectory 
distribution.","position":{"start":{"line":942,"column":1},"end":{"line":942,"column":1}},"key":"VF4D9UE1vi"}],"key":"RUyXoXyfJd"},{"type":"math","value":"\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta} \\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\kl{\\rho_{\\theta}}{\\rho_{\\theta^k}}\n\\end{aligned}","position":{"start":{"line":946,"column":1},"end":{"line":950,"column":1}},"html":"θk+1argmaxθEs0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λKL(ρθρθk)\\begin{aligned}\n\\theta^{k+1} &\\gets \\arg\\max_{\\theta} \\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\kl{\\rho_{\\theta}}{\\rho_{\\theta^k}}\n\\end{aligned}θk+1argθmaxEs0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λKL(ρθρθk)","enumerator":"44","key":"LqoEHGQLiX"},{"type":"paragraph","position":{"start":{"line":952,"column":1},"end":{"line":953,"column":1}},"children":[{"type":"text","value":"Here ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"gFDGzqgQog"},{"type":"text","value":"λ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"OrOUFVQ1Uj"},{"type":"text","value":" is a ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"iVTpA6FKNB"},{"type":"strong","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"children":[{"type":"text","value":"regularization hyperparameter","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"o1L23arYSs"}],"key":"ZUWVpPNWqO"},{"type":"text","value":" that controls the tradeoff between the two terms.\nThis is the objective of the ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"dO7Vxj2aEx"},{"type":"strong","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"children":[{"type":"text","value":"proximal policy optimization","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"EcvFACdPKi"}],"key":"Ab8e0eVwJv"},{"type":"text","value":" algorithm ","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"tnSo8RhP9s"},{"type":"cite","kind":"narrative","label":"schulman_proximal_2017","identifier":"schulman_proximal_2017","children":[{"type":"text","value":"Schulman ","key":"L0ViaRx91k"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"pTxRBMtFdo"}],"key":"ZAV7YQfOIq"},{"type":"text","value":" (2017)","key":"pEXRNaX70R"}],"enumerator":"4","key":"ZoslHjgx93"},{"type":"text","value":".","position":{"start":{"line":952,"column":1},"end":{"line":952,"column":1}},"key":"zfXbJuSSDH"}],"key":"Sh2xJfC1O8"},{"type":"paragraph","position":{"start":{"line":955,"column":1},"end":{"line":956,"column":1}},"children":[{"type":"text","value":"Like the original TRPO algorithm ","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"aaICC9ERnw"},{"type":"crossReference","kind":"proof:definition","identifier":"trpo","label":"trpo","children":[{"type":"text","value":"Definition ","key":"A60aEPZmLW"},{"type":"text","value":"4","key":"mYQux6CbBO"}],"template":"Definition %s","enumerator":"4","resolved":true,"html_id":"trpo","key":"mW9l1jG1pk"},{"type":"text","value":",\nPPO is not gradient-based; rather, at each step, we try to maximize local advantage relative to the current 
policy.","position":{"start":{"line":955,"column":1},"end":{"line":955,"column":1}},"key":"YqZjgSige4"}],"key":"YcWkCLXcyi"},{"type":"paragraph","position":{"start":{"line":958,"column":1},"end":{"line":959,"column":1}},"children":[{"type":"text","value":"How do we solve this optimization?\nLet us begin by simplifying the ","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"g2ivHU4ZBM"},{"type":"inlineMath","value":"\\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}}","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"html":"KL(ρπkρπθ)\\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}}KL(ρπkρπθ)","key":"r3rzoP22Qw"},{"type":"text","value":" term. Expanding gives","position":{"start":{"line":958,"column":1},"end":{"line":958,"column":1}},"key":"CA5g6LtfQA"}],"key":"Z6BSvILrdb"},{"type":"math","value":"\\begin{aligned}\n \\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}} & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[\\log \\frac{\\rho_{\\pi^k}(\\tau)}{\\rho_{\\pi_{\\theta}}(\\tau)}\\right] \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{\\pi^k(a_\\hi \\mid s_\\hi)}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] & \\text{state transitions cancel} \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] + c\n\\end{aligned}","position":{"start":{"line":961,"column":1},"end":{"line":967,"column":1}},"html":"KL(ρπkρπθ)=Eτρπk[logρπk(τ)ρπθ(τ)]=Eτρπk[h=0H1logπk(ahsh)πθ(ahsh)]state transitions cancel=Eτρπk[h=0H1log1πθ(ahsh)]+c\\begin{aligned}\n \\kl{\\rho_{\\pi^k}}{\\rho_{\\pi_{\\theta}}} & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[\\log \\frac{\\rho_{\\pi^k}(\\tau)}{\\rho_{\\pi_{\\theta}}(\\tau)}\\right] \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{\\pi^k(a_\\hi \\mid s_\\hi)}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] & \\text{state transitions cancel} \\\\\n & = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right] + c\n\\end{aligned}KL(ρπkρπθ)=Eτρπk[logρπθ(τ)ρπk(τ)]=Eτρπk[h=0H1logπθ(ahsh)πk(ahsh)]=Eτρπk[h=0H1logπθ(ahsh)1]+cstate transitions cancel","enumerator":"45","key":"Fy1EoaXYNb"},{"type":"paragraph","position":{"start":{"line":969,"column":1},"end":{"line":970,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":969,"column":1},"end":{"line":969,"column":1}},"key":"JzB06WEM4a"},{"type":"inlineMath","value":"c","position":{"start":{"line":969,"column":1},"end":{"line":969,"column":1}},"html":"ccc","key":"mOlqCULGQ5"},{"type":"text","value":" is some constant with respect to ","position":{"start":{"line":969,"column":1},"end":{"line":969,"column":1}},"key":"QbiA6H0bbk"},{"type":"text","value":"θ","position":{"start":{"line":969,"column":1},"end":{"line":969,"column":1}},"key":"IBtRAgSFp7"},{"type":"text","value":", and can be ignored.\nThis gives the objective","position":{"start":{"line":969,"column":1},"end":{"line":969,"column":1}},"key":"ytEmCwWBUj"}],"key":"XhCekmt3PJ"},{"type":"math","value":"\\ell^k(\\theta)\n=\n\\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid 
s_\\hi)}\\right]","position":{"start":{"line":972,"column":1},"end":{"line":976,"column":1}},"html":"k(θ)=Es0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λEτρπk[h=0H1log1πθ(ahsh)]\\ell^k(\\theta)\n=\n\\E_{s_0, \\dots, s_{H-1} \\sim \\rho_{\\pi^{k}}} \\left[ \\sum_{\\hi=0}^{\\hor-1} \\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi) \\right] - \\lambda \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\log \\frac{1}{\\pi_{\\theta}(a_\\hi \\mid s_\\hi)}\\right]k(θ)=Es0,,sH1ρπk[h=0H1Eahπθ(sh)Aπk(sh,ah)]λEτρπk[h=0H1logπθ(ahsh)1]","enumerator":"46","key":"hrq0q9yK33"},{"type":"paragraph","position":{"start":{"line":978,"column":1},"end":{"line":982,"column":1}},"children":[{"type":"text","value":"Once again, this takes an expectation over trajectories.\nBut here we cannot directly sample trajectories from ","position":{"start":{"line":978,"column":1},"end":{"line":978,"column":1}},"key":"oI7qa3AKQw"},{"type":"inlineMath","value":"\\pi^k","position":{"start":{"line":978,"column":1},"end":{"line":978,"column":1}},"html":"πk\\pi^kπk","key":"sSpoXzi7aN"},{"type":"text","value":",\nsince in the first term, the actions actually come from ","position":{"start":{"line":978,"column":1},"end":{"line":978,"column":1}},"key":"LoqHIe5w9W"},{"type":"inlineMath","value":"\\pi_\\theta","position":{"start":{"line":978,"column":1},"end":{"line":978,"column":1}},"html":"πθ\\pi_\\thetaπθ","key":"GUwZ2X554r"},{"type":"text","value":".\nTo make this term line up with the other expectation,\nwe would need the actions to also come from ","position":{"start":{"line":978,"column":1},"end":{"line":978,"column":1}},"key":"xAsYMQkZL9"},{"type":"inlineMath","value":"\\pi^k","position":{"start":{"line":978,"column":1},"end":{"line":978,"column":1}},"html":"πk\\pi^kπk","key":"NH85IL5yy8"},{"type":"text","value":".","position":{"start":{"line":978,"column":1},"end":{"line":978,"column":1}},"key":"QOi1rn8MmX"}],"key":"mR8GfFb3Vh"},{"type":"paragraph","position":{"start":{"line":984,"column":1},"end":{"line":986,"column":1}},"children":[{"type":"text","value":"This should sound familiar:\nwe want to estimate an expectation over one distribution by sampling from another.\nWe can once again use ","position":{"start":{"line":984,"column":1},"end":{"line":984,"column":1}},"key":"xq9tOhczMf"},{"type":"crossReference","position":{"start":{"line":984,"column":1},"end":{"line":984,"column":1}},"children":[{"type":"text","value":"Section ","key":"FHUKZkv1SN"},{"type":"text","value":"3.2","key":"j0M6Xt8Dyx"}],"identifier":"importance_sampling","label":"importance_sampling","kind":"heading","template":"Section %s","enumerator":"3.2","resolved":true,"html_id":"importance-sampling","key":"pSxOw7dt2e"},{"type":"text","value":" to rewrite the inner expectation:","position":{"start":{"line":984,"column":1},"end":{"line":984,"column":1}},"key":"Y2DGiGELPa"}],"key":"Ptk3Qjcvjz"},{"type":"math","value":"\\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)\n=\n\\E_{a_\\hi \\sim \\pi^k(s_\\hi)} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)","position":{"start":{"line":988,"column":1},"end":{"line":992,"column":1}},"html":"Eahπθ(sh)Aπk(sh,ah)=Eahπk(sh)πθ(ahsh)πk(ahsh)Aπk(sh,ah)\\E_{a_\\hi \\sim \\pi_{\\theta}(s_\\hi)} A^{\\pi^{k}}(s_\\hi, a_\\hi)\n=\n\\E_{a_\\hi \\sim \\pi^k(s_\\hi)} \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^{k}}(s_\\hi, 
a_\\hi)Eahπθ(sh)Aπk(sh,ah)=Eahπk(sh)πk(ahsh)πθ(ahsh)Aπk(sh,ah)","enumerator":"47","key":"eyfw1I72k7"},{"type":"paragraph","position":{"start":{"line":994,"column":1},"end":{"line":994,"column":1}},"children":[{"type":"text","value":"Now we can combine the expectations together to get the objective","position":{"start":{"line":994,"column":1},"end":{"line":994,"column":1}},"key":"CeAj4HTOCa"}],"key":"uGWhODWMLI"},{"type":"math","value":"\\ell^k(\\theta) = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\left( \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^k}(s_\\hi, a_\\hi) - \\lambda \\log \\frac{1}{\\pi_\\theta(a_\\hi \\mid s_\\hi)} \\right) \\right]","position":{"start":{"line":996,"column":1},"end":{"line":998,"column":1}},"html":"k(θ)=Eτρπk[h=0H1(πθ(ahsh)πk(ahsh)Aπk(sh,ah)λlog1πθ(ahsh))]\\ell^k(\\theta) = \\E_{\\tau \\sim \\rho_{\\pi^k}} \\left[ \\sum_{h=0}^{H-1} \\left( \\frac{\\pi_\\theta(a_\\hi \\mid s_\\hi)}{\\pi^k(a_\\hi \\mid s_\\hi)} A^{\\pi^k}(s_\\hi, a_\\hi) - \\lambda \\log \\frac{1}{\\pi_\\theta(a_\\hi \\mid s_\\hi)} \\right) \\right]k(θ)=Eτρπk[h=0H1(πk(ahsh)πθ(ahsh)Aπk(sh,ah)λlogπθ(ahsh)1)]","enumerator":"48","key":"SXFqVzXYX5"},{"type":"paragraph","position":{"start":{"line":1000,"column":1},"end":{"line":1002,"column":1}},"children":[{"type":"text","value":"Now we can estimate this function by a sample average over trajectories from ","position":{"start":{"line":1000,"column":1},"end":{"line":1000,"column":1}},"key":"asyTUNcMUp"},{"type":"inlineMath","value":"\\pi^k","position":{"start":{"line":1000,"column":1},"end":{"line":1000,"column":1}},"html":"πk\\pi^kπk","key":"z4oidDKrAj"},{"type":"text","value":".\nRemember that to complete a single iteration of PPO,\nwe execute","position":{"start":{"line":1000,"column":1},"end":{"line":1000,"column":1}},"key":"jMX1XLsvMP"}],"key":"LJ5OV0FCnZ"},{"type":"math","value":"\\theta^{k+1} \\gets \\arg\\max_{\\theta} \\ell^k(\\theta).","position":{"start":{"line":1004,"column":1},"end":{"line":1006,"column":1}},"html":"θk+1argmaxθk(θ).\\theta^{k+1} \\gets \\arg\\max_{\\theta} \\ell^k(\\theta).θk+1argθmaxk(θ).","enumerator":"49","key":"La5nYLr2gx"},{"type":"paragraph","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"children":[{"type":"text","value":"If ","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"key":"sLXvHJMzsm"},{"type":"inlineMath","value":"\\ell^k","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"html":"k\\ell^kk","key":"fprbJFjZUh"},{"type":"text","value":" is differentiable, we can optimize it by gradient ascent, completing a single iteration of PPO.","position":{"start":{"line":1008,"column":1},"end":{"line":1008,"column":1}},"key":"cIrcXkXoLp"}],"key":"RsK6Z4iMe4"},{"type":"code","lang":"python","value":"def ppo_pseudocode(\n env,\n π: Callable[[Params], Callable[[State, Action], Float]],\n λ: float,\n θ_init: Params,\n n_iters: int,\n n_fit_trajectories: int,\n n_sample_trajectories: int,\n):\n θ = θ_init\n for k in range(n_iters):\n fit_trajectories = sample_trajectories(env, π(θ), n_fit_trajectories)\n A_hat = fit(fit_trajectories)\n\n sample_trajectories = sample_trajectories(env, π(θ), n_sample_trajectories)\n \n def objective(θ_opt):\n total_objective = 0\n for τ in sample_trajectories:\n for s, a, _r in τ:\n total_objective += π(θ_opt)(s, a) / π(θ)(s, a) * A_hat(s, a) + λ * jnp.log(π(θ_opt)(s, a))\n return total_objective / n_sample_trajectories\n \n θ = optimize(objective, θ)\n\n 
return θ","position":{"start":{"line":1010,"column":1},"end":{"line":1037,"column":1}},"key":"mEOcTkOCum"},{"type":"heading","depth":2,"position":{"start":{"line":1039,"column":1},"end":{"line":1039,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":1039,"column":1},"end":{"line":1039,"column":1}},"key":"ET1CRhySkK"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"10","key":"Y5OaOF7Ghy"},{"type":"paragraph","position":{"start":{"line":1041,"column":1},"end":{"line":1047,"column":1}},"children":[{"type":"text","value":"Policy gradient methods are a powerful family of algorithms that directly optimize the expected total reward by iteratively updating the policy parameters.\nPrecisely,\nwe estimate the gradient of the expected total reward (with respect to the parameters),\nand update the parameters in that direction.\nBut estimating the gradient is a tricky task!\nWe saw many ways to reduce the variance of the gradient estimator,\nculminating in the advantage-based expression ","position":{"start":{"line":1041,"column":1},"end":{"line":1041,"column":1}},"key":"VAE5epdoZa"},{"type":"crossReference","position":{"start":{"line":1041,"column":1},"end":{"line":1041,"column":1}},"children":[{"type":"text","value":"(","key":"ooRyerMiCt"},{"type":"text","value":"29","key":"CawfYJ8MLA"},{"type":"text","value":")","key":"giwCOiJjXU"}],"identifier":"pg_advantage","label":"pg_advantage","kind":"equation","template":"(%s)","enumerator":"29","resolved":true,"html_id":"pg-advantage","key":"SRLYiswHwJ"},{"type":"text","value":".","position":{"start":{"line":1041,"column":1},"end":{"line":1041,"column":1}},"key":"GEi9OPoXYf"}],"key":"nolAMBECjS"},{"type":"paragraph","position":{"start":{"line":1049,"column":1},"end":{"line":1054,"column":1}},"children":[{"type":"text","value":"But updating the parameters doesn’t entirely solve the problem:\nSometimes, a small step in the parameters might lead to a big step in the policy.\nTo avoid changing the policy too much at each step,\nwe must account for the curvature in the parameter space.\nWe first did this explicitly with ","position":{"start":{"line":1049,"column":1},"end":{"line":1049,"column":1}},"key":"PI12sVxmBE"},{"type":"crossReference","position":{"start":{"line":1049,"column":1},"end":{"line":1049,"column":1}},"children":[{"type":"text","value":"Definition ","key":"U9yPfa2h5e"},{"type":"text","value":"4","key":"Qy6P9fp2YW"}],"identifier":"trpo","label":"trpo","kind":"proof:definition","template":"Definition %s","enumerator":"4","resolved":true,"html_id":"trpo","key":"pZHPe0IgVa"},{"type":"text","value":",\nand then saw ways to relax the constraint in ","position":{"start":{"line":1049,"column":1},"end":{"line":1049,"column":1}},"key":"dm7a0VrrDR"},{"type":"crossReference","position":{"start":{"line":1049,"column":1},"end":{"line":1049,"column":1}},"children":[{"type":"text","value":"Definition ","key":"gGbZQMeFeS"},{"type":"text","value":"6","key":"gxcIeIBKC9"}],"identifier":"npg","label":"npg","kind":"proof:definition","template":"Definition %s","enumerator":"6","resolved":true,"html_id":"npg","key":"TnvFlHijai"},{"type":"text","value":" and ","position":{"start":{"line":1049,"column":1},"end":{"line":1049,"column":1}},"key":"buAo2M2CeJ"},{"type":"crossReference","position":{"start":{"line":1049,"column":1},"end":{"line":1049,"column":1}},"children":[{"type":"text","value":"Section 
","key":"XYUW6FcA1G"},{"type":"text","value":"9","key":"suVVFgNxVw"}],"identifier":"proximal-policy-optimization","label":"proximal-policy-optimization","kind":"heading","template":"Section %s","enumerator":"9","resolved":true,"html_id":"proximal-policy-optimization","key":"XvWJxsqaYu"},{"type":"text","value":".","position":{"start":{"line":1049,"column":1},"end":{"line":1049,"column":1}},"key":"yDI3OTGRXx"}],"key":"cTYNaAG8Ss"},{"type":"paragraph","position":{"start":{"line":1056,"column":1},"end":{"line":1057,"column":1}},"children":[{"type":"text","value":"These are still popular methods to this day,\nespecially because they efficiently integrate with ","position":{"start":{"line":1056,"column":1},"end":{"line":1056,"column":1}},"key":"PwfplClin0"},{"type":"emphasis","position":{"start":{"line":1056,"column":1},"end":{"line":1056,"column":1}},"children":[{"type":"text","value":"deep neural networks","position":{"start":{"line":1056,"column":1},"end":{"line":1056,"column":1}},"key":"uXQT5XuWKx"}],"key":"nDFr6NQGzu"},{"type":"text","value":" for representing complex functions.","position":{"start":{"line":1056,"column":1},"end":{"line":1056,"column":1}},"key":"RvzzSwOpaq"}],"key":"frStB9LQLa"}],"key":"oeJsJapEUU"}],"key":"WkB941QGYa"},"references":{"cite":{"order":["baydin_automatic_2018","williams_simple_1992","boyd_convex_2004","schulman_proximal_2017"],"data":{"baydin_automatic_2018":{"label":"baydin_automatic_2018","enumerator":"1","doi":"10.48550/arXiv.1502.05767","html":"Baydin, A. G., Pearlmutter, B. A., Radul, A. A., & Siskind, J. M. (2018). Automatic Differentiation in Machine Learning: A Survey. arXiv. 10.48550/arXiv.1502.05767","url":"https://doi.org/10.48550/arXiv.1502.05767"},"williams_simple_1992":{"label":"williams_simple_1992","enumerator":"2","doi":"10.1007/BF00992696","html":"Williams, R. J. (1992). Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning. Machine Learning, 8(3), 229–256. 10.1007/BF00992696","url":"https://doi.org/10.1007/BF00992696"},"boyd_convex_2004":{"label":"boyd_convex_2004","enumerator":"3","html":"Boyd, S., & Vandenberghe, L. (2004). Convex Optimization. Cambridge University Press."},"schulman_proximal_2017":{"label":"schulman_proximal_2017","enumerator":"4","doi":"10.48550/arXiv.1707.06347","html":"Schulman, J., Wolski, F., Dhariwal, P., Radford, A., & Klimov, O. (2017). Proximal Policy Optimization Algorithms. arXiv. 10.48550/arXiv.1707.06347","url":"https://doi.org/10.48550/arXiv.1707.06347"}}}},"footer":{"navigation":{"prev":{"title":"5 Fitted Dynamic Programming Algorithms","url":"/fitted-dp","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"7 Imitation Learning","url":"/imitation-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/planning.html b/planning.html index a85f0cf..dfd6719 100644 --- a/planning.html +++ b/planning.html @@ -1,4 +1,4 @@ -8 Tree Search Methods - CS/STAT 184: Introduction to Reinforcement Learning

8 Tree Search Methods

8.1 Introduction

Have you ever lost a strategy game against a skilled opponent? It probably seemed like they were ahead of you at every turn. They might have been planning ahead and anticipating your actions, then planning around them in order to win. If this opponent was a computer, they might have been using one of the strategies that we are about to explore.

8.2 Deterministic, zero sum, fully observable two-player games

In this chapter, we will focus on games that are:

  • deterministic,
  • zero sum (one player wins and the other loses),
  • fully observable, that is, the state of the game is perfectly known by both players,
  • for two players that alternate turns.

We can represent such a game as a complete game tree. Each possible state is a node in the tree, and since we only consider deterministic games, we can represent actions as edges leading from the current state to the next. Each path through the tree, from root to leaf, represents a single game.

Figure 8.1: The first two layers of the complete game tree of tic-tac-toe. From Wikimedia.

If you could store the complete game tree on a computer, you would be able to win every potentially winnable game by searching all paths from your current state and taking a winning move. We will see an explicit algorithm for this in the next section. However, as games become more complex, it becomes computationally impossible to search every possible path.

8.2.1 Notation

Let us now describe these games formally. We’ll call the first player Max and the second player Min. Max seeks to maximize the final game score, while Min seeks to minimize the final game score.

  • We’ll use $\mathcal{S}$ to denote the set of all possible game states.
  • The game begins in some initial state $s_0 \in \mathcal{S}$.
  • Max moves on even turn numbers $h = 2n$, and Min moves on odd turn numbers $h = 2n+1$, where $n$ is a natural number.

  • The space of possible actions, $\mathcal{A}_h(s)$, depends on the state itself, as well as whose turn it is. (For example, in tic-tac-toe, Max can only play Xs while Min can only play Os.)
  • The game ends after $H$ total moves (which might be even or odd). We call the final state a terminal state.
  • $P$ denotes the state transitions, that is, $P(s, a)$ denotes the resulting state when taking action $a \in \mathcal{A}(s)$ in state $s$. We’ll assume that this function is time-homogeneous (a.k.a. stationary) and doesn’t change across timesteps.
  • $r(s)$ denotes the game score of the terminal state $s$. Note that this is some positive or negative value seen by both players: A positive value indicates Max winning, a negative value indicates Min winning, and a value of 0 indicates a tie.
  • We also call the sequence of states and actions a trajectory.
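To make this concrete, here is a minimal sketch of the interface such a game environment might expose in code. This interface is hypothetical (it is not part of the original notes), but the search code later in this chapter calls `env.is_terminal`, `env.winner`, and `env.step` in exactly this spirit; the `action_space` accessor is an extra assumption.

    from typing import Protocol, Sequence

    class TwoPlayerGame(Protocol):
        """Deterministic, fully observable, zero-sum two-player game (hypothetical interface)."""

        def is_terminal(self, s) -> bool:
            """Whether s is a terminal state (the game has ended)."""

        def winner(self, s) -> float:
            """Game score r(s) of a terminal state: positive if Max wins, negative if Min wins, 0 for a tie."""

        def step(self, s, a):
            """State transition P(s, a): the state reached by taking action a in state s."""

        def action_space(self, s) -> Sequence:
            """Legal actions A_h(s) available in state s (assumed accessor, not shown in the notes)."""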

    Let us frame tic-tac-toe in this setting.

    How would you describe this?

    • Each of the 9 squares is either empty, marked X, or marked O.

    Our notation may remind you of Markov decision processes. Given that these games also involve a sequence of states and actions, can we formulate them as finite-horizon MDPs? The two settings are not exactly analogous.

    Above, we claimed that we could win any potentially winnable game by looking ahead and predicting the opponent’s actions. This would mean that each nonterminal state already has some predetermined game score, that is, in each state, it is already “obvious” which player is going to win.

    Let $V^\star_h(s)$ denote the game score under optimal play from both players starting in state $s$ at time $h$.

    We can compute this by starting at the terminal states, when the game’s outcome is known, and working backwards, assuming that Max chooses the action that leads to the highest score and Min chooses the action that leads to the lowest score.

    This translates directly into a recursive depth-first search algorithm for searching the complete game tree.
    def minimax_search(s, player) -> Tuple["Action", "Value"]:
        """Return the value of the state (for Max) and the best action for Max to take."""
        if env.is_terminal(s):
            return None, env.winner(s)
        if player is max:
            a_max, v_max = None, -float("inf")  # best action and value found for Max so far
            for a in env.action_space(s):  # legal actions in s; accessor name assumed
                _, v = minimax_search(env.step(s, a), min)
                if v > v_max:
                    a_max, v_max = a, v
            return a_max, v_max
        else:
            a_min, v_min = None, float("inf")  # lowest value found for Min so far
            for a in env.action_space(s):
                _, v = minimax_search(env.step(s, a), max)
                if v < v_min:
                    a_min, v_min = a, v
            return a_min, v_min
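As a usage sketch (hypothetical driver code, assuming an `env` like the interface above and the built-ins `max` and `min` used as player tokens, which is how the function is written):

    # s0 is the initial state and Max moves first.
    best_action, value = minimax_search(s0, max)
    print("Best opening move:", best_action, "with game score under optimal play:", value)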

    We can fill in the values of these nodes accordingly:

    Thus, Max’s best move is to take action C, resulting in a game score of $\max(-2, -3, -1) = -1$.

    At each of the $H$ timesteps, this algorithm iterates through the entire action space at that state, and therefore has a time complexity of $n_A^{H}$ (where $n_A$ is the largest number of actions possibly available at once), consistent with the earlier chess estimate of roughly $30^{80}$ operations.

    Suppose Max is in state $s$ and is considering whether to take action $a$ or $a'$. If at any point they find out that action $a'$ is definitely worse than (or equal to) action $a$, they don’t need to evaluate action $a'$ any further.

    Concretely, we run min-max search as above, except now we keep track of two additional parameters $\alpha(s)$ and $\beta(s)$ while evaluating each state:

    • Starting in state $s$, Max can achieve a game score of at least $\alpha(s)$ assuming Min plays optimally. That is, $V^\star_h(s) \ge \alpha(s)$ at all points.
    • Analogously, starting in state $s$, Min can ensure a game score of at most $\beta(s)$ assuming Max plays optimally. That is, $V^\star_h(s) \le \beta(s)$ at all points.

    Suppose we are evaluating $V^\star_h(s)$, where it is Max’s turn ($h$ is even). We update $\alpha(s)$ to be the highest minimax value achievable from $s$ so far. That is, the value of $s$ is at least $\alpha(s)$. Suppose Max chooses action $a$, which leads to state $s'$, in which it is Min’s turn. If any of Min’s actions in $s'$ achieve a value $V^\star_{h+1}(s') \le \alpha(s)$, then Max does no better by choosing $a$ than by sticking with the option already found, so we can stop evaluating action $a$.

    The same reasoning applies with the roles reversed: suppose it is Min’s turn and Min is considering an action $a$, which leads to state $s'$ for Max. If Max has any actions that do better than $\beta(s)$, they would take it, making action $a$ a suboptimal choice for Min.

    def alpha_beta_search(s, player, alpha, beta) -> Tuple["Action", "Value"]:
         """Return the value of the state (for Max) and the best action for Max to take."""
         if env.is_terminal(s):
             return None, env.winner(s)
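The remainder of the function follows the same recursive structure as `minimax_search`, with the extra bookkeeping described above. Here is a minimal sketch of that recursion, assuming the same `env` interface (including the hypothetical `env.action_space(s)` accessor); the original implementation may differ in details such as initialization:

    def alpha_beta_sketch(s, player, alpha, beta) -> Tuple["Action", "Value"]:
        """Same contract as minimax_search, but prunes branches using alpha and beta (sketch)."""
        if env.is_terminal(s):
            return None, env.winner(s)
        if player is max:
            a_max, v_max = None, -float("inf")
            for a in env.action_space(s):
                _, v = alpha_beta_sketch(env.step(s, a), min, alpha, beta)
                if v > v_max:
                    a_max, v_max = a, v
                if v_max > alpha:
                    alpha = v_max            # best score Max can guarantee so far
                if v_max >= beta:            # Min would never let play reach this branch
                    break
            return a_max, v_max
        else:
            a_min, v_min = None, float("inf")
            for a in env.action_space(s):
                _, v = alpha_beta_sketch(env.step(s, a), max, alpha, beta)
                if v < v_min:
                    a_min, v_min = a, v
                if v_min < beta:
                    beta = v_min             # best score Min can guarantee so far
                if v_min <= alpha:           # Max would never let play reach this branch
                    break
            return a_min, v_min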
     In Monte Carlo Tree Search (MCTS), we can view the choice of action at a given state as a multi-armed bandit problem, where each action corresponds to an arm, and the reward distribution of arm $k$ is the distribution of the game score over random games after choosing that arm. The most commonly used bandit algorithm in practice for MCTS is the Upper Confidence Bound (UCB) algorithm.
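As a sketch of the selection rule (the exact expression is equation (8.5) in the text; the form below is the common UCB1-style score, with the exploration constant `c` left as a free choice):

    import math

    def ucb_score(W_sa: float, N_sa: int, N_s: int, c: float = math.sqrt(2)) -> float:
        """Mean game score from (s, a) plus an exploration bonus; a common UCB form, constants are a choice."""
        if N_sa == 0:
            return float("inf")  # always try untried actions first
        return W_sa / N_sa + c * math.sqrt(math.log(N_s) / N_sa)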

  • Append $(s, a)$ to $\tau$
  • Set $s \gets P(s, a)$
  • Expansion: Let $s_\text{new}$ denote the final state in $\tau$ (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from $s_\text{new}$. Call it $a_\text{new}$. Add it to $\tau$.
  • Simulation: Let $s_\text{next} = P(s_\text{new}, a_\text{new})$. Evaluate $r = v(s_\text{next})$. This approximates the value of the game after taking the action $a_\text{new}$.
  • Backup: For each $(s, a) \in \tau$:
    • $N^{s, a} \gets N^{s, a} + 1$
    • $W^{s, a} \gets W^{s, a} + r$
    • $N^s \gets N^s + 1$
  • We finally return the action with the highest UCB value (8.5). Then play continues. As before, we can reuse the tree across timesteps. (A sketch of one such iteration appears below.)
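Putting these steps together, one iteration might look roughly like the following sketch. It assumes the counters $N^{s,a}$, $W^{s,a}$, and $N^s$ are stored in `defaultdict` maps, `tried_actions` maps each state to the set of actions already expanded from it, `v` is the value estimate used in the simulation step, and `ucb_score` is as sketched above; the original notes’ version may differ in details.

    from collections import defaultdict

    # Assumed initialization:
    # N, W, N_s = defaultdict(int), defaultdict(float), defaultdict(int)
    # tried_actions = defaultdict(set)

    def mcts_iteration(s0, N, W, N_s, tried_actions, v):
        """One selection / expansion / simulation / backup pass from the root s0 (sketch)."""
        tau, s = [], s0
        # Selection: while every action at s has been tried, descend via the highest UCB score.
        while not env.is_terminal(s) and len(tried_actions[s]) == len(env.action_space(s)):
            a = max(env.action_space(s), key=lambda act: ucb_score(W[s, act], N[s, act], N_s[s]))
            tau.append((s, a))
            s = env.step(s, a)
        # Expansion: take one action that has not been tried from the final state on the path.
        if not env.is_terminal(s):
            a_new = next(act for act in env.action_space(s) if act not in tried_actions[s])
            tried_actions[s].add(a_new)
            tau.append((s, a_new))
            s = env.step(s, a_new)
        # Simulation: approximate the value of the resulting state.
        r = v(s)
        # Backup: update the statistics of every (state, action) pair along the path.
        for (s_i, a_i) in tau:
            N[s_i, a_i] += 1
            W[s_i, a_i] += r
            N_s[s_i] += 1
        return r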

    How do we actually compute a useful $\pi_\text{guide}$ and $v$? If we have some existing dataset of trajectories, we could use supervised learning (that is, imitation learning) to generate a policy $\pi_\text{guide}$ via behavioral cloning and learn $v$ by regressing the game outcomes onto states. Then, we can plug these into the above algorithm.

    For a given policy $\pi^0$, we can use it to guide MCTS, resulting in an algorithm that is itself a policy $\pi^0_\text{MCTS}$ that maps from states to actions. Now, we can use behavioral cloning to obtain a new policy $\pi^1$ that imitates $\pi^0_\text{MCTS}$. We can now use $\pi^1$ to guide MCTS, and repeat.

    This algorithm was brought to fame by AlphaGo Zero (Silver et al., 2017).
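A compressed sketch of this self-improvement loop; the helper functions are passed in as parameters here, since their exact form is not pinned down in this excerpt:

    def policy_iteration_via_mcts(pi0, guide_mcts, behavioral_cloning, n_rounds: int):
        """Repeatedly distill the MCTS-guided policy back into a standalone policy (sketch)."""
        pi = pi0
        for _ in range(n_rounds):
            pi_mcts = guide_mcts(pi)          # pi^k_MCTS: run MCTS guided by the current policy pi^k
            pi = behavioral_cloning(pi_mcts)  # pi^{k+1}: imitate pi^k_MCTS (e.g., from self-play games)
        return pi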

    8.6 Summary

    In this chapter, @@ -343,9 +340,9 @@ namely shogi and chess, also learning from scratch. In MuZero Schrittwieser et al. (2020), -this was further extended by learning a model of the game dynamics.

    References
    1. Silver, D., Huang, A., Maddison, C. J., Guez, A., Sifre, L., van den Driessche, G., Schrittwieser, J., Antonoglou, I., Panneershelvam, V., Lanctot, M., Dieleman, S., Grewe, D., Nham, J., Kalchbrenner, N., Sutskever, I., Lillicrap, T., Leach, M., Kavukcuoglu, K., Graepel, T., & Hassabis, D. (2016). Mastering the Game of Go with Deep Neural Networks and Tree Search. Nature, 529(7587), 484–489. 10.1038/nature16961
    2. Silver, D., Schrittwieser, J., Simonyan, K., Antonoglou, I., Huang, A., Guez, A., Hubert, T., Baker, L., Lai, M., Bolton, A., Chen, Y., Lillicrap, T., Hui, F., Sifre, L., van den Driessche, G., Graepel, T., & Hassabis, D. (2017). Mastering the Game of Go without Human Knowledge. Nature, 550(7676), 354–359. 10.1038/nature24270
    3. Russell, S. J., & Norvig, P. (2021). Artificial Intelligence: A Modern Approach (Fourth edition). Pearson.
    4. Silver, D., Hubert, T., Schrittwieser, J., Antonoglou, I., Lai, M., Guez, A., Lanctot, M., Sifre, L., Kumaran, D., Graepel, T., Lillicrap, T., Simonyan, K., & Hassabis, D. (2018). A General Reinforcement Learning Algorithm That Masters Chess, Shogi, and Go through Self-Play. Science, 362(6419), 1140–1144. 10.1126/science.aar6404
    5. Schrittwieser, J., Antonoglou, I., Hubert, T., Simonyan, K., Sifre, L., Schmitt, S., Guez, A., Lockhart, E., Hassabis, D., Graepel, T., Lillicrap, T., & Silver, D. (2020). Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model. Nature, 588(7839), 604–609. 10.1038/s41586-020-03051-4
    \ No newline at end of file diff --git a/planning.json b/planning.json index 567da99..2d5acfd 100644 --- a/planning.json +++ b/planning.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"5ad6f72255f948ee283927b483938dbb9b2b372614850f669c0034ff5fc30bdc","slug":"planning","location":"/planning.md","dependencies":[],"frontmatter":{"title":"8 Tree Search Methods","numbering":{"all":{"enabled":true},"enumerator":{"template":"8.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.png","thumbnailOptimized":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.webp","exports":[{"format":"md","filename":"planning.md","url":"/build/planning-7b5ef62df9036b73ec5f6119008db1f7.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"DrkHKuAHY8"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"8.1","key":"apsgGDM72h"},{"type":"paragraph","position":{"start":{"line":22,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"Have you ever lost a strategy game against a skilled opponent?\nIt probably seemed like they were ahead of you at every turn.\nThey might have been ","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"Ay8wqXguob"},{"type":"emphasis","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"planning ahead","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"PjdA72JHwb"}],"key":"PZKIceU8eM"},{"type":"text","value":" and anticipating your actions,\nthen planning around them in order to win.\nIf this opponent was a computer,\nthey might have been using one of the strategies that we are about to explore.","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"bITX0yZkRO"}],"key":"dVIa4jmYBt"},{"type":"heading","depth":2,"position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Deterministic, zero sum, fully observable two-player games","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"MJaTKvUkeM"}],"identifier":"deterministic-zero-sum-fully-observable-two-player-games","label":"Deterministic, zero sum, fully observable two-player games","html_id":"deterministic-zero-sum-fully-observable-two-player-games","implicit":true,"enumerator":"8.2","key":"PvSFKBkGLh"},{"type":"paragraph","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"In this chapter, we will focus on games 
that are:","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"O25kI2JDLV"}],"key":"mZ0ZAU8d1w"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":33,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"deterministic,","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"kyOxBmRMZp"}],"key":"zJ9Xj8YvWK"}],"key":"lPm3G8u1Co"},{"type":"listItem","spread":true,"position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"zero sum","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"LyKwJszgQo"}],"key":"alb4xC0n0L"},{"type":"text","value":" (one player wins and the other loses),","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"fLCSX8t4y3"}],"key":"oH6Y3aZEOB"},{"type":"listItem","spread":true,"position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"text","value":"fully observable,","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"vaxIz1POPY"}],"key":"HQYQr6k8iR"},{"type":"text","value":" that is, the state of the game is perfectly known by both players,","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"if6NU6NNps"}],"key":"KLF00wLBkD"},{"type":"listItem","spread":true,"position":{"start":{"line":36,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"text","value":"for ","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"J8n4yVTFDE"},{"type":"emphasis","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"children":[{"type":"text","value":"two players","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"qmP4Q3YfoY"}],"key":"gFWKgREkEH"},{"type":"text","value":" that alternate turns,","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"WX1CAk5Pc5"}],"key":"ugckeA2Wzn"}],"key":"FRLrPFSaqM"},{"type":"paragraph","position":{"start":{"line":38,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"We can represent such a game as a ","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"LKJ4ofAoGh"},{"type":"emphasis","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"children":[{"type":"text","value":"complete game tree.","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"PEztQCnVaF"}],"key":"Nmgc7VOzAx"},{"type":"text","value":"\nEach possible state is a node in the tree,\nand since we only consider deterministic games,\nwe can represent actions as edges leading from the current state to the next.\nEach path through the tree, from root to leaf, represents a single game.","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"TGxE1v7IHd"}],"key":"S99pzBe2HH"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.png","alt":"The first two layers of the complete game 
tree of tic-tac-toe.\nFrom Wikimedia.","align":"center","data":{"altTextIsAutoGenerated":true},"key":"EoJ42K2qhM","urlSource":"shared/tic_tac_toe.png","urlOptimized":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":47,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"The first two layers of the complete game tree of tic-tac-toe.\nFrom Wikimedia.","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"tDkSkr6iMk"}],"key":"f3enJzbyAh"}],"key":"frIAstDnxG"}],"enumerator":"8.1","key":"GpptOhTzQw"},{"type":"paragraph","position":{"start":{"line":51,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"If you could store the complete game tree on a computer,\nyou would be able to win every potentially winnable game\nby searching all paths from your current state and taking a winning move.\nWe will see an explicit algorithm for this in ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"drYDTWzPFJ"},{"type":"crossReference","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"the next section","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"DXqVVq9K5D"}],"identifier":"min-max-search","label":"min-max-search","kind":"heading","template":"Section %s","enumerator":"8.3","resolved":true,"html_id":"min-max-search","key":"OOGviuGNB4"},{"type":"text","value":".\nHowever, as games become more complex,\nit becomes computationally impossible to search every possible path.","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"WtRb2MqvFZ"}],"key":"fugg1lbAh2"},{"type":"paragraph","position":{"start":{"line":58,"column":1},"end":{"line":66,"column":1}},"children":[{"type":"text","value":"For instance,\na chess player has roughly 30 actions to choose from at each turn,\nand each game takes roughly 40 moves per player,\nso trying to solve chess exactly using minimax\nwould take somewhere on the order of ","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"key":"DzhRYZ3jr9"},{"type":"inlineMath","value":"30^{80} \\approx 10^{118}","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"html":"30801011830^{80} \\approx 10^{118}308010118","key":"QKNswIhj26"},{"type":"text","value":" operations.\nThat’s 10 billion billion billion billion billion billion billion billion billion billion billion billion billion operations.\nAs of the time of writing,\nthe fastest processor can achieve almost 10 GHz (10 billion operations per second),\nso to fully solve chess using minimax is many, many orders of magnitude out of reach.","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"key":"U7nn48FZvn"}],"key":"ruebTJR6uV"},{"type":"paragraph","position":{"start":{"line":68,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"text","value":"It is thus intractable, in any realistic setting, to solve the complete game tree exactly.\nLuckily, only a small fraction of those games ever occur in reality;\nLater in this chapter,\nwe will explore ways to ","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"hilWIQygtl"},{"type":"emphasis","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"prune 
away","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"EoJavf2Zld"}],"key":"SQrArEGrVR"},{"type":"text","value":" parts of the tree that we know we can safely ignore.\nWe can also ","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"TwcMU0zU7I"},{"type":"emphasis","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"irobc49ft9"}],"key":"pChlnGtgUr"},{"type":"text","value":" the value of a state without fully evaluating it.\nUsing these approximations, we can no longer ","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"LDdjtUgDYn"},{"type":"emphasis","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"guarantee","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"kC94G5LzvZ"}],"key":"jOv4h5uc9A"},{"type":"text","value":" winning the game,\nbut we can come up with strategies that will do well against most opponents.","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"CQYq4DoWNp"}],"key":"axs5wzDAZz"},{"type":"heading","depth":3,"position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"Notation","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"weUAxLITcs"}],"identifier":"notation","label":"Notation","html_id":"notation","implicit":true,"enumerator":"8.2.1","key":"lp6rZ36wxe"},{"type":"paragraph","position":{"start":{"line":78,"column":1},"end":{"line":81,"column":1}},"children":[{"type":"text","value":"Let us now describe these games formally.\nWe’ll call the first player Max and the second player Min.\nMax seeks to maximize the final game score,\nwhile Min seeks to minimize the final game score.","position":{"start":{"line":78,"column":1},"end":{"line":78,"column":1}},"key":"YlQTIKptii"}],"key":"oJ8UuJWAfc"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":83,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"We’ll use ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"AI07z0P4Jn"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"S\\mathcal{S}S","key":"blzGJgwBQ8"},{"type":"text","value":" to denote the set of all possible game states.","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"o70EmLH50v"}],"key":"cWzyYogPOM"},{"type":"listItem","spread":true,"position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"text","value":"The game begins in some ","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"NIjvHwReSj"},{"type":"strong","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"text","value":"initial state","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"y2JK36JzDL"}],"key":"bLUqCyJ2oM"},{"type":"text","value":" ","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"R7pxUDjoyx"},{"type":"inlineMath","value":"s_0 \\in 
\\mathcal{S}","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"html":"s0Ss_0 \\in \\mathcal{S}s0S","key":"NsTnx4azAn"},{"type":"text","value":".","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"bI5Bea2nZB"}],"key":"XmJgrng4wg"},{"type":"listItem","spread":true,"position":{"start":{"line":85,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"text","value":"Max moves on even turn numbers ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"NY4BlPwuOz"},{"type":"inlineMath","value":"h = 2n","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"h=2nh = 2nh=2n","key":"itICrhln0s"},{"type":"text","value":",\nand Min moves on odd turn numbers ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"YPAgXmOi1D"},{"type":"inlineMath","value":"h = 2n+1","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"h=2n+1h = 2n+1h=2n+1","key":"LjqmZs3kqn"},{"type":"text","value":",\nwhere ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"lmB7OfiAPc"},{"type":"inlineMath","value":"n","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"nnn","key":"ACacTvPaVj"},{"type":"text","value":" is a natural number.","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"A1fLaHMrPn"}],"key":"j1044p4aTV"},{"type":"listItem","spread":true,"position":{"start":{"line":88,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"The space of possible actions, ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"vXCWmOqTWT"},{"type":"inlineMath","value":"\\mathcal{A}_h(s)","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"Ah(s)\\mathcal{A}_h(s)Ah(s)","key":"PAEYH8cPgp"},{"type":"text","value":",\ndepends on the state itself, as well as whose turn it is.\n(For example, in tic-tac-toe, Max can only play ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"aneXq0R5Yc"},{"type":"inlineCode","value":"X","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"OUG7Re7dN9"},{"type":"text","value":"s while Min can only play ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"ZoHuSAyXZv"},{"type":"inlineCode","value":"O","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"m9lnpJa3fa"},{"type":"text","value":"s.)","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"HVGvHq3e87"}],"key":"CHSodUY5b6"},{"type":"listItem","spread":true,"position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"children":[{"type":"text","value":"The game ends after ","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"Et23gUckuq"},{"type":"inlineMath","value":"H","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"html":"HHH","key":"G7YUeJkXn1"},{"type":"text","value":" total moves (which might be even or odd). 
We call the final state a ","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"zYoRbHos0v"},{"type":"strong","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"children":[{"type":"text","value":"terminal state","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"dEvFGr7nyv"}],"key":"goKYDaGrf1"},{"type":"text","value":".","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"FWwtyAy3Oj"}],"key":"jA3lSg9avS"},{"type":"listItem","spread":true,"position":{"start":{"line":92,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"inlineMath","value":"P","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"PPP","key":"TN3Cfm5LKh"},{"type":"text","value":" denotes the ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"j0WyMVvqXz"},{"type":"strong","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"wsV2PkqtVu"}],"key":"g0QSrJQizh"},{"type":"text","value":", that is,\n","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"vpHT9MF6Lc"},{"type":"inlineMath","value":"P(s, a)","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"P(s,a)P(s, a)P(s,a)","key":"HsdI1vuIwg"},{"type":"text","value":" denotes the resulting state when taking action ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"c0eC2yShtE"},{"type":"inlineMath","value":"a \\in \\mathcal{A}(s)","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"aA(s)a \\in \\mathcal{A}(s)aA(s)","key":"xr5RP7rBJM"},{"type":"text","value":" in state ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"LERX6fvQFw"},{"type":"inlineMath","value":"s","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"sss","key":"T6OrRNZLfM"},{"type":"text","value":".","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"MtBHcJsT6F"}],"key":"AdCdfeSx4P"},{"type":"listItem","spread":true,"position":{"start":{"line":94,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"inlineMath","value":"r(s)","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"r(s)r(s)r(s)","key":"PoKGO3jHql"},{"type":"text","value":" denotes the ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"K7ajyVlXSY"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"game score","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"Dwpc3VkwvH"}],"key":"LButVLC7Oe"},{"type":"text","value":" of the terminal state ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"UwPNC3IwLy"},{"type":"inlineMath","value":"s","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"sss","key":"rE5wVUDafO"},{"type":"text","value":".\nNote that this is some positive or negative value seen by both players:\nA positive value indicates Max winning, a negative value indicates Min winning, and a value of 
","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"Gn3xjyBwyN"},{"type":"text","value":"0","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"CCDaM9JBoa"},{"type":"text","value":" indicates a tie.","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"EtQSewOEXG"}],"key":"XyaBRCmFB9"}],"key":"qpxXZdr8hD"},{"type":"paragraph","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"children":[{"type":"text","value":"We also call the sequence of states and actions a ","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"key":"BY9efIvE3r"},{"type":"strong","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"children":[{"type":"text","value":"trajectory","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"key":"Y0aEtVF7ZG"}],"key":"C5wwpLKfXz"},{"type":"text","value":".","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"key":"h50SRkrSLp"}],"key":"I5y8MpOydg"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"fVQUBYTLqk"}],"key":"UJLA2OdFND"},{"type":"paragraph","position":{"start":{"line":101,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"Above, we suppose that the game ends after ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"zz0tjf1gLT"},{"type":"inlineMath","value":"H","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"HHH","key":"jdRsyjhQ3g"},{"type":"text","value":" total moves.\nBut most real games have a ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"gviYROzbgT"},{"type":"emphasis","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"text","value":"variable","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"D4av2vma4H"}],"key":"dShqpD7Gin"},{"type":"text","value":" length.\nHow would you describe this?","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"wXgbyzrFUS"}],"key":"Q3X5nnFfaR"}],"key":"LRSCimHIA1"},{"type":"paragraph","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"children":[{"type":"text","value":"Let us frame tic-tac-toe in this setting.","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"rD7VRdRqTp"}],"key":"NbnROuuLrf"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":108,"column":1},"end":{"line":117,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":108,"column":1},"end":{"line":110,"column":1}},"children":[{"type":"text","value":"Each of the ","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"F293XGI7sH"},{"type":"text","value":"9","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"NH4IQi2Yvq"},{"type":"text","value":" squares is either empty, marked X, or marked O.\nSo there are ","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"VOgLKYOYEW"},{"type":"inlineMath","value":"|\\mathcal{S}| = 3^9","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"html":"S=39|\\mathcal{S}| = 3^9S=39","key":"M9tSOGbAKz"},{"type":"text","value":" potential states.\nNot all of these may be 
reachable!","position":{"start":{"line":108,"column":1},"end":{"line":108,"column":1}},"key":"qXv4vW8i3L"}],"key":"djtCr97X6V"},{"type":"listItem","spread":true,"position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"children":[{"type":"text","value":"The initial state ","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"yoCqcY14uD"},{"type":"inlineMath","value":"s_0","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"html":"s0s_0s0","key":"tqMEeU3bio"},{"type":"text","value":" is the empty board.","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"Fahf7pECTB"}],"key":"I0cPTOoyUH"},{"type":"listItem","spread":true,"position":{"start":{"line":112,"column":1},"end":{"line":113,"column":1}},"children":[{"type":"text","value":"The set of possible actions for Max in state ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"hLRQRQ8Ccv"},{"type":"inlineMath","value":"s","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"sss","key":"pzfBaA8lcw"},{"type":"text","value":", ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"LYchOdmdVP"},{"type":"inlineMath","value":"\\mathcal{A}_{2n}(s)","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"A2n(s)\\mathcal{A}_{2n}(s)A2n(s)","key":"pBj8pJ7Fxi"},{"type":"text","value":", is the set of tuples ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"l5tu3MDvTr"},{"type":"inlineMath","value":"(\\text{``X''}, i)","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"(“X”,i)(\\text{``X''}, i)(“X”,i)","key":"jfvo3PvgQS"},{"type":"text","value":" where ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"KkK72rBL4D"},{"type":"inlineMath","value":"i","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"iii","key":"VHcv5jwj8G"},{"type":"text","value":" refers to an empty square in ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"yrYczAY4E8"},{"type":"inlineMath","value":"s","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"sss","key":"YtFuD1m3uJ"},{"type":"text","value":".\nSimilarly, ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"iNJ2v3JT8d"},{"type":"inlineMath","value":"\\mathcal{A}_{2n+1}(s)","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"A2n+1(s)\\mathcal{A}_{2n+1}(s)A2n+1(s)","key":"r6qULkIlrj"},{"type":"text","value":" is the set of tuples ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"GKu1YZ8CUP"},{"type":"inlineMath","value":"(\\text{``O''}, i)","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"(“O”,i)(\\text{``O''}, i)(“O”,i)","key":"pZuNZ529gj"},{"type":"text","value":" where ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"kO8RqH4Wf3"},{"type":"inlineMath","value":"i","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"iii","key":"JjxkvBNtIt"},{"type":"text","value":" refers to an empty square in 
","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"y8bVxYiGe9"},{"type":"inlineMath","value":"s","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"sss","key":"XFEWYbQ5fC"},{"type":"text","value":".","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"g3fi7TEnTq"}],"key":"rLsSAdyoLO"},{"type":"listItem","spread":true,"position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"children":[{"type":"text","value":"We can take ","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"InyjeEVoQQ"},{"type":"inlineMath","value":"H = 9","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"html":"H=9H = 9H=9","key":"FTgswr3Q6X"},{"type":"text","value":" as the longest possible game length.","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"zUUFuobJdF"}],"key":"Y32ccvnUMr"},{"type":"listItem","spread":true,"position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"inlineMath","value":"P(s, a)","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"P(s,a)P(s, a)P(s,a)","key":"gujw1MdpIi"},{"type":"text","value":" for a ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"nuSlmZMRu4"},{"type":"emphasis","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"nonterminal","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"DCUWWkrVHN"}],"key":"VgOfsw98kJ"},{"type":"text","value":" state ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"ZZ9X7EtdtK"},{"type":"inlineMath","value":"s","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"sss","key":"RDo8R6LvpN"},{"type":"text","value":" is simply the board with the symbol and square specified by ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"l1LHNh7WEs"},{"type":"inlineMath","value":"a","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"aaa","key":"KMVPVwfa5C"},{"type":"text","value":" marked into ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"LYUNxQmldS"},{"type":"inlineMath","value":"s","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"sss","key":"hl8FRGG8Y4"},{"type":"text","value":". Otherwise, if ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"Ht8efQbxx4"},{"type":"inlineMath","value":"s","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"sss","key":"UftON1JN3a"},{"type":"text","value":" is a ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"ZnejDmOuea"},{"type":"emphasis","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"children":[{"type":"text","value":"terminal","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"aoRiFTPbmR"}],"key":"oWwMy4yRN8"},{"type":"text","value":" state, i.e. 
it already has three symbols in a row, the state no longer changes.","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"VH7rcmlw2G"}],"key":"bPTQdNjwhw"},{"type":"listItem","spread":true,"position":{"start":{"line":116,"column":1},"end":{"line":117,"column":1}},"children":[{"type":"inlineMath","value":"r(s)","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"html":"r(s)r(s)r(s)","key":"HhCoZVMAaM"},{"type":"text","value":" at a ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"qs1FrUGr75"},{"type":"emphasis","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"children":[{"type":"text","value":"terminal","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"P8XsqpXfA5"}],"key":"VefEYDZSqF"},{"type":"text","value":" state is ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"kKP6a6JZm7"},{"type":"text","value":"+1","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"v74rzp3ASW"},{"type":"text","value":" if there are three Xs in a row, ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"xPo4auod06"},{"type":"text","value":"-1","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"LspZoFj5pA"},{"type":"text","value":" if there are three Os in a row, and ","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"RjC7x9mWyu"},{"type":"text","value":"0","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"y0rIBYEw6x"},{"type":"text","value":" otherwise.","position":{"start":{"line":116,"column":1},"end":{"line":116,"column":1}},"key":"Gwoaa9FXNd"}],"key":"SD1Fk7IWav"}],"key":"RMbjwkFVFQ"},{"type":"paragraph","position":{"start":{"line":118,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"text","value":"Our notation may remind you of ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"XfrxIN0yYF"},{"type":"link","url":"/mdps","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"Markov decision processes","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"CfNGCrTFql"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"rxV5RZwOgX"},{"type":"text","value":".\nGiven that these games also involve a sequence of states and actions,\ncan we formulate them as finite-horizon MDPs?\nThe two settings are not exactly analogous,\nsince in MDPs we only consider a ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"ycRT0VMhkm"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"single","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"wxjfusMAA3"}],"key":"omVenZb0Ob"},{"type":"text","value":" policy,\nwhile these games involve two distinct players with opposite objectives.\nSince we want to analyze the behavior of ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"XM3S9ZxaXJ"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"both","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"YvIKJ7LJBN"}],"key":"QyBUNSQQwF"},{"type":"text","value":" players 
at the same time,\ndescribing such a game as an MDP is more trouble than it’s worth.","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"AfT703E779"}],"key":"iSLxJGOsNh"},{"type":"heading","depth":2,"position":{"start":{"line":128,"column":1},"end":{"line":128,"column":1}},"children":[{"type":"text","value":"Min-max search *","position":{"start":{"line":128,"column":1},"end":{"line":128,"column":1}},"key":"vkmHM73lv1"}],"label":"min-max-search","identifier":"min-max-search","html_id":"min-max-search","enumerator":"8.3","key":"utJpfIY3Nw"},{"type":"admonition","kind":"important","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Important","key":"hjcISDrBws"}],"key":"GbGOwfvL7X"},{"type":"paragraph","position":{"start":{"line":131,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"The course (Fall 2024) does not cover min-max search.\nThis content is here to provide background on ","position":{"start":{"line":131,"column":1},"end":{"line":131,"column":1}},"key":"mqz4zxUTki"},{"type":"emphasis","position":{"start":{"line":131,"column":1},"end":{"line":131,"column":1}},"children":[{"type":"text","value":"optimally","position":{"start":{"line":131,"column":1},"end":{"line":131,"column":1}},"key":"NmYKEbhqQi"}],"key":"yzngu4XDjq"},{"type":"text","value":" solving these deterministic, zero-sum, two-player games.","position":{"start":{"line":131,"column":1},"end":{"line":131,"column":1}},"key":"cOqzksOKbr"}],"key":"BpUWkLvY9y"}],"key":"upr7iOg3zI"},{"type":"paragraph","position":{"start":{"line":135,"column":1},"end":{"line":145,"column":1}},"children":[{"type":"text","value":"In the introduction,\nwe claimed that we could win any potentially winnable game by looking ahead and predicting the opponent’s actions.\nThis would mean that each ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"O9wNFOpMMU"},{"type":"emphasis","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"children":[{"type":"text","value":"nonterminal","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"OyjlfPq3HM"}],"key":"f9ldmQp5rV"},{"type":"text","value":" state already has some predetermined game score,\nthat is, in each state,\nit is already “obvious” which player is going to win.\nLet ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"t85z1BVt7E"},{"type":"inlineMath","value":"V_\\hi^\\star(s)","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"html":"Vh(s)V_\\hi^\\star(s)Vh(s)","key":"efRuC8oiVM"},{"type":"text","value":" denote the game score under optimal play starting in state ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"rRRpn1wUFd"},{"type":"inlineMath","value":"s","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"html":"sss","key":"DVHwGQwAzh"},{"type":"text","value":" at time ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"JmzHOPHNXF"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"html":"h\\hih","key":"ayrIvcb1Mk"},{"type":"text","value":".\nWe can compute this by starting at the terminal states,\nwhen the game’s outcome is known,\nand working backwards,\nassuming that Max chooses the action that leads to the highest score\nand Min chooses the action that leads to the lowest 
score.","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"dlNSRwkFLO"}],"key":"ZO70CBYG0Q"},{"type":"proof","kind":"algorithm","label":"min-max-value","identifier":"min-max-value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Min-max search algorithm","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"NVhRVn9wPF"}],"key":"SkRB3W5SPO"},{"type":"math","value":"V_\\hi^{\\star}(s) = \\begin{cases}\nr(s) & \\hi = \\hor \\\\\n\\max_{a \\in \\mathcal{A}(s)} V_{\\hi+1}^{\\star}(P(s, a)) & h \\text{ is even and } h < H \\\\\n\\min_{a \\in \\mathcal{A}(s)} V_{\\hi+1}^{\\star}(P(s, a)) & h \\text{ is odd and } h < H \\\\\n\\end{cases}","position":{"start":{"line":150,"column":1},"end":{"line":156,"column":1}},"html":"Vh(s)={r(s)h=HmaxaA(s)Vh+1(P(s,a))h is even and h<HminaA(s)Vh+1(P(s,a))h is odd and h<HV_\\hi^{\\star}(s) = \\begin{cases}\nr(s) & \\hi = \\hor \\\\\n\\max_{a \\in \\mathcal{A}(s)} V_{\\hi+1}^{\\star}(P(s, a)) & h \\text{ is even and } h < H \\\\\n\\min_{a \\in \\mathcal{A}(s)} V_{\\hi+1}^{\\star}(P(s, a)) & h \\text{ is odd and } h < H \\\\\n\\end{cases}Vh(s)=r(s)maxaA(s)Vh+1(P(s,a))minaA(s)Vh+1(P(s,a))h=Hh is even and h<Hh is odd and h<H","enumerator":"8.1","key":"GozfG3N0Xo"}],"enumerator":"8.1","html_id":"min-max-value","key":"RSkqY3iEr7"},{"type":"paragraph","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"children":[{"type":"text","value":"This translates directly into a recursive depth-first search algorithm for searching the complete game tree.","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"gtCq7su4Qt"}],"key":"LuFsgZzsOo"},{"type":"code","lang":"python","value":"def minimax_search(s, player) -> Tuple[\"Action\", \"Value\"]:\n \"\"\"Return the value of the state (for Max) and the best action for Max to take.\"\"\"\n if env.is_terminal(s):\n return None, env.winner(s)\n\n if player is max:\n a_max, v_max = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), min)\n if v > v_max:\n a_max, v_max = a, v\n return a_max, v_max\n else:\n a_min, v_min = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), max)\n if v < v_min:\n a_min, v_min = a, v\n return a_min, v_min","position":{"start":{"line":161,"column":1},"end":{"line":181,"column":1}},"key":"r1vusfAFrd"},{"type":"proof","kind":"example","label":"min-max-example","identifier":"min-max-example","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Min-max search for a simple game","position":{"start":{"line":183,"column":1},"end":{"line":183,"column":1}},"key":"IULJuDeFjU"}],"key":"GpnOstJe6f"},{"type":"paragraph","position":{"start":{"line":186,"column":1},"end":{"line":189,"column":1}},"children":[{"type":"text","value":"Consider a simple game: Max chooses one of three possible actions (A, B, C),\nMin chooses one of three possible actions (D, E, F),\nand the combination leads to a certain integer outcome,\nshown in the table 
below:","position":{"start":{"line":186,"column":1},"end":{"line":186,"column":1}},"key":"wlxL2nghiT"}],"key":"S1YW5CTDeV"},{"type":"table","position":{"start":{"line":191,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"tableRow","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"tableCell","header":true,"position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[],"key":"B79fy6yLMe"},{"type":"tableCell","header":true,"position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"D","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"KM57oTdVdo"}],"key":"hKciCSrWwB"},{"type":"tableCell","header":true,"position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"E","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"x7LoIb3FqI"}],"key":"lrsAXlwi5s"},{"type":"tableCell","header":true,"position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"children":[{"type":"text","value":"F","position":{"start":{"line":191,"column":1},"end":{"line":191,"column":1}},"key":"PI19rrTOV5"}],"key":"pCs4PjAC8i"}],"key":"yJUp4wseSM"},{"type":"tableRow","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"children":[{"type":"tableCell","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"children":[{"type":"text","value":"A","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"key":"TZAmDQdUZ9"}],"key":"lAWveqQUsL"},{"type":"tableCell","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"children":[{"type":"text","value":"4","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"key":"DQHzawHgI1"}],"key":"QvMv20LsTQ"},{"type":"tableCell","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"children":[{"type":"text","value":"-2","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"key":"umyj6I6GOC"}],"key":"kNQLIBszC9"},{"type":"tableCell","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"children":[{"type":"text","value":"5","position":{"start":{"line":193,"column":1},"end":{"line":193,"column":1}},"key":"UtvGpbtf9a"}],"key":"jqMyuyurNv"}],"key":"LzRCkVTGP2"},{"type":"tableRow","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"tableCell","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"B","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"key":"Kt5EDRdQd5"}],"key":"u25Ux6Q1t6"},{"type":"tableCell","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"-3","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"key":"eadYS7Hpi2"}],"key":"x02bLzWjzj"},{"type":"tableCell","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"3","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"key":"zKAIYZWwL8"}],"key":"h9L3SUkCSz"},{"type":"tableCell","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":194,"column":1},"end":{"line":194,"column":1}},"key":"i3xxSrKf7E"}],"key":"a62qvlx4q3"}],"key":"NqdXNJyv4G"},{"type":"t
ableRow","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"tableCell","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"text","value":"C","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"key":"VFDjNzymAi"}],"key":"ZMoldG39Pw"},{"type":"tableCell","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"text","value":"0","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"key":"FlNALbtrP4"}],"key":"Md8sw5AVoJ"},{"type":"tableCell","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"text","value":"3","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"key":"t03P90ZUXl"}],"key":"dzeAxVbOYR"},{"type":"tableCell","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"text","value":"-1","position":{"start":{"line":195,"column":1},"end":{"line":195,"column":1}},"key":"YL6FkOsuej"}],"key":"Cs3D8v2ULT"}],"key":"GSW4TPsQY6"}],"key":"DoRgUb05gS"},{"type":"paragraph","position":{"start":{"line":197,"column":1},"end":{"line":199,"column":1}},"children":[{"type":"text","value":"We can visualize this as the following complete game tree,\nwhere each box contains the value ","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"key":"CjbuxBtMc2"},{"type":"inlineMath","value":"V_\\hi^\\star(s)","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"html":"Vh(s)V_\\hi^\\star(s)Vh(s)","key":"Mc21uy9Sgi"},{"type":"text","value":" of that node.\nThe min-max values of the terminal states are already known:","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"key":"jYmEm6i0CB"}],"key":"L9kaA4mRsO"},{"type":"image","url":"/build/minmax-70b17e866836d498d3d814fd3fc3d9e3.png","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"IcqggvlJhT","urlSource":"./shared/minmax.png","urlOptimized":"/build/minmax-70b17e866836d498d3d814fd3fc3d9e3.webp"},{"type":"paragraph","position":{"start":{"line":203,"column":1},"end":{"line":207,"column":1}},"children":[{"type":"text","value":"We begin min-max search at the root,\nexploring each of Max’s actions.\nSuppose Max chooses action A.\nThen Min will choose action E to minimize the game score,\nmaking the value of this game node ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"i4vZYdXL1U"},{"type":"inlineMath","value":"\\min(4, -2, 5) = -2","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"min(4,2,5)=2\\min(4, -2, 5) = -2min(4,2,5)=2","key":"Tke0T5ybMU"},{"type":"text","value":".","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"uZ6gvi4rHF"}],"key":"S2sp1e3E3Y"},{"type":"image","url":"/build/minmax-2-d2c05b455ad2a4aef499542eadb0515d.png","position":{"start":{"line":209,"column":1},"end":{"line":209,"column":1}},"key":"vuYwMkXHlS","urlSource":"./shared/minmax-2.png","urlOptimized":"/build/minmax-2-d2c05b455ad2a4aef499542eadb0515d.webp"},{"type":"paragraph","position":{"start":{"line":211,"column":1},"end":{"line":215,"column":1}},"children":[{"type":"text","value":"Similarly, if Max chooses action A,\nthen Min will choose action D,\nand if Max chooses action C,\nthen Min will choose action F.\nWe can fill in the values of these nodes 
accordingly:","position":{"start":{"line":211,"column":1},"end":{"line":211,"column":1}},"key":"PP0Ex1HTJC"}],"key":"dS4mKXIDYm"},{"type":"image","url":"/build/minmax-3-f38c4f0467ce1216f1438052ec8a7d85.png","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"oNNZdrOCaj","urlSource":"./shared/minmax-3.png","urlOptimized":"/build/minmax-3-f38c4f0467ce1216f1438052ec8a7d85.webp"},{"type":"paragraph","position":{"start":{"line":219,"column":1},"end":{"line":220,"column":1}},"children":[{"type":"text","value":"Thus, Max’s best move is to take action C,\nresulting in a game score of ","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"key":"joIPZ81Fvi"},{"type":"inlineMath","value":"\\max(-2, -3, -1) = -1","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"html":"max(2,3,1)=1\\max(-2, -3, -1) = -1max(2,3,1)=1","key":"JuRiQS4amq"},{"type":"text","value":".","position":{"start":{"line":219,"column":1},"end":{"line":219,"column":1}},"key":"IyvSBd1uQE"}],"key":"feqdjePyUW"},{"type":"image","url":"/build/minmax-4-013da4f214c0c822edc5b0e2b62d2f2a.png","position":{"start":{"line":222,"column":1},"end":{"line":222,"column":1}},"key":"grUIqd1aR9","urlSource":"./shared/minmax-4.png","urlOptimized":"/build/minmax-4-013da4f214c0c822edc5b0e2b62d2f2a.webp"}],"enumerator":"8.1","html_id":"min-max-example","key":"qvXyd4MROr"},{"type":"heading","depth":3,"position":{"start":{"line":225,"column":1},"end":{"line":225,"column":1}},"children":[{"type":"text","value":"Complexity of min-max search","position":{"start":{"line":225,"column":1},"end":{"line":225,"column":1}},"key":"Gp7ktxrg0u"}],"identifier":"complexity-of-min-max-search","label":"Complexity of min-max search","html_id":"complexity-of-min-max-search","implicit":true,"enumerator":"8.3.1","key":"UikbYmSNE4"},{"type":"paragraph","position":{"start":{"line":227,"column":1},"end":{"line":231,"column":1}},"children":[{"type":"text","value":"At each of the ","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"QnLwZHVHnn"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"html":"H\\horH","key":"QuY0EYNA6F"},{"type":"text","value":" timesteps,\nthis algorithm iterates through the entire action space at that state,\nand therefore has a time complexity of ","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"mevcGpOY8v"},{"type":"inlineMath","value":"\\hor^{n_A}","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"html":"HnA\\hor^{n_A}HnA","key":"hT1x1p5kSh"},{"type":"text","value":"\n(where ","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"xi94nHhgWM"},{"type":"inlineMath","value":"n_A","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"html":"nAn_AnA","key":"v07EJHi865"},{"type":"text","value":" is the largest number of actions possibly available at once).\nThis makes the min-max algorithm impractical for even moderately sized games.","position":{"start":{"line":227,"column":1},"end":{"line":227,"column":1}},"key":"yT9pcH9tAX"}],"key":"D2Ezs8vCAF"},{"type":"paragraph","position":{"start":{"line":233,"column":1},"end":{"line":236,"column":1}},"children":[{"type":"text","value":"But do we need to compute the exact value of 
","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"LNNP5wz0Oh"},{"type":"emphasis","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"children":[{"type":"text","value":"every","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"mGmPjMeOpa"}],"key":"XyP1YK0zzC"},{"type":"text","value":" possible state?\nInstead, is there some way we could “ignore” certain actions and their subtrees\nif we already know of better options?\nThe ","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"ec21bqWkuK"},{"type":"strong","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"children":[{"type":"text","value":"alpha-beta search","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"YstXB7cKxv"}],"key":"ElcooePJUC"},{"type":"text","value":" makes use of this intuition.","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"Fip2NTPYFQ"}],"key":"K9YxDfALJq"},{"type":"heading","depth":2,"position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"children":[{"type":"text","value":"Alpha-beta search","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"key":"KE5vD6yXec"}],"label":"alpha-beta-search","identifier":"alpha-beta-search","html_id":"alpha-beta-search","enumerator":"8.4","key":"jW6Srgxo4o"},{"type":"paragraph","position":{"start":{"line":241,"column":1},"end":{"line":245,"column":1}},"children":[{"type":"text","value":"The intuition behind alpha-beta search is as follows:\nSuppose Max is in state ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"bUjY34N5Ah"},{"type":"inlineMath","value":"s","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"sss","key":"RO8D4CClUS"},{"type":"text","value":",\nand considering whether to take action ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"vm6xpl9BK1"},{"type":"inlineMath","value":"a","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"aaa","key":"ffNqSKR9GW"},{"type":"text","value":" or ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"kvZASw2O9Z"},{"type":"inlineMath","value":"a'","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"aa'a","key":"J0vJlx9nk3"},{"type":"text","value":".\nIf at any point they find out that action ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"ZG1Zmz0Ogy"},{"type":"inlineMath","value":"a'","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"aa'a","key":"LZ4Ke8iEum"},{"type":"text","value":" is definitely worse than (or equal to) action ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"WbuE1yP4Tj"},{"type":"inlineMath","value":"a","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"aaa","key":"MfGTcuqVzw"},{"type":"text","value":",\nthey don’t need to evaluate action ","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"E4KiFQMXxY"},{"type":"inlineMath","value":"a'","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"html":"aa'a","key":"u3XpHZGRxe"},{"type":"text","value":" any 
further.","position":{"start":{"line":241,"column":1},"end":{"line":241,"column":1}},"key":"T60kfBaKpK"}],"key":"uUHkdOHAFj"},{"type":"paragraph","position":{"start":{"line":247,"column":1},"end":{"line":264,"column":1}},"children":[{"type":"text","value":"Concretely, we run min-max search as above,\nexcept now we keep track of two additional parameters ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"jLS01Xu1a8"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"HGrG3EjDh6"},{"type":"text","value":" and ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"pXQ4itIaz9"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"BchAExTGCi"},{"type":"text","value":" while evaluating each state.\nSuppose we are evaluating ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"VfOOUBRjTt"},{"type":"inlineMath","value":"V^\\star_\\hi(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"Vh(s)V^\\star_\\hi(s)Vh(s)","key":"wxEwUusNN1"},{"type":"text","value":",\nwhere it is Max’s turn (","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"EAjXPSFZwN"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"h\\hih","key":"kBHawtEQFA"},{"type":"text","value":" is even).\nWe update ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"tmM7zYw1Al"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"oUdlEt3LUI"},{"type":"text","value":" to be the ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"WemWfpDQqp"},{"type":"emphasis","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"highest","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"klV4BTNGqi"}],"key":"ACoyGrgLtW"},{"type":"text","value":" value achievable from ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"wJC4VIv4dk"},{"type":"inlineMath","value":"s","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"sss","key":"sHkIoMt6k1"},{"type":"text","value":" so far.\nThat is, the value of ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"MEG2nfpB7E"},{"type":"inlineMath","value":"s","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"sss","key":"sCdMryxVzP"},{"type":"text","value":" is ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"f2p59lnNpT"},{"type":"emphasis","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"at least","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"rU2FpPBddy"}],"key":"w6T45H4yEa"},{"type":"text","value":" ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"l2CD61fhWV"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"h3H85eDUTD"},{"type":"text","value":".\nSuppose Max chooses action 
","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"OVO7Jchd7O"},{"type":"inlineMath","value":"a","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aaa","key":"Xccld5vQt1"},{"type":"text","value":", which leads to state ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"y8fmcKrFKE"},{"type":"inlineMath","value":"s'","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"ss's","key":"qAZjz1Jy8G"},{"type":"text","value":", in which it is Min’s turn.\nIf any of Min’s actions in ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"t7jbTGzB2m"},{"type":"inlineMath","value":"s'","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"ss's","key":"bjpc7JSohv"},{"type":"text","value":" achieve a value ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"LuL0KTfctl"},{"type":"inlineMath","value":"V^\\star_{\\hi+1}(s') \\le \\alpha(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"Vh+1(s)α(s)V^\\star_{\\hi+1}(s') \\le \\alpha(s)Vh+1(s)α(s)","key":"nGb4GZAWux"},{"type":"text","value":",\nwe know that Max would not choose action ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"n9nPrgcKAe"},{"type":"inlineMath","value":"a","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aaa","key":"i1QpxVhAzc"},{"type":"text","value":",\nsince they know that it is ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"XCXr0AQHHE"},{"type":"emphasis","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"worse","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"hyPr0E9vDF"}],"key":"JCIt75ctSJ"},{"type":"text","value":" than whichever action gave the value ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"TnTOkeiLC6"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"y0GGKKhP65"},{"type":"text","value":".\nSimilarly, to evaluate a state on Min’s turn,\nwe update ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"p60h1wEK8r"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"qzBkQ2TiaX"},{"type":"text","value":" to be the ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"YYvakLwmXS"},{"type":"emphasis","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"lowest","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"JLktkIgQ3I"}],"key":"kPlHGpECvw"},{"type":"text","value":" value achievable from ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"SbjDJs8puf"},{"type":"inlineMath","value":"s","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"sss","key":"Ogeu4tWBgC"},{"type":"text","value":" so far.\nThat is, the value of 
","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"UZgcMppSwe"},{"type":"inlineMath","value":"s","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"sss","key":"hgwJdhiTrE"},{"type":"text","value":" is ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"rtc6tldQdN"},{"type":"emphasis","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"at most","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"F96U5XuK1B"}],"key":"QwKVoDcPnT"},{"type":"text","value":" ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"LoVrYFo1ZD"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"l7Um8QP99P"},{"type":"text","value":".\nSuppose Min chooses action ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"YBMA9rHnb0"},{"type":"inlineMath","value":"a","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aaa","key":"T9hAI0s3TH"},{"type":"text","value":",\nwhich leads to state ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"oEzpeV0hVX"},{"type":"inlineMath","value":"s'","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"ss's","key":"GNFiEITQOv"},{"type":"text","value":" for Max.\nIf Max has any actions that do ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"B8uLq1tdag"},{"type":"emphasis","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"children":[{"type":"text","value":"better","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"XHokj86AlL"}],"key":"vEH5KVsIAC"},{"type":"text","value":" than ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"PpgKUriAbo"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"MUFfRbrZBa"},{"type":"text","value":",\nthey would take it,\nmaking action ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"DvrG57l8Ld"},{"type":"inlineMath","value":"a","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aaa","key":"xAIJt3nPvB"},{"type":"text","value":" a suboptimal choice for Min.","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"pR1LDwiR7i"}],"key":"kxgYj5OXze"},{"type":"proof","kind":"example","label":"alpha-beta-example","identifier":"alpha-beta-example","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Alpha-beta search for a simple game","position":{"start":{"line":266,"column":1},"end":{"line":266,"column":1}},"key":"pE5PelfBmq"}],"key":"nqRIm2iIz5"},{"type":"paragraph","position":{"start":{"line":269,"column":1},"end":{"line":273,"column":1}},"children":[{"type":"text","value":"Let us use the same simple game from ","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"key":"zfLgNHpwt7"},{"type":"crossReference","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"children":[{"type":"text","value":"Example 
","key":"PS1pPtDZV6"},{"type":"text","value":"8.1","key":"hvFCEE3RvP"}],"identifier":"min-max-example","label":"min-max-example","kind":"proof:example","template":"Example %s","enumerator":"8.1","resolved":true,"html_id":"min-max-example","key":"xWcWOdxMQ0"},{"type":"text","value":".\nWe list the values of ","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"key":"rzgtC9o3nn"},{"type":"inlineMath","value":"\\alpha(s), \\beta(s)","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"html":"α(s),β(s)\\alpha(s), \\beta(s)α(s),β(s)","key":"tIIptduMp6"},{"type":"text","value":" in each node throughout the algorithm.\nThese values are initialized to ","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"key":"WR7atgBCQD"},{"type":"inlineMath","value":"-\\infty, +\\infty","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"html":",+-\\infty, +\\infty,+","key":"MBLEAUWpRt"},{"type":"text","value":" respectively.\nWe shade any squares that have not been visited by the algorithm,\nand we assume that actions are evaluated from left to right.","position":{"start":{"line":269,"column":1},"end":{"line":269,"column":1}},"key":"JzYjrcbU73"}],"key":"amTns9nhPA"},{"type":"image","url":"/build/alpha-beta-0-7ad590b6317a7a6f64b4e368eda30e33.png","position":{"start":{"line":275,"column":1},"end":{"line":275,"column":1}},"key":"RIA11m51dx","urlSource":"./shared/alpha-beta-0.png","urlOptimized":"/build/alpha-beta-0-7ad590b6317a7a6f64b4e368eda30e33.webp"},{"type":"paragraph","position":{"start":{"line":277,"column":1},"end":{"line":280,"column":1}},"children":[{"type":"text","value":"Suppose Max takes action A. Let ","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"key":"PsgFxwGC7x"},{"type":"inlineMath","value":"s'","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"html":"ss's","key":"uFOsKFFVjV"},{"type":"text","value":" be the resulting game state.\nThe values of ","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"key":"BM2WzFseVD"},{"type":"inlineMath","value":"\\alpha(s')","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"html":"α(s)\\alpha(s')α(s)","key":"YxHsehyqms"},{"type":"text","value":" and ","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"key":"c1zdnOtWdK"},{"type":"inlineMath","value":"\\beta(s')","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"html":"β(s)\\beta(s')β(s)","key":"oqU3yIrftS"},{"type":"text","value":"\nare initialized at the same values as the root state,\nsince we want to prune a subtree if there exists a better action at any step higher in the tree.","position":{"start":{"line":277,"column":1},"end":{"line":277,"column":1}},"key":"LO8DzgUVvH"}],"key":"lRLLsnmpzk"},{"type":"image","url":"/build/alpha-beta-1-b9d0c4a2b1ab3150a403c943682c4a80.png","position":{"start":{"line":282,"column":1},"end":{"line":282,"column":1}},"key":"pr12oZaFh9","urlSource":"./shared/alpha-beta-1.png","urlOptimized":"/build/alpha-beta-1-b9d0c4a2b1ab3150a403c943682c4a80.webp"},{"type":"paragraph","position":{"start":{"line":284,"column":1},"end":{"line":285,"column":1}},"children":[{"type":"text","value":"Then we iterate through Min’s possible actions,\nupdating the value of 
","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"hz5PrqTWdU"},{"type":"inlineMath","value":"\\beta(s')","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"html":"β(s)\\beta(s')β(s)","key":"a1IAlckrBW"},{"type":"text","value":" as we go.","position":{"start":{"line":284,"column":1},"end":{"line":284,"column":1}},"key":"qeDYFdik3X"}],"key":"ZJO2dhL7OV"},{"type":"paragraph","position":{"start":{"line":287,"column":1},"end":{"line":288,"column":1}},"children":[{"type":"image","url":"/build/alpha-beta-2-b0d0597f3562685a2759d1d56f661682.png","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"u50H0sIsBj","urlSource":"./shared/alpha-beta-2.png","urlOptimized":"/build/alpha-beta-2-b0d0597f3562685a2759d1d56f661682.webp"},{"type":"text","value":"\n","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"PCygiCMW5e"},{"type":"image","url":"/build/alpha-beta-3-fcd7a3fcb02f86c22e47c8168d151549.png","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"mPyumquQ8b","urlSource":"./shared/alpha-beta-3.png","urlOptimized":"/build/alpha-beta-3-fcd7a3fcb02f86c22e47c8168d151549.webp"}],"key":"H3aYhrO4lR"},{"type":"paragraph","position":{"start":{"line":290,"column":1},"end":{"line":292,"column":1}},"children":[{"type":"text","value":"Once the value of state ","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"KhrHCEnoRx"},{"type":"inlineMath","value":"s'","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"html":"ss's","key":"diCXohMQrY"},{"type":"text","value":" is fully evaluated,\nwe know that Max can achieve a value of ","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"VCcgIhD2XK"},{"type":"emphasis","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"children":[{"type":"text","value":"at least","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"NeJZHae4IX"}],"key":"f2PcP7vZqg"},{"type":"text","value":" ","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"dAQk9QlMMI"},{"type":"text","value":"-2","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"CQFjqsn46Z"},{"type":"text","value":" starting from the root,\nand so we update ","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"FZTjPksKLq"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"BfwL4k2ayP"},{"type":"text","value":", where ","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"M497DW703W"},{"type":"inlineMath","value":"s","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"html":"sss","key":"etTmb5L4wn"},{"type":"text","value":" is the root state:","position":{"start":{"line":290,"column":1},"end":{"line":290,"column":1}},"key":"e6pTHrgtxd"}],"key":"FgI0ch9l03"},{"type":"image","url":"/build/alpha-beta-4-e3958ef0c8cbcb3b559e8a63d1cc1e6b.png","position":{"start":{"line":294,"column":1},"end":{"line":294,"column":1}},"key":"EcNf9eN1OY","urlSource":"./shared/alpha-beta-4.png","urlOptimized":"/build/alpha-beta-4-e3958ef0c8cbcb3b559e8a63d1cc1e6b.webp"},{"type":"paragraph","position":{"start":{"line":296,"column":1},"end":{"line":297,"column":1}},"children":[{"type":"text","value":"Then Max imagines taking action B. 
Again, let ","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"wuCC0tnLVq"},{"type":"inlineMath","value":"s'","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"html":"ss's","key":"CoyvvY8xqk"},{"type":"text","value":" denote the resulting game state.\nWe initialize ","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"k7l21aeHHA"},{"type":"inlineMath","value":"\\alpha(s')","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"html":"α(s)\\alpha(s')α(s)","key":"GalhcMByqk"},{"type":"text","value":" and ","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"wuUUZqAAdY"},{"type":"inlineMath","value":"\\beta(s')","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"html":"β(s)\\beta(s')β(s)","key":"PMKKSIo7id"},{"type":"text","value":" from the root:","position":{"start":{"line":296,"column":1},"end":{"line":296,"column":1}},"key":"A8mYMDxWSe"}],"key":"XkVgTFvxIE"},{"type":"image","url":"/build/alpha-beta-5-f16710428d22fbb7c1a5dbc054a71a7c.png","position":{"start":{"line":299,"column":1},"end":{"line":299,"column":1}},"key":"vG79rIHSVW","urlSource":"./shared/alpha-beta-5.png","urlOptimized":"/build/alpha-beta-5-f16710428d22fbb7c1a5dbc054a71a7c.webp"},{"type":"paragraph","position":{"start":{"line":301,"column":1},"end":{"line":309,"column":1}},"children":[{"type":"text","value":"Now suppose Min takes action D, resulting in a value of ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"lipUYAHKX0"},{"type":"text","value":"-3","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"SIAsRNZdHp"},{"type":"text","value":".\nWe see that ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"JGSkA854Sa"},{"type":"inlineMath","value":"V^\\star_\\hi(s') = \\min(-3, x, y)","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"Vh(s)=min(3,x,y)V^\\star_\\hi(s') = \\min(-3, x, y)Vh(s)=min(3,x,y)","key":"ydxXJmA978"},{"type":"text","value":",\nwhere ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"REcB8JO4O0"},{"type":"inlineMath","value":"x","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"xxx","key":"o07Qj9AJ27"},{"type":"text","value":" and ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"A87VqSLS8S"},{"type":"inlineMath","value":"y","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"yyy","key":"j7f4BUr3GR"},{"type":"text","value":" are the values of the remaining two actions.\nBut since ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"qtpr2DDPgE"},{"type":"inlineMath","value":"\\min(-3, x, y) \\le -3","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"min(3,x,y)3\\min(-3, x, y) \\le -3min(3,x,y)3","key":"edAUCcrmbJ"},{"type":"text","value":",\nwe know that the value of ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"w1iz31qpCK"},{"type":"inlineMath","value":"s'","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"ss's","key":"trUAhbujdU"},{"type":"text","value":" is at most 
","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"Gemk1rlqIb"},{"type":"text","value":"-3","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"RKQoLl6SPf"},{"type":"text","value":".\nBut Max can achieve a better value of ","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"utmHB1Hh9z"},{"type":"inlineMath","value":"\\alpha(s') = -2","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"html":"α(s)=2\\alpha(s') = -2α(s)=2","key":"AGhopR5S3k"},{"type":"text","value":" by taking action A,\nand so Max will never take action B,\nand we can prune the search here.\nWe will use dotted lines to indicate states that have been ruled out from the search:","position":{"start":{"line":301,"column":1},"end":{"line":301,"column":1}},"key":"MXYfIhyQZc"}],"key":"TBMYCAD8Z9"},{"type":"image","url":"/build/alpha-beta-6-1f7516f925d212dc9290ccf221a7d28e.png","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"OkWBi60rBF","urlSource":"./shared/alpha-beta-6.png","urlOptimized":"/build/alpha-beta-6-1f7516f925d212dc9290ccf221a7d28e.webp"},{"type":"paragraph","position":{"start":{"line":313,"column":1},"end":{"line":316,"column":1}},"children":[{"type":"text","value":"Finally, suppose Max takes action C.\nFor Min’s actions D and E,\nthere is still a chance that action C might outperform action A,\nso we continue expanding:","position":{"start":{"line":313,"column":1},"end":{"line":313,"column":1}},"key":"A3bNfrkZfm"}],"key":"vlPh2hjDlj"},{"type":"paragraph","position":{"start":{"line":318,"column":1},"end":{"line":319,"column":1}},"children":[{"type":"image","url":"/build/alpha-beta-7-648c7023e2fdb207fac5a83dbd8abd64.png","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"UxzmOiBJYG","urlSource":"./shared/alpha-beta-7.png","urlOptimized":"/build/alpha-beta-7-648c7023e2fdb207fac5a83dbd8abd64.webp"},{"type":"text","value":"\n","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"I73mjAqYBa"},{"type":"image","url":"/build/alpha-beta-8-fb8654bf1f1f361f3098f7a2c0ace9bd.png","position":{"start":{"line":318,"column":1},"end":{"line":318,"column":1}},"key":"hRWob9rcj8","urlSource":"./shared/alpha-beta-8.png","urlOptimized":"/build/alpha-beta-8-fb8654bf1f1f361f3098f7a2c0ace9bd.webp"}],"key":"uRHqxKo5ZE"},{"type":"paragraph","position":{"start":{"line":321,"column":1},"end":{"line":323,"column":1}},"children":[{"type":"text","value":"Finally, we see that Min taking action F achieves the minimum value at this state.\nThis shows that optimal play is for Max to take action C,\nand Min to take action F.","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"key":"vPstVy1zVi"}],"key":"sCWHZQzdTP"},{"type":"image","url":"/build/alpha-beta-9-f7d61365563b59cdcecc22ca3e301bc6.png","position":{"start":{"line":325,"column":1},"end":{"line":325,"column":1}},"key":"wYv4HbJAZx","urlSource":"./shared/alpha-beta-9.png","urlOptimized":"/build/alpha-beta-9-f7d61365563b59cdcecc22ca3e301bc6.webp"}],"enumerator":"8.2","html_id":"alpha-beta-example","key":"AgNKChUzUM"},{"type":"code","lang":"python","value":"def alpha_beta_search(s, player, alpha, beta) -> Tuple[\"Action\", \"Value\"]:\n \"\"\"Return the value of the state (for Max) and the best action for Max to take.\"\"\"\n if env.is_terminal(s):\n return None, env.winner(s)\n\n if player is max:\n a_max, v_max = None, None\n for a in actions:\n _, v = 
alpha_beta_search(env.step(s, a), min, alpha, beta)\n            if v_max is None or v > v_max:\n                a_max, v_max = a, v\n                alpha = max(alpha, v)\n            if v_max >= beta:\n                # we know Min will not choose the action that leads to this state\n                return a_max, v_max\n        return a_max, v_max\n\n    else:\n        a_min, v_min = None, None\n        for a in actions:\n            _, v = alpha_beta_search(env.step(s, a), max, alpha, beta)\n            if v_min is None or v < v_min:\n                a_min, v_min = a, v\n                beta = min(beta, v)\n            if v_min <= alpha:\n                # we know Max will not choose the action that leads to this state\n                return a_min, v_min\n        return a_min, v_min","position":{"start":{"line":329,"column":1},"end":{"line":358,"column":1}},"key":"KAffZzDgRj"},{"type":"paragraph","position":{"start":{"line":360,"column":1},"end":{"line":368,"column":1}},"children":[{"type":"text","value":"How do we choose what ","position":{"start":{"line":360,"column":1},"end":{"line":360,"column":1}},"key":"SEIVOecYeJ"},{"type":"emphasis","position":{"start":{"line":360,"column":1},"end":{"line":360,"column":1}},"children":[{"type":"text","value":"order","position":{"start":{"line":360,"column":1},"end":{"line":360,"column":1}},"key":"s991OJZSua"}],"key":"QgoPwDDD4D"},{"type":"text","value":" to explore the branches?\nAs you can tell, this significantly affects the efficiency of the pruning algorithm.\nIf Max explores the possible actions in order from worst to best,\nthey will not be able to prune any branches at all!\nAdditionally, to verify that an action is suboptimal,\nwe must run the search recursively from that action,\nwhich ultimately requires traversing the tree all the way to a leaf node.\nThe longer the game might possibly last,\nthe more computation we have to run.","position":{"start":{"line":360,"column":1},"end":{"line":360,"column":1}},"key":"plUuz1qtwi"}],"key":"CY0biH1hEy"},{"type":"paragraph","position":{"start":{"line":370,"column":1},"end":{"line":373,"column":1}},"children":[{"type":"text","value":"In practice, we can often use background information about the game to develop a ","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"hdK7vkPOMS"},{"type":"strong","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"children":[{"type":"text","value":"heuristic","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"pjDpIsyMDI"}],"key":"f1SCaDyG9W"},{"type":"text","value":" for evaluating possible actions.\nIf a technique is based on background information or intuition,\nespecially if it isn’t rigorously justified,\nwe call it a heuristic.","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"gGdPCYtiPn"}],"key":"uIXPSOVkSa"},{"type":"paragraph","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"children":[{"type":"text","value":"Can we develop ","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"key":"bT3f67dCpy"},{"type":"emphasis","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"children":[{"type":"text","value":"heuristic methods","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"key":"q3nk5Oe3vl"}],"key":"G5UswGVLLG"},{"type":"text","value":" for tree exploration that work for all sorts of games?","position":{"start":{"line":375,"column":1},"end":{"line":375,"column":1}},"key":"lU72GEpAN4"}],"key":"GjpDWOYtRc"},{"type":"comment","value":" Here's where we can incorporate the _reinforcement learning_ 
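To make the interface above concrete, here is a hedged usage sketch (not from the notes): the search is started at the root with the loosest possible bounds, alpha = -inf and beta = +inf, so nothing is pruned until the first subtrees have been evaluated. The `env.initial_state()` helper is hypothetical; the pseudocode above already assumes an `env` exposing `is_terminal`, `winner`, and `step`, and uses the builtins `max`/`min` as tags for the two players.

```python
# Hypothetical usage of alpha_beta_search (assumes the same `env` as above).
root = env.initial_state()          # assumed helper returning the root game state
best_action, value = alpha_beta_search(
    root,
    max,                            # Max moves first
    alpha=-float("inf"),            # Max has no guaranteed value yet
    beta=float("inf"),              # Min has no guaranteed value yet
)
print(f"optimal move for Max: {best_action}, game value: {value}")
```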
","key":"g92yfup9Pd"},{"type":"heading","depth":2,"position":{"start":{"line":379,"column":1},"end":{"line":379,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":379,"column":1},"end":{"line":379,"column":1}},"key":"nXhT8RxuUq"}],"label":"monte-carlo-tree-search","identifier":"monte-carlo-tree-search","html_id":"monte-carlo-tree-search","enumerator":"8.5","key":"FbP3Bp9e6b"},{"type":"paragraph","position":{"start":{"line":381,"column":1},"end":{"line":383,"column":1}},"children":[{"type":"text","value":"The task of evaluating actions in a complex environment might seem familiar.\nWe’ve encountered this problem before in both the ","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"ogtsd0K5Qv"},{"type":"link","url":"/bandits","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"children":[{"type":"text","value":"multi-armed bandits","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"o39QRa9uc3"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"rsINn3ke33"},{"type":"text","value":" setting and the ","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"f7ofTo1UxQ"},{"type":"link","url":"/mdps","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"children":[{"type":"text","value":"Markov decision process","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"wyaRqH1V3K"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"FHrUWe9L8L"},{"type":"text","value":" setting.\nNow we’ll see how to combine concepts from these to form a more general and efficient tree search heuristic called ","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"xlvKQaIBjM"},{"type":"strong","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"Pe7TqUJL7e"}],"key":"NuNb0SVlos"},{"type":"text","value":" (MCTS).","position":{"start":{"line":381,"column":1},"end":{"line":381,"column":1}},"key":"EPqOfskT4U"}],"key":"TTVB0yqK9w"},{"type":"paragraph","position":{"start":{"line":385,"column":1},"end":{"line":390,"column":1}},"children":[{"type":"text","value":"When a problem is intractable to solve ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"O5VrfZNFpK"},{"type":"emphasis","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"exactly","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"Zf3VWebHIE"}],"key":"NWFuFsXKDi"},{"type":"text","value":",\nwe often turn to ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"E54PyqejaX"},{"type":"emphasis","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"g0Z68AjpOV"}],"key":"vwsCnYEB8z"},{"type":"text","value":" algorithms that sacrifice some accuracy in exchange for computational efficiency.\nMCTS also improves on alpha-beta search in this sense.\nAs the name suggests,\nMCTS uses 
","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"YRBaY3jmnn"},{"type":"emphasis","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"Monte Carlo","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"FUCHCVC6Yn"}],"key":"Bk9UTFFqvX"},{"type":"text","value":" simulation, that is, collecting random samples and computing the sample statistics,\nin order to ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"qg9wHYfPu9"},{"type":"emphasis","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"JBhLFK7dgx"}],"key":"rTBDQUvEs6"},{"type":"text","value":" the value of each action.","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"pXNGzAz1aY"}],"key":"eUScOehWtJ"},{"type":"paragraph","position":{"start":{"line":392,"column":1},"end":{"line":398,"column":1}},"children":[{"type":"text","value":"As before, we imagine a complete game tree in which each path represents an ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"XwgFoCkiha"},{"type":"emphasis","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"children":[{"type":"text","value":"entire game","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"QokmfwVKw8"}],"key":"qIY8HDrZqA"},{"type":"text","value":".\nThe goal of MCTS is to assign values to only the game states that are ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"i2uYyXbLrg"},{"type":"emphasis","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"children":[{"type":"text","value":"relevant","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"z6BbMnR6zO"}],"key":"iQXbSStSiX"},{"type":"text","value":" to the ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"ZnEssV1AvR"},{"type":"emphasis","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"children":[{"type":"text","value":"current game","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"Mg6yGFhHwy"}],"key":"sQW5RQJ4cx"},{"type":"text","value":";\nWe gradually expand the tree at each move.\nFor comparison, in alpha-beta search,\nthe entire tree only needs to be solved ","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"aonwzB2xrV"},{"type":"emphasis","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"children":[{"type":"text","value":"once","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"m1B3NEfDpp"}],"key":"rmuwcluHXP"},{"type":"text","value":",\nand from then on,\nchoosing an action is as simple as taking a maximum over the previously computed values.","position":{"start":{"line":392,"column":1},"end":{"line":392,"column":1}},"key":"fNZ07piHD0"}],"key":"lmNk8rG9ta"},{"type":"paragraph","position":{"start":{"line":400,"column":1},"end":{"line":404,"column":1}},"children":[{"type":"text","value":"The crux of MCTS is approximating the win probability of a state by a 
","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"i664pzIidJ"},{"type":"emphasis","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"children":[{"type":"text","value":"sample probability","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"MpPJw5Tmzq"}],"key":"hV4ZSxngEJ"},{"type":"text","value":".\nIn practice, MCTS is used for games with ","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"LOxpwTqqU3"},{"type":"emphasis","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"children":[{"type":"text","value":"binary outcomes","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"Y9lqsMm7jf"}],"key":"nQI1BdWxha"},{"type":"text","value":" where ","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"ETepI8yKIG"},{"type":"inlineMath","value":"r(s) \\in \\{ +1, -1 \\}","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"html":"r(s){+1,1}r(s) \\in \\{ +1, -1 \\}r(s){+1,1}","key":"P9BgkaQ7dD"},{"type":"text","value":",\nand so this is equivalent to approximating the final game score.\nTo approximate the win probability from state ","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"atTU4IpAAi"},{"type":"inlineMath","value":"s","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"html":"sss","key":"rC7R7K6Glz"},{"type":"text","value":",\nMCTS samples random games starting in ","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"ANDzS1oYnX"},{"type":"inlineMath","value":"s","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"html":"sss","key":"v6wFC87tcZ"},{"type":"text","value":" and computes the sample proportion of those that the player wins.","position":{"start":{"line":400,"column":1},"end":{"line":400,"column":1}},"key":"qrHTGeaWfH"}],"key":"KowrZ1Vo1e"},{"type":"paragraph","position":{"start":{"line":406,"column":1},"end":{"line":410,"column":1}},"children":[{"type":"text","value":"Note that, for a given state ","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"rjAdbn0Qiy"},{"type":"inlineMath","value":"s","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"html":"sss","key":"YMWs7MX4Wf"},{"type":"text","value":",\nchoosing the best action ","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"TgXrNi90lB"},{"type":"inlineMath","value":"a","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"html":"aaa","key":"pFqnU4ergC"},{"type":"text","value":" can be framed as a ","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"le2zKc3zl0"},{"type":"link","url":"/bandits","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"children":[{"type":"text","value":"multi-armed bandits","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"TMDathHs4R"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"rikNVGErxr"},{"type":"text","value":" problem,\nwhere each action corresponds to an arm,\nand the reward distribution of arm 
","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"cVFH0vK14a"},{"type":"inlineMath","value":"k","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"html":"kkk","key":"G99Rv2RJ0x"},{"type":"text","value":" is the distribution of the game score over random games after choosing that arm.\nThe most commonly used bandit algorithm in practice for MCTS is the ","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"bOj1jVtMhJ"},{"type":"crossReference","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"children":[{"type":"text","value":"Upper Confidence Bound (UCB)","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"ERzHy7i6WT"}],"identifier":"ucb","label":"ucb","kind":"heading","template":"Section %s","enumerator":"3.6","resolved":true,"html_id":"ucb","remote":true,"url":"/bandits","dataUrl":"/bandits.json","key":"LWyI17x9kT"},{"type":"text","value":" algorithm.","position":{"start":{"line":406,"column":1},"end":{"line":406,"column":1}},"key":"XJci0Ui91y"}],"key":"g0XQeg3fFw"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Summary of UCB","position":{"start":{"line":412,"column":1},"end":{"line":412,"column":1}},"key":"KmgstOTuX8"}],"key":"rqI2cUvBy4"},{"type":"paragraph","position":{"start":{"line":413,"column":1},"end":{"line":425,"column":1}},"children":[{"type":"text","value":"Let us quickly review the UCB bandit algorithm.\nFor each arm ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"pv5PdLiqXo"},{"type":"inlineMath","value":"k","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"kkk","key":"NnSrEv2zrx"},{"type":"text","value":", we track the sample mean","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"tMpmDeV9ZK"}],"key":"frm42KwRmL"},{"type":"math","value":"\\hat \\mu^k_t = \\frac{1}{N_t^k} \\sum_{\\tau=0}^{t-1} \\ind{a_\\tau = k} r_\\tau","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"tight":true,"html":"μ^tk=1Ntkτ=0t11{aτ=k}rτ\\hat \\mu^k_t = \\frac{1}{N_t^k} \\sum_{\\tau=0}^{t-1} \\ind{a_\\tau = k} r_\\tauμ^tk=Ntk1τ=0t11{aτ=k}rτ","enumerator":"8.2","key":"mC6G79ixum"},{"type":"paragraph","position":{"start":{"line":413,"column":1},"end":{"line":425,"column":1}},"children":[{"type":"text","value":"of all rewards from that arm up to time ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"CoNV0dWATO"},{"type":"inlineMath","value":"t","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"ttt","key":"TSnFgFV018"},{"type":"text","value":".\nThen we construct a ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"lHfK7DKZec"},{"type":"emphasis","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"children":[{"type":"text","value":"confidence interval","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"erLBofuAIP"}],"key":"DfFeoIV7l2"},{"type":"text","value":"","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"aOHm3964mP"}],"key":"RAwaDczIwb"},{"type":"math","value":"C_t^k = [\\hat \\mu^k_t - B_t^k, \\hat \\mu^k_t + B_t^k],","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"tight":true,"html":"Ctk=[μ^tkBtk,μ^tk+Btk],C_t^k = [\\hat \\mu^k_t - B_t^k, \\hat \\mu^k_t + 
B_t^k],Ctk=[μ^tkBtk,μ^tk+Btk],","enumerator":"8.3","key":"Xd2wQ7M6xo"},{"type":"paragraph","position":{"start":{"line":413,"column":1},"end":{"line":425,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"GfUiOdVFfG"},{"type":"inlineMath","value":"B_t^k = \\sqrt{\\frac{\\ln(2 t / \\delta)}{2 N_t^k}}","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"Btk=ln(2t/δ)2NtkB_t^k = \\sqrt{\\frac{\\ln(2 t / \\delta)}{2 N_t^k}}Btk=2Ntkln(2t/δ)","key":"BtuqGnqHhA"},{"type":"text","value":" is given by Hoeffding’s inequality,\nso that with probability at least 1 - ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"JPcn950d1V"},{"type":"text","value":"δ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"t4fdAzHXxi"},{"type":"text","value":" (some fixed parameter we choose),\nthe true mean ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"ayiM9EGduk"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"μk\\mu^kμk","key":"wm33TFIvjD"},{"type":"text","value":" lies within ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"YaL7Z4piEx"},{"type":"inlineMath","value":"C_t^k","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"CtkC_t^kCtk","key":"bKJrbLBf2E"},{"type":"text","value":".\nNote that ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"O1bMRY2HW8"},{"type":"inlineMath","value":"B_t^k","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"BtkB_t^kBtk","key":"BC1U2SnZBp"},{"type":"text","value":" scales like ","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"rnRxpEQG49"},{"type":"inlineMath","value":"\\sqrt{1/N^k_t}","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"html":"1/Ntk\\sqrt{1/N^k_t}1/Ntk","key":"qT9scFyqXQ"},{"type":"text","value":",\ni.e. 
the more we have visited that arm,\nthe more confident we get about it,\nand the narrower the confidence interval.","position":{"start":{"line":413,"column":1},"end":{"line":413,"column":1}},"key":"caRzH6RHQw"}],"key":"RXiLZFQNR6"},{"type":"paragraph","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"children":[{"type":"text","value":"To select an arm, we pick the arm with the highest ","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"hFxM2bqUCt"},{"type":"emphasis","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"children":[{"type":"text","value":"upper confidence bound","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"fTQxYvluQg"}],"key":"to490EGUyi"},{"type":"text","value":".","position":{"start":{"line":427,"column":1},"end":{"line":427,"column":1}},"key":"nNVJr3KASG"}],"key":"AjMvtNc4OP"}],"key":"TMphx9ClWP"},{"type":"paragraph","position":{"start":{"line":430,"column":1},"end":{"line":431,"column":1}},"children":[{"type":"text","value":"This means that, for each edge (corresponding to a state-action pair ","position":{"start":{"line":430,"column":1},"end":{"line":430,"column":1}},"key":"qDhfcrlxqc"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":430,"column":1},"end":{"line":430,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"wS4uSw1GnX"},{"type":"text","value":") in the game tree,\nwe keep track of the statistics required to compute its UCB:","position":{"start":{"line":430,"column":1},"end":{"line":430,"column":1}},"key":"tvSLLWH4Q5"}],"key":"TKQFzF9r5K"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":433,"column":1},"end":{"line":436,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"children":[{"type":"text","value":"How many times it has been “visited” (","position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"key":"J9FJTCw4yO"},{"type":"inlineMath","value":"N_t^{s, a}","position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"html":"Nts,aN_t^{s, a}Nts,a","key":"lqIM3sGGmK"},{"type":"text","value":")","position":{"start":{"line":433,"column":1},"end":{"line":433,"column":1}},"key":"BUorh5sxop"}],"key":"dvtG428r1S"},{"type":"listItem","spread":true,"position":{"start":{"line":434,"column":1},"end":{"line":436,"column":1}},"children":[{"type":"text","value":"How many of those visits resulted in victory (","position":{"start":{"line":434,"column":1},"end":{"line":434,"column":1}},"key":"FKpuMKJot1"},{"type":"inlineMath","value":"\\sum_{\\tau=0}^{t-1} \\ind{(s_\\tau, a_\\tau) = (s, a)} r_\\tau","position":{"start":{"line":434,"column":1},"end":{"line":434,"column":1}},"html":"τ=0t11{(sτ,aτ)=(s,a)}rτ\\sum_{\\tau=0}^{t-1} \\ind{(s_\\tau, a_\\tau) = (s, a)} r_\\tauτ=0t11{(sτ,aτ)=(s,a)}rτ","key":"dHMtnC81BX"},{"type":"text","value":").\nLet us call this latter value ","position":{"start":{"line":434,"column":1},"end":{"line":434,"column":1}},"key":"F9VCi3Hqy3"},{"type":"inlineMath","value":"W^{s, a}_t","position":{"start":{"line":434,"column":1},"end":{"line":434,"column":1}},"html":"Wts,aW^{s, a}_tWts,a","key":"Ix12nUmb8O"},{"type":"text","value":" (for number of 
“wins”).","position":{"start":{"line":434,"column":1},"end":{"line":434,"column":1}},"key":"jhDaA0o12C"}],"key":"keFwTyGX9O"}],"key":"jtLvaKv4Yz"},{"type":"paragraph","position":{"start":{"line":437,"column":1},"end":{"line":444,"column":1}},"children":[{"type":"text","value":"What does ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"UvXNQD5Kqw"},{"type":"inlineMath","value":"t","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"ttt","key":"kcSfHXsgIN"},{"type":"text","value":" refer to in the above expressions?\nRecall ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"AKDLTnPHMB"},{"type":"inlineMath","value":"t","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"ttt","key":"Fa4V9yjtBm"},{"type":"text","value":" refers to the number of time steps elapsed in the ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"AnVzYcQEWA"},{"type":"emphasis","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"children":[{"type":"text","value":"bandit environment","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"Rh3XhLaNKY"}],"key":"A3aAb94gIx"},{"type":"text","value":".\nAs mentioned above,\neach state ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"uTLgA7Rlef"},{"type":"inlineMath","value":"s","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"sss","key":"VtZyBP3Nkw"},{"type":"text","value":" corresponds to its own bandit environment,\nand so ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"oheZd6A5vf"},{"type":"inlineMath","value":"t","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"ttt","key":"SBLbcgjCGQ"},{"type":"text","value":" refers to ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"tcgIAImVRK"},{"type":"inlineMath","value":"N^s","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"NsN^sNs","key":"pCG4xSBDwa"},{"type":"text","value":", that is,\nhow many actions have been taken from state ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"BtBeAQO5cZ"},{"type":"inlineMath","value":"s","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"sss","key":"lGxQOXCazP"},{"type":"text","value":".\nThis term, ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"KGCMOw0tN9"},{"type":"inlineMath","value":"N^s","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"html":"NsN^sNs","key":"O3tFxItxpk"},{"type":"text","value":", gets incremented as the algorithm runs;\nfor simplicity, we won’t introduce another index to track how it changes.","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"W6BM4JBygg"}],"key":"rrgpMT0MEE"},{"type":"proof","kind":"algorithm","label":"mcts-algorithm","identifier":"mcts-algorithm","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Monte Carlo tree search 
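The UCB rule reviewed above is short enough to write out directly. The following is a minimal sketch under our own naming (not the notes' code); it assumes every arm has already been pulled at least once so that the sample means and counts are defined.

```python
import math

def ucb_pick_arm(counts, means, t, delta=0.05):
    """Pick the arm with the highest upper confidence bound, as in (8.2)-(8.3).

    counts[k] -- N_t^k, number of pulls of arm k so far (assumed >= 1)
    means[k]  -- sample mean reward of arm k
    t         -- total number of pulls so far
    delta     -- failure probability in the Hoeffding bound
    """
    def upper_bound(k):
        bonus = math.sqrt(math.log(2 * t / delta) / (2 * counts[k]))  # B_t^k
        return means[k] + bonus

    return max(range(len(counts)), key=upper_bound)
```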
algorithm","position":{"start":{"line":446,"column":1},"end":{"line":446,"column":1}},"key":"PRMfrNr90G"}],"key":"aP9jyTlc7p"},{"type":"paragraph","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"children":[{"type":"text","value":"Inputs:","position":{"start":{"line":449,"column":1},"end":{"line":449,"column":1}},"key":"qK4b64UHCV"}],"key":"LRlN4IvWLo"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":450,"column":1},"end":{"line":453,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":450,"column":1},"end":{"line":450,"column":1}},"children":[{"type":"inlineMath","value":"T","position":{"start":{"line":450,"column":1},"end":{"line":450,"column":1}},"html":"TTT","key":"wJomc7l7pu"},{"type":"text","value":", the number of iterations per move","position":{"start":{"line":450,"column":1},"end":{"line":450,"column":1}},"key":"MZUjinHLrU"}],"key":"u5NBjM8zre"},{"type":"listItem","spread":true,"position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"children":[{"type":"inlineMath","value":"\\pi_{\\text{rollout}}","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"html":"πrollout\\pi_{\\text{rollout}}πrollout","key":"bQkJXmY3bE"},{"type":"text","value":", the ","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"LClaL58Pnz"},{"type":"strong","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"children":[{"type":"text","value":"rollout policy","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"FhRtiydxAA"}],"key":"YZBNHLZpac"},{"type":"text","value":" for randomly sampling games","position":{"start":{"line":451,"column":1},"end":{"line":451,"column":1}},"key":"dl5qHGPxOO"}],"key":"ylbWGRylmW"},{"type":"listItem","spread":true,"position":{"start":{"line":452,"column":1},"end":{"line":453,"column":1}},"children":[{"type":"inlineMath","value":"c","position":{"start":{"line":452,"column":1},"end":{"line":452,"column":1}},"html":"ccc","key":"AYuGWZXqaq"},{"type":"text","value":", a positive value that encourages exploration","position":{"start":{"line":452,"column":1},"end":{"line":452,"column":1}},"key":"V3pZBgjT0Y"}],"key":"n7jeO7Jhd6"}],"key":"dhI7KQbujI"},{"type":"paragraph","position":{"start":{"line":454,"column":1},"end":{"line":458,"column":1}},"children":[{"type":"text","value":"To choose a single move starting at state ","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"key":"Me7DxdSfpL"},{"type":"inlineMath","value":"s_{\\text{start}}","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"html":"sstarts_{\\text{start}}sstart","key":"X8pFWwvtOJ"},{"type":"text","value":",\nMCTS first tries to estimate the UCB values for each of the possible actions ","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"key":"TOnGopHFD8"},{"type":"inlineMath","value":"\\mathcal{A}(s_\\text{start})","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"html":"A(sstart)\\mathcal{A}(s_\\text{start})A(sstart)","key":"zAKWD8gWsZ"},{"type":"text","value":",\nand then chooses the best one.\nTo estimate the UCB values,\nit repeats the following four steps ","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"key":"seN6iOj5BH"},{"type":"inlineMath","value":"T","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"html":"TTT","key":"m8jzxp6jZH"},{"type":"text","value":" 
times:","position":{"start":{"line":454,"column":1},"end":{"line":454,"column":1}},"key":"hrsxRObqDc"}],"key":"YJX7AMC5TF"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":460,"column":1},"end":{"line":477,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":460,"column":1},"end":{"line":468,"column":1}},"children":[{"type":"strong","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"children":[{"type":"text","value":"Selection","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"key":"cdoKPqm9Ld"}],"key":"OlhaKVVBsf"},{"type":"text","value":": We start at ","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"key":"Fomb2EtsMS"},{"type":"inlineMath","value":"s = s_{\\text{start}}","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"html":"s=sstarts = s_{\\text{start}}s=sstart","key":"Ri6nPL3XrC"},{"type":"text","value":". Let ","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"key":"CCFOfI3V7m"},{"type":"text","value":"τ","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"key":"ifotvn5WyN"},{"type":"text","value":" be an empty list that we will use to track states and actions.","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"key":"nTOEVqiypO"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":461,"column":1},"end":{"line":468,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":461,"column":1},"end":{"line":468,"column":1}},"children":[{"type":"text","value":"Until ","position":{"start":{"line":461,"column":1},"end":{"line":461,"column":1}},"key":"VnI32V3jl7"},{"type":"inlineMath","value":"s","position":{"start":{"line":461,"column":1},"end":{"line":461,"column":1}},"html":"sss","key":"QF5Cs7VOux"},{"type":"text","value":" has at least one action that hasn’t been taken:","position":{"start":{"line":461,"column":1},"end":{"line":461,"column":1}},"key":"or4lSqeElp"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":462,"column":1},"end":{"line":468,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":462,"column":1},"end":{"line":466,"column":1}},"children":[{"type":"text","value":"Choose ","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"k2wR72hyIU"},{"type":"inlineMath","value":"a \\gets \\argmax_k \\text{UCB}^{s, k}","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"html":"aarg maxkUCBs,ka \\gets \\argmax_k \\text{UCB}^{s, k}aargmaxkUCBs,k","key":"gvjYz9T6dg"},{"type":"text","value":", where\n","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"lq8LffXgHF"},{"type":"math","value":"\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"identifier":"ucb-tree","label":"ucb-tree","html_id":"ucb-tree","html":"UCBs,a=Ws,aNs+clnNsNs,a\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}UCBs,a=NsWs,a+cNs,alnNs","enumerator":"8.4","key":"wOLRc3XOqD"}],"key":"zioIV3B4RG"},{"type":"listItem","spread":true,"position":{"start":{"line":467,"column":1},"end":{"line":467,"column":1}},"children":[{"type":"text","value":"Append 
","position":{"start":{"line":467,"column":1},"end":{"line":467,"column":1}},"key":"FCvAC5RrZv"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":467,"column":1},"end":{"line":467,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"l0ladSZffT"},{"type":"text","value":" to ","position":{"start":{"line":467,"column":1},"end":{"line":467,"column":1}},"key":"Vb57uoZHPR"},{"type":"text","value":"τ","position":{"start":{"line":467,"column":1},"end":{"line":467,"column":1}},"key":"PeOBhhcy9L"}],"key":"uERO4YggTm"},{"type":"listItem","spread":true,"position":{"start":{"line":468,"column":1},"end":{"line":468,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":468,"column":1},"end":{"line":468,"column":1}},"key":"Eg3ijufVLs"},{"type":"inlineMath","value":"s \\gets P(s, a)","position":{"start":{"line":468,"column":1},"end":{"line":468,"column":1}},"html":"sP(s,a)s \\gets P(s, a)sP(s,a)","key":"dyQstCoX49"}],"key":"xWV5AYuuDT"}],"key":"x7ZjvEyHTE"}],"key":"pQwJrtNK0h"}],"key":"eIZqnABJT8"}],"key":"S0KSyL1MhQ"},{"type":"listItem","spread":true,"position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"children":[{"type":"strong","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"children":[{"type":"text","value":"Expansion","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"xrHvgrP1vt"}],"key":"XE6myypQae"},{"type":"text","value":": Let ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"AVKOP0Ka9H"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"html":"snews_\\text{new}snew","key":"HAGCVLL9xZ"},{"type":"text","value":" denote the final state in ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"nAz9mw7YOx"},{"type":"text","value":"τ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"LhtffDJFFu"},{"type":"text","value":" (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"QLMIiPxWdh"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"html":"snews_\\text{new}snew","key":"FIbNGETyaj"},{"type":"text","value":". Call it ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"u7EceRw6iQ"},{"type":"inlineMath","value":"a_{\\text{new}}","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"html":"anewa_{\\text{new}}anew","key":"F4AOoYPnd8"},{"type":"text","value":". 
Add it to ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"jF1UaKMC8p"},{"type":"text","value":"τ","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"gbb5xnwF4s"},{"type":"text","value":".","position":{"start":{"line":469,"column":1},"end":{"line":469,"column":1}},"key":"wvANId0STJ"}],"key":"B6erRIkNwg"},{"type":"listItem","spread":true,"position":{"start":{"line":470,"column":1},"end":{"line":472,"column":1}},"children":[{"type":"strong","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"children":[{"type":"text","value":"Simulation","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"v13xPb4bSQ"}],"key":"GrHisN7AVk"},{"type":"text","value":": Simulate a complete game episode by starting with the action ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"hwuQHXBH9b"},{"type":"inlineMath","value":"a_{\\text{new}}","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"anewa_{\\text{new}}anew","key":"OTKNC7PgPD"},{"type":"text","value":"\nand then playing according to ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"rIpOL5uh5Z"},{"type":"inlineMath","value":"\\pi_\\text{rollout}","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"πrollout\\pi_\\text{rollout}πrollout","key":"EBr5Qilw8O"},{"type":"text","value":".\nThis results in the outcome ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"t1V2CRnQja"},{"type":"inlineMath","value":"r \\in \\{ +1, -1 \\}","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"r{+1,1}r \\in \\{ +1, -1 \\}r{+1,1}","key":"ADlyC8j51c"},{"type":"text","value":".","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"KvMz82KGrb"}],"key":"Ovo5pzF20W"},{"type":"listItem","spread":true,"position":{"start":{"line":473,"column":1},"end":{"line":477,"column":1}},"children":[{"type":"strong","position":{"start":{"line":473,"column":1},"end":{"line":473,"column":1}},"children":[{"type":"text","value":"Backup","position":{"start":{"line":473,"column":1},"end":{"line":473,"column":1}},"key":"pyBarI5Asy"}],"key":"Xh3m19cz2F"},{"type":"text","value":": For each ","position":{"start":{"line":473,"column":1},"end":{"line":473,"column":1}},"key":"sAeH70sFvV"},{"type":"inlineMath","value":"(s, a) \\in \\tau","position":{"start":{"line":473,"column":1},"end":{"line":473,"column":1}},"html":"(s,a)τ(s, a) \\in \\tau(s,a)τ","key":"m7JWcGOTty"},{"type":"text","value":":","position":{"start":{"line":473,"column":1},"end":{"line":473,"column":1}},"key":"u8t37ikwL0"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":474,"column":1},"end":{"line":477,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":474,"column":1},"end":{"line":474,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":474,"column":1},"end":{"line":474,"column":1}},"key":"F8T9hP3HWw"},{"type":"inlineMath","value":"N^{s, a} \\gets N^{s, a} + 1","position":{"start":{"line":474,"column":1},"end":{"line":474,"column":1}},"html":"Ns,aNs,a+1N^{s, a} \\gets N^{s, a} + 1Ns,aNs,a+1","key":"O1QgqpHfzC"}],"key":"j4qRPCRuQV"},{"type":"listItem","spread":true,"position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"children":[{"type":"inlineMath","value":"W^{s, a} \\gets W^{s, a} + 
r","position":{"start":{"line":475,"column":1},"end":{"line":475,"column":1}},"html":"Ws,aWs,a+rW^{s, a} \\gets W^{s, a} + rWs,aWs,a+r","key":"MvOXCay66m"}],"key":"ZgNH3aQgOn"},{"type":"listItem","spread":true,"position":{"start":{"line":476,"column":1},"end":{"line":477,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":476,"column":1},"end":{"line":476,"column":1}},"key":"v7am60ZzrI"},{"type":"inlineMath","value":"N^s \\gets N^s + 1","position":{"start":{"line":476,"column":1},"end":{"line":476,"column":1}},"html":"NsNs+1N^s \\gets N^s + 1NsNs+1","key":"bABnp5Z6EX"}],"key":"lzbTMeH8vJ"}],"key":"yfqieVNOEV"}],"key":"vfzbIKpuTm"}],"key":"KhdwYhTRf8"},{"type":"paragraph","position":{"start":{"line":478,"column":1},"end":{"line":480,"column":1}},"children":[{"type":"text","value":"After ","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"key":"vXkcqRIv6H"},{"type":"inlineMath","value":"T","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"html":"TTT","key":"pjlGOtR4ZQ"},{"type":"text","value":" repeats of the above,\nwe return the action with the highest UCB value ","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"key":"m6ea0SJPmh"},{"type":"crossReference","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"children":[{"type":"text","value":"(","key":"aLQC2m3c5b"},{"type":"text","value":"8.4","key":"qe55Y3md21"},{"type":"text","value":")","key":"D0c0aqC2aT"}],"identifier":"ucb-tree","label":"ucb-tree","kind":"equation","template":"(%s)","enumerator":"8.4","resolved":true,"html_id":"ucb-tree","key":"RFM8PPd3Z1"},{"type":"text","value":".\nThen play continues.","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"key":"KYzP6bGx59"}],"key":"wbtY1Y2buT"},{"type":"paragraph","position":{"start":{"line":482,"column":1},"end":{"line":483,"column":1}},"children":[{"type":"text","value":"Between turns, we can keep the subtree whose statistics we have visited so far.\nHowever, the rest of the tree for the actions we did ","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"X2NcMHFGxE"},{"type":"emphasis","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"TiGLJAwhty"}],"key":"BtSIYYL99N"},{"type":"text","value":" end up taking gets discarded.","position":{"start":{"line":482,"column":1},"end":{"line":482,"column":1}},"key":"V4I78GH3e5"}],"key":"NctEPWr98D"}],"enumerator":"8.2","html_id":"mcts-algorithm","key":"N0eDpw3XVf"},{"type":"paragraph","position":{"start":{"line":486,"column":1},"end":{"line":487,"column":1}},"children":[{"type":"text","value":"The application which brought the MCTS algorithm to fame was DeepMind’s ","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"key":"zNw6wpVHk6"},{"type":"strong","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"children":[{"type":"text","value":"AlphaGo","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"key":"cy45ezXrjQ"}],"key":"S1SDzS7Lv9"},{"type":"text","value":" ","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"key":"hksqpprIpe"},{"type":"cite","kind":"narrative","label":"silver_mastering_2016","identifier":"silver_mastering_2016","children":[{"type":"text","value":"Silver 
","key":"pfVH4VZHM7"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"ZAOMwJhzdk"}],"key":"naHv5xOjqT"},{"type":"text","value":" (2016)","key":"TdGB1sBL9e"}],"enumerator":"1","key":"Ha6U0MIIEE"},{"type":"text","value":".\nSince then, it has been used in numerous applications ranging from games to automated theorem proving.","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"key":"r01wiOeR7u"}],"key":"wUyusf8Jy8"},{"type":"paragraph","position":{"start":{"line":489,"column":1},"end":{"line":492,"column":1}},"children":[{"type":"text","value":"How accurate is this Monte Carlo estimation?\nIt depends heavily on the rollout policy ","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"key":"FQB4MdDO0x"},{"type":"inlineMath","value":"\\pi_\\text{rollout}","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"html":"πrollout\\pi_\\text{rollout}πrollout","key":"SQQiqsKexO"},{"type":"text","value":".\nIf the distribution ","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"key":"NwXWmgQObU"},{"type":"inlineMath","value":"\\pi_\\text{rollout}","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"html":"πrollout\\pi_\\text{rollout}πrollout","key":"SdnvhHvfSv"},{"type":"text","value":" induces over games is very different from the distribution seen during real gameplay,\nwe might end up with a poor value approximation.","position":{"start":{"line":489,"column":1},"end":{"line":489,"column":1}},"key":"V8SWP7QF4N"}],"key":"MJm8kEaxWZ"},{"type":"heading","depth":3,"position":{"start":{"line":494,"column":1},"end":{"line":494,"column":1}},"children":[{"type":"text","value":"Incorporating value functions and policies","position":{"start":{"line":494,"column":1},"end":{"line":494,"column":1}},"key":"hWSKFZdf7H"}],"identifier":"incorporating-value-functions-and-policies","label":"Incorporating value functions and policies","html_id":"incorporating-value-functions-and-policies","implicit":true,"enumerator":"8.5.1","key":"YKVkYT1GwR"},{"type":"paragraph","position":{"start":{"line":496,"column":1},"end":{"line":498,"column":1}},"children":[{"type":"text","value":"To remedy this,\nwe might make use of a value function ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"U1xJkH7ip5"},{"type":"inlineMath","value":"v : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"html":"v:SRv : \\mathcal{S} \\to \\mathbb{R}v:SR","key":"HJcIzAzzY0"},{"type":"text","value":" that more efficiently approximates the value of a state.\nThen, we can replace the simulation step of ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"XcaIIo73L4"},{"type":"crossReference","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"children":[{"type":"text","value":"MCTS","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"hLuTDjpChe"}],"identifier":"mcts-algorithm","label":"mcts-algorithm","kind":"proof:algorithm","template":"Algorithm %s","enumerator":"8.2","resolved":true,"html_id":"mcts-algorithm","key":"oYsj4bTTob"},{"type":"text","value":" with evaluating ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"l6vxSQG9Pu"},{"type":"inlineMath","value":"r = v(s_\\text{next})","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"html":"r=v(snext)r = 
v(s_\\text{next})r=v(snext)","key":"y73w3deMcl"},{"type":"text","value":", where ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"HUJVK0UiDI"},{"type":"inlineMath","value":"s_\\text{next} = P(s_\\text{new}, a_\\text{new})","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"html":"snext=P(snew,anew)s_\\text{next} = P(s_\\text{new}, a_\\text{new})snext=P(snew,anew)","key":"Okg73y9bze"},{"type":"text","value":".","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"PB4RY0kYU6"}],"key":"WB8iNbYGUh"},{"type":"paragraph","position":{"start":{"line":500,"column":1},"end":{"line":501,"column":1}},"children":[{"type":"text","value":"We might also make use of a ","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"yPbylCIXv7"},{"type":"strong","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"children":[{"type":"text","value":"“guiding” policy","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"FSVga7S9GL"}],"key":"RIGY5h0R0j"},{"type":"text","value":" ","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"KrxOabBsoG"},{"type":"inlineMath","value":"\\pi_\\text{guide} : \\mathcal{S} \\to \\triangle(\\mathcal{A})","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"html":"πguide:S(A)\\pi_\\text{guide} : \\mathcal{S} \\to \\triangle(\\mathcal{A})πguide:S(A)","key":"zD2PVxIzIL"},{"type":"text","value":" that provides “intuition” as to which actions are more valuable in a given state.\nWe can scale the exploration term of ","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"K0o5jRvgu5"},{"type":"crossReference","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"children":[{"type":"text","value":"(","key":"wauaRVlwzU"},{"type":"text","value":"8.4","key":"ndCXERut6O"},{"type":"text","value":")","key":"ARYbCYgQAV"}],"identifier":"ucb-tree","label":"ucb-tree","kind":"equation","template":"(%s)","enumerator":"8.4","resolved":true,"html_id":"ucb-tree","key":"Wb6BcbJDdT"},{"type":"text","value":" according to the policy’s outputs.","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"oTI8YfXzSv"}],"key":"LJhjyuiyRE"},{"type":"paragraph","position":{"start":{"line":503,"column":1},"end":{"line":504,"column":1}},"children":[{"type":"text","value":"Putting these together,\nwe can describe an updated version of MCTS that makes use of these value functions and policy:","position":{"start":{"line":503,"column":1},"end":{"line":503,"column":1}},"key":"WGgq3SuQyV"}],"key":"EcH9kDeWir"},{"type":"proof","kind":"algorithm","label":"mcts-policy-value","identifier":"mcts-policy-value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Monte Carlo tree search with policy and value 
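As a hedged sketch of these two modifications (our own function names, not the notes' implementation): the leaf evaluation below replaces the random rollout with a single call to v, and the score below scales the exploration bonus of (8.4) by the guiding policy's probability, anticipating the modified UCB rule given in the algorithm that follows.

```python
import math

def evaluate_leaf(env, v, s_new, a_new):
    """Replace the simulation step: r = v(s_next), where s_next = P(s_new, a_new)."""
    return v(env.step(s_new, a_new))

def ucb_with_guide(W_sa, N_s, N_sa, pi_guide, s, a, c=1.0):
    """UCB score whose exploration term is weighted by pi_guide(a | s)."""
    explore = c * pi_guide(a, s) * math.sqrt(math.log(N_s[s]) / N_sa[s, a])
    return W_sa[s, a] / N_s[s] + explore
```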
functions","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"ZKmpUr42AK"}],"key":"uaMsJGlBl8"},{"type":"paragraph","position":{"start":{"line":509,"column":1},"end":{"line":509,"column":1}},"children":[{"type":"text","value":"Inputs:","position":{"start":{"line":509,"column":1},"end":{"line":509,"column":1}},"key":"kFKCSGKkSE"}],"key":"x8EKAR5aef"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":510,"column":1},"end":{"line":514,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"children":[{"type":"inlineMath","value":"T","position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"html":"TTT","key":"hCdfdHoNn0"},{"type":"text","value":", the number of iterations per move","position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"key":"UR4WQwYmZc"}],"key":"jA9bexQzJS"},{"type":"listItem","spread":true,"position":{"start":{"line":511,"column":1},"end":{"line":511,"column":1}},"children":[{"type":"inlineMath","value":"v","position":{"start":{"line":511,"column":1},"end":{"line":511,"column":1}},"html":"vvv","key":"G8fmaifaY6"},{"type":"text","value":", a value function that evaluates how good a state is","position":{"start":{"line":511,"column":1},"end":{"line":511,"column":1}},"key":"jWcSyic7u6"}],"key":"sEVgCR9nNB"},{"type":"listItem","spread":true,"position":{"start":{"line":512,"column":1},"end":{"line":512,"column":1}},"children":[{"type":"inlineMath","value":"\\pi_\\text{guide}","position":{"start":{"line":512,"column":1},"end":{"line":512,"column":1}},"html":"πguide\\pi_\\text{guide}πguide","key":"AFfjkf3cjp"},{"type":"text","value":", a guiding policy that encourages certain actions","position":{"start":{"line":512,"column":1},"end":{"line":512,"column":1}},"key":"xFxkdY5rBQ"}],"key":"mQ7ldmOEZc"},{"type":"listItem","spread":true,"position":{"start":{"line":513,"column":1},"end":{"line":514,"column":1}},"children":[{"type":"inlineMath","value":"c","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"html":"ccc","key":"rkkYJx0Tu6"},{"type":"text","value":", a positive value that encourages exploration","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"UBZHR5oKft"}],"key":"u9Z64kEYHG"}],"key":"XvWICFvETA"},{"type":"paragraph","position":{"start":{"line":515,"column":1},"end":{"line":515,"column":1}},"children":[{"type":"text","value":"To select a move in state ","position":{"start":{"line":515,"column":1},"end":{"line":515,"column":1}},"key":"gTo3mgm9vD"},{"type":"inlineMath","value":"s_\\text{start}","position":{"start":{"line":515,"column":1},"end":{"line":515,"column":1}},"html":"sstarts_\\text{start}sstart","key":"Kf9UzXDygY"},{"type":"text","value":", we repeat the following four steps ","position":{"start":{"line":515,"column":1},"end":{"line":515,"column":1}},"key":"iHdMoKIGBU"},{"type":"inlineMath","value":"T","position":{"start":{"line":515,"column":1},"end":{"line":515,"column":1}},"html":"TTT","key":"MH90S5NZ0P"},{"type":"text","value":" 
times:","position":{"start":{"line":515,"column":1},"end":{"line":515,"column":1}},"key":"o2FgKBQ5H2"}],"key":"YnJ0SACc5r"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":517,"column":1},"end":{"line":532,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":517,"column":1},"end":{"line":525,"column":1}},"children":[{"type":"strong","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"children":[{"type":"text","value":"Selection","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"gZz3Z98KYk"}],"key":"h9mK7QTPnw"},{"type":"text","value":": We start at ","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"ObPGSLQdxz"},{"type":"inlineMath","value":"s = s_{\\text{start}}","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"html":"s=sstarts = s_{\\text{start}}s=sstart","key":"xRSQQNe0Cx"},{"type":"text","value":". Let ","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"GhdZrg5uTm"},{"type":"text","value":"τ","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"GAVUy3oTXr"},{"type":"text","value":" be an empty list that we will use to track states and actions.","position":{"start":{"line":517,"column":1},"end":{"line":517,"column":1}},"key":"XzLCXsZro5"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":518,"column":1},"end":{"line":525,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":518,"column":1},"end":{"line":525,"column":1}},"children":[{"type":"text","value":"Until ","position":{"start":{"line":518,"column":1},"end":{"line":518,"column":1}},"key":"jcmyLy692g"},{"type":"inlineMath","value":"s","position":{"start":{"line":518,"column":1},"end":{"line":518,"column":1}},"html":"sss","key":"UJVMtiEZno"},{"type":"text","value":" has at least one action that hasn’t been taken:","position":{"start":{"line":518,"column":1},"end":{"line":518,"column":1}},"key":"bCDzsuiKpk"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":519,"column":1},"end":{"line":525,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":519,"column":1},"end":{"line":523,"column":1}},"children":[{"type":"text","value":"Choose ","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"key":"c5XdlHooTA"},{"type":"inlineMath","value":"a \\gets \\argmax_k \\text{UCB}^{s, k}","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"html":"aarg maxkUCBs,ka \\gets \\argmax_k \\text{UCB}^{s, k}aargmaxkUCBs,k","key":"cIaEjKyQWU"},{"type":"text","value":", where\n","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"key":"IspG3ZHtc6"},{"type":"math","value":"\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\cdot \\pi_\\text{guide}(a \\mid s) \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"identifier":"ucb-tree-policy","label":"ucb-tree-policy","html_id":"ucb-tree-policy","html":"UCBs,a=Ws,aNs+cπguide(as)lnNsNs,a\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\cdot \\pi_\\text{guide}(a \\mid s) \\sqrt{\\frac{\\ln N^s}{N^{s, 
a}}}UCBs,a=NsWs,a+cπguide(as)Ns,alnNs","enumerator":"8.5","key":"v3Tkz2TZMV"}],"key":"uoLHn3p6r7"},{"type":"listItem","spread":true,"position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"text","value":"Append ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"zspRtxrW9y"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"vJeNsUV6ZZ"},{"type":"text","value":" to ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"oALyn113az"},{"type":"text","value":"τ","position":{"start":{"line":524,"column":1},"end":{"line":524,"column":1}},"key":"rweaNG2emz"}],"key":"bjpWm5WecN"},{"type":"listItem","spread":true,"position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"DoVDFWqDjZ"},{"type":"inlineMath","value":"s \\gets P(s, a)","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"html":"sP(s,a)s \\gets P(s, a)sP(s,a)","key":"yQG0ehWNhE"}],"key":"g9sOGe8emN"}],"key":"ZKdaeJASCt"}],"key":"L9fMnQ9Kn5"}],"key":"qKdPlUAdxh"}],"key":"xw666wBOoV"},{"type":"listItem","spread":true,"position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"children":[{"type":"strong","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"children":[{"type":"text","value":"Expansion","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"wirBJxeD1i"}],"key":"PlgOETq1Cf"},{"type":"text","value":": Let ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"MD9UvNiCU5"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"snews_\\text{new}snew","key":"lrrSvgywll"},{"type":"text","value":" denote the final state in ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"fKipzc8feI"},{"type":"text","value":"τ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"TcszPR3XUu"},{"type":"text","value":" (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"BG4H9akVFc"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"snews_\\text{new}snew","key":"tV8lPhSPK5"},{"type":"text","value":". Call it ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"V0438uWAOj"},{"type":"inlineMath","value":"a_{\\text{new}}","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"html":"anewa_{\\text{new}}anew","key":"F11h3oid3P"},{"type":"text","value":". 
Add it to ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"Uqbhz1MatA"},{"type":"text","value":"τ","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"M739mx6tlx"},{"type":"text","value":".","position":{"start":{"line":526,"column":1},"end":{"line":526,"column":1}},"key":"paUpCLLbQA"}],"key":"c5GXMA7e4e"},{"type":"listItem","spread":true,"position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"children":[{"type":"strong","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"children":[{"type":"text","value":"Simulation","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"yO1Ev16Bqc"}],"key":"DyjSmsOr5f"},{"type":"text","value":": Let ","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"Az0ElW0EoV"},{"type":"inlineMath","value":"s_\\text{next} = P(s_\\text{new}, a_\\text{new})","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"html":"snext=P(snew,anew)s_\\text{next} = P(s_\\text{new}, a_\\text{new})snext=P(snew,anew)","key":"bqoxyrwztE"},{"type":"text","value":". Evaluate ","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"FS1qgfTWLK"},{"type":"inlineMath","value":"r = v(s_\\text{next})","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"html":"r=v(snext)r = v(s_\\text{next})r=v(snext)","key":"BNuKAFAwgc"},{"type":"text","value":". This approximates the value of the game after taking the action ","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"X9XETW2cPV"},{"type":"inlineMath","value":"a_\\text{new}","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"html":"anewa_\\text{new}anew","key":"Ui676HHOpr"},{"type":"text","value":".","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"ANInzzTmdp"}],"key":"ZEipVCr0RI"},{"type":"listItem","spread":true,"position":{"start":{"line":528,"column":1},"end":{"line":532,"column":1}},"children":[{"type":"strong","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"children":[{"type":"text","value":"Backup","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"BVoKAWEuZL"}],"key":"ZlM4KnVcNG"},{"type":"text","value":": For each ","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"TeGLFDuN6F"},{"type":"inlineMath","value":"(s, a) \\in \\tau","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"html":"(s,a)τ(s, a) \\in \\tau(s,a)τ","key":"Eju5AOR1bX"},{"type":"text","value":":","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"R3ry7oM83Z"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":529,"column":1},"end":{"line":532,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":529,"column":1},"end":{"line":529,"column":1}},"children":[{"type":"inlineMath","value":"N^{s, a} \\gets N^{s, a} + 1","position":{"start":{"line":529,"column":1},"end":{"line":529,"column":1}},"html":"Ns,aNs,a+1N^{s, a} \\gets N^{s, a} + 1Ns,aNs,a+1","key":"yBx27BdnJ4"}],"key":"S1F6S8OHgC"},{"type":"listItem","spread":true,"position":{"start":{"line":530,"column":1},"end":{"line":530,"column":1}},"children":[{"type":"inlineMath","value":"W^{s, a} \\gets W^{s, a} + r","position":{"start":{"line":530,"column":1},"end":{"line":530,"column":1}},"html":"Ws,aWs,a+rW^{s, a} 
\\gets W^{s, a} + rWs,aWs,a+r","key":"nAmZgxrepP"}],"key":"yTtTOmcMjL"},{"type":"listItem","spread":true,"position":{"start":{"line":531,"column":1},"end":{"line":532,"column":1}},"children":[{"type":"inlineMath","value":"N^s \\gets N^s + 1","position":{"start":{"line":531,"column":1},"end":{"line":531,"column":1}},"html":"NsNs+1N^s \\gets N^s + 1NsNs+1","key":"Y43sbpifQA"}],"key":"ZtE2XbRmnw"}],"key":"TS5ZUGQxlr"}],"key":"hkKOUrjA4Y"}],"key":"A0ye08IZ2Q"},{"type":"paragraph","position":{"start":{"line":533,"column":1},"end":{"line":534,"column":1}},"children":[{"type":"text","value":"We finally return the action with the highest UCB value ","position":{"start":{"line":533,"column":1},"end":{"line":533,"column":1}},"key":"j0RtsNYB0L"},{"type":"crossReference","position":{"start":{"line":533,"column":1},"end":{"line":533,"column":1}},"children":[{"type":"text","value":"(","key":"UCfRIYiJoF"},{"type":"text","value":"8.5","key":"Y0ITQrGww5"},{"type":"text","value":")","key":"dxKuTLKLjn"}],"identifier":"ucb-tree-policy","label":"ucb-tree-policy","kind":"equation","template":"(%s)","enumerator":"8.5","resolved":true,"html_id":"ucb-tree-policy","key":"zN5iKUWvYS"},{"type":"text","value":".\nThen play continues. As before, we can reuse the tree across timesteps.","position":{"start":{"line":533,"column":1},"end":{"line":533,"column":1}},"key":"yIqFHl3X5J"}],"key":"CPlAEltDkb"}],"enumerator":"8.3","html_id":"mcts-policy-value","key":"Jr33vZOmtQ"},{"type":"paragraph","position":{"start":{"line":537,"column":1},"end":{"line":543,"column":1}},"children":[{"type":"text","value":"How do we actually compute a useful ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"J01BILRqLA"},{"type":"inlineMath","value":"\\pi_\\text{guide}","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"πguide\\pi_\\text{guide}πguide","key":"Vlc9Dl018X"},{"type":"text","value":" and ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"onhsJEJdO5"},{"type":"inlineMath","value":"v","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"vvv","key":"pUTCA1fhph"},{"type":"text","value":"?\nIf we have some existing dataset of trajectories,\nwe could use ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"WyYwLqsvq8"},{"type":"link","url":"/imitation-learning","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"gXnwvu8gVA"}],"urlSource":"./imitation_learning.md","dataUrl":"/imitation-learning.json","internal":true,"protocol":"file","key":"onBcmL4AWS"},{"type":"text","value":" (that is, imitation learning)\nto generate a policy ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"d9BhSGhveT"},{"type":"inlineMath","value":"\\pi_\\text{guide}","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"πguide\\pi_\\text{guide}πguide","key":"uXG3FEr9qe"},{"type":"text","value":" via behavioral cloning\nand learn ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"JpPow6oQXg"},{"type":"inlineMath","value":"v","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"vvv","key":"FKvomL8u1T"},{"type":"text","value":" by regressing the game outcomes onto states.\nThen, plugging these into 
","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"zR3DhjpB91"},{"type":"crossReference","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"children":[{"type":"text","value":"the above algorithm","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"HEzDuXgBGC"}],"identifier":"mcts-policy-value","label":"mcts-policy-value","kind":"proof:algorithm","template":"Algorithm %s","enumerator":"8.3","resolved":true,"html_id":"mcts-policy-value","key":"mCY8k2AN1F"},{"type":"text","value":"\nresults in a stronger policy by using tree search to “think ahead”.","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"UKW736mLnH"}],"key":"eEKkvhurbu"},{"type":"paragraph","position":{"start":{"line":545,"column":1},"end":{"line":546,"column":1}},"children":[{"type":"text","value":"But we don’t have to stop at just one improvement step;\nwe could iterate this process via ","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"MBIEYET2Tg"},{"type":"strong","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"children":[{"type":"text","value":"self-play","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"dOBZy4VWks"}],"key":"AzrlwR8WJP"},{"type":"text","value":".","position":{"start":{"line":545,"column":1},"end":{"line":545,"column":1}},"key":"sSiVv3unmq"}],"key":"xpkZI2osZx"},{"type":"heading","depth":3,"position":{"start":{"line":548,"column":1},"end":{"line":548,"column":1}},"children":[{"type":"text","value":"Self-play","position":{"start":{"line":548,"column":1},"end":{"line":548,"column":1}},"key":"cEKnQoBMJH"}],"identifier":"self-play","label":"Self-play","html_id":"self-play","implicit":true,"enumerator":"8.5.2","key":"svHQMoADmw"},{"type":"paragraph","position":{"start":{"line":550,"column":1},"end":{"line":560,"column":1}},"children":[{"type":"text","value":"Recall the ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"ej9iHm9sBo"},{"type":"crossReference","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"policy iteration","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"temliDoAGv"}],"identifier":"policy_iteration","label":"policy_iteration","kind":"heading","template":"Section %s","enumerator":"1.5.3.2","resolved":true,"html_id":"policy-iteration","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"Me55OFMB5t"},{"type":"text","value":" algorithm from the ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"iTWKD8Wc6u"},{"type":"link","url":"/mdps","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"MDPs","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"cycS6fT2h4"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"xwIdU3EQQ8"},{"type":"text","value":" chapter.\nPolicy iteration alternates between ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"ZPMudJjVf3"},{"type":"strong","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"policy evaluation","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"eFAKPCtScv"}],"key":"ZzYhWf9KVf"},{"type":"text","value":" (taking 
","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"U2oL1iLExM"},{"type":"text","value":"π","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"nHEyckiiuv"},{"type":"text","value":" and computing ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"NoHR37CPQx"},{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"VπV^\\piVπ","key":"zK97l0p618"},{"type":"text","value":")\nand ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"sCfI8kC3NH"},{"type":"strong","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"policy improvement","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"I0ouASvtlB"}],"key":"jogRvxFsjf"},{"type":"text","value":" (setting ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"dV9BdLHwuL"},{"type":"text","value":"π","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"Vw0NAnBIXz"},{"type":"text","value":" to be greedy with respect to ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"zQO8D1hbNn"},{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"VπV^\\piVπ","key":"yLZoS0kIvt"},{"type":"text","value":").\nAbove, we saw how MCTS can be thought of as a “policy improvement” operation:\nfor a given policy ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"LUe6bKC7zy"},{"type":"inlineMath","value":"\\pi^0","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"π0\\pi^0π0","key":"BoFJN2GC0G"},{"type":"text","value":",\nwe can use it to guide MCTS,\nresulting in an algorithm that is itself a policy ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"ZMUPGW7Xzb"},{"type":"inlineMath","value":"\\pi^0_\\text{MCTS}","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"πMCTS0\\pi^0_\\text{MCTS}πMCTS0","key":"SXhzcuwQG1"},{"type":"text","value":" that maps from states to actions.\nNow, we can use ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"hvvFcw4gcw"},{"type":"link","url":"/imitation-learning","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"children":[{"type":"text","value":"behavioral cloning","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"EsxZ4EZUGs"}],"urlSource":"./imitation_learning.md","dataUrl":"/imitation-learning.json","internal":true,"protocol":"file","key":"LE2E1unlJK"},{"type":"text","value":"\nto obtain a new policy ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"dVYjdU71QR"},{"type":"inlineMath","value":"\\pi^1","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"π1\\pi^1π1","key":"CDTY4FY0fR"},{"type":"text","value":" that imitates ","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"cSYZmIXzV1"},{"type":"inlineMath","value":"\\pi^0_\\text{MCTS}","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"πMCTS0\\pi^0_\\text{MCTS}πMCTS0","key":"wELvyU38zW"},{"type":"text","value":".\nWe can now use 
","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"lEmUelSRzH"},{"type":"inlineMath","value":"\\pi^1","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"html":"π1\\pi^1π1","key":"aWfM3TQny2"},{"type":"text","value":" to guide MCTS,\nand repeat.","position":{"start":{"line":550,"column":1},"end":{"line":550,"column":1}},"key":"XOy6bTUlbp"}],"key":"BnuqnXAMoB"},{"type":"proof","kind":"algorithm","label":"mcts-self-play","identifier":"mcts-self-play","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"MCTS with self-play","position":{"start":{"line":562,"column":1},"end":{"line":562,"column":1}},"key":"MUAukC9X4Q"}],"key":"RTRoBS2RWa"},{"type":"paragraph","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"children":[{"type":"text","value":"Input:","position":{"start":{"line":565,"column":1},"end":{"line":565,"column":1}},"key":"nno8UhdiCh"}],"key":"AjDa1t2ItM"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":567,"column":1},"end":{"line":571,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":567,"column":1},"end":{"line":567,"column":1}},"children":[{"type":"text","value":"A parameterized policy class ","position":{"start":{"line":567,"column":1},"end":{"line":567,"column":1}},"key":"Cr1yMgYlbv"},{"type":"inlineMath","value":"\\pi_\\theta : \\mathcal{S} \\to \\triangle(\\mathcal{A})","position":{"start":{"line":567,"column":1},"end":{"line":567,"column":1}},"html":"πθ:S(A)\\pi_\\theta : \\mathcal{S} \\to \\triangle(\\mathcal{A})πθ:S(A)","key":"fQqVSu8CjP"}],"key":"mv08EMrVMr"},{"type":"listItem","spread":true,"position":{"start":{"line":568,"column":1},"end":{"line":568,"column":1}},"children":[{"type":"text","value":"A parameterized value function class ","position":{"start":{"line":568,"column":1},"end":{"line":568,"column":1}},"key":"G9oMpG89OF"},{"type":"inlineMath","value":"v_\\lambda : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":568,"column":1},"end":{"line":568,"column":1}},"html":"vλ:SRv_\\lambda : \\mathcal{S} \\to \\mathbb{R}vλ:SR","key":"HC1weQ2fQb"}],"key":"PfV0MiRJ4h"},{"type":"listItem","spread":true,"position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"children":[{"type":"text","value":"A number of trajectories ","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"key":"ttZnRpy68x"},{"type":"inlineMath","value":"M","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"html":"MMM","key":"JeFBH2IEDv"},{"type":"text","value":" to generate","position":{"start":{"line":569,"column":1},"end":{"line":569,"column":1}},"key":"av6aLnkwRS"}],"key":"x54EPykCvn"},{"type":"listItem","spread":true,"position":{"start":{"line":570,"column":1},"end":{"line":571,"column":1}},"children":[{"type":"text","value":"The initial parameters ","position":{"start":{"line":570,"column":1},"end":{"line":570,"column":1}},"key":"TkZgfT3Rpf"},{"type":"inlineMath","value":"\\theta^0, \\lambda^0","position":{"start":{"line":570,"column":1},"end":{"line":570,"column":1}},"html":"θ0,λ0\\theta^0, \\lambda^0θ0,λ0","key":"gcHkaWzVLd"}],"key":"r8YRA1NzCo"}],"key":"t3jXGgMeCH"},{"type":"paragraph","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"children":[{"type":"text","value":"For ","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"key":"U8TfxVXeRJ"},{"type":"inlineMath","value":"t = 0, \\dots, 
T-1","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"html":"t=0,,T1t = 0, \\dots, T-1t=0,,T1","key":"Y0B1LvWIjC"},{"type":"text","value":":","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"key":"oB1xRqofIA"}],"key":"mo7gDVle7c"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":574,"column":1},"end":{"line":580,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"children":[{"type":"strong","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"children":[{"type":"text","value":"Policy improvement","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"uwPKukFE55"}],"key":"srAGk0QZyG"},{"type":"text","value":": Let ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"zkSDGv82wc"},{"type":"inlineMath","value":"\\pi^t_\\text{MCTS}","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"πMCTSt\\pi^t_\\text{MCTS}πMCTSt","key":"DwCiM7NPHC"},{"type":"text","value":" denote the policy obtained by ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"A82FkqIDuv"},{"type":"crossReference","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"children":[{"type":"text","value":"Algorithm ","key":"ZPAZU9azIi"},{"type":"text","value":"8.3","key":"QdRqX0d4Tk"}],"identifier":"mcts-policy-value","label":"mcts-policy-value","kind":"proof:algorithm","template":"Algorithm %s","enumerator":"8.3","resolved":true,"html_id":"mcts-policy-value","key":"TGhm0sLryg"},{"type":"text","value":" with ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"IEUhwt0998"},{"type":"inlineMath","value":"\\pi_{\\theta^t}","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"πθt\\pi_{\\theta^t}πθt","key":"MFdCaNZ2VX"},{"type":"text","value":" and ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"dVI6o4kdgX"},{"type":"inlineMath","value":"v_{\\lambda^t}","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"vλtv_{\\lambda^t}vλt","key":"fzIMmsGCVN"},{"type":"text","value":". We use ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"dFLXKDLn82"},{"type":"inlineMath","value":"\\pi^t_\\text{MCTS}","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"πMCTSt\\pi^t_\\text{MCTS}πMCTSt","key":"grX7RLIL2a"},{"type":"text","value":" to play against itself ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"azIHeRjgvH"},{"type":"inlineMath","value":"M","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"MMM","key":"i7miUF36EF"},{"type":"text","value":" times. 
This generates ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"NB3GLN2GIB"},{"type":"inlineMath","value":"M","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"MMM","key":"LsgKTyIrVJ"},{"type":"text","value":" trajectories ","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"RDK7k5Uqfz"},{"type":"inlineMath","value":"\\tau_0, \\dots, \\tau_{M-1}","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"html":"τ0,,τM1\\tau_0, \\dots, \\tau_{M-1}τ0,,τM1","key":"X6de0kt4MO"},{"type":"text","value":".","position":{"start":{"line":574,"column":1},"end":{"line":574,"column":1}},"key":"aIP5fmAtOA"}],"key":"BhCXJtqUQr"},{"type":"listItem","spread":true,"position":{"start":{"line":575,"column":1},"end":{"line":580,"column":1}},"children":[{"type":"strong","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"children":[{"type":"text","value":"Policy evaluation","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"bH0HOSQaR0"}],"key":"wy7H995at5"},{"type":"text","value":": Use behavioral cloning to find a set of policy parameters ","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"VhewlV3JwW"},{"type":"inlineMath","value":"\\theta^{t+1}","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"html":"θt+1\\theta^{t+1}θt+1","key":"vBfb6qUcG8"},{"type":"text","value":" that mimic the behavior of ","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"VvfjsTx7Hh"},{"type":"inlineMath","value":"\\pi^t_\\text{MCTS}","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"html":"πMCTSt\\pi^t_\\text{MCTS}πMCTSt","key":"dWovUIb2hQ"},{"type":"text","value":" and a set of value function parameters ","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"Gkpf4Tl83K"},{"type":"inlineMath","value":"\\lambda^{t+1}","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"html":"λt+1\\lambda^{t+1}λt+1","key":"VPUpoorFY3"},{"type":"text","value":" that approximate its value function. 
That is,","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"NplBq6cMEx"},{"type":"math","tight":"before","value":"\\begin{align*}\n \\theta^{t+1} &\\gets \\argmin_\\theta \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} - \\log \\pi_\\theta(a^m_\\hi \\mid s^m_\\hi) \\\\\n \\lambda^{t+1} &\\gets \\argmin_\\lambda \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} (v_\\lambda(s^m_\\hi) - R(\\tau_m))^2\n \\end{align*}","position":{"start":{"line":576,"column":1},"end":{"line":578,"column":1}},"html":"θt+1arg minθm=0M1h=0H1logπθ(ahmshm)λt+1arg minλm=0M1h=0H1(vλ(shm)R(τm))2\\begin{align*}\n \\theta^{t+1} &\\gets \\argmin_\\theta \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} - \\log \\pi_\\theta(a^m_\\hi \\mid s^m_\\hi) \\\\\n \\lambda^{t+1} &\\gets \\argmin_\\lambda \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} (v_\\lambda(s^m_\\hi) - R(\\tau_m))^2\n \\end{align*}θt+1λt+1θargminm=0M1h=0H1logπθ(ahmshm)λargminm=0M1h=0H1(vλ(shm)R(τm))2","enumerator":"8.6","key":"ks1T07oR2m"}],"key":"B8AdADI31M"}],"key":"rTKaQ4aFsT"},{"type":"paragraph","position":{"start":{"line":581,"column":1},"end":{"line":584,"column":1}},"children":[{"type":"text","value":"Note that in implementation,\nthe policy and value are typically both returned by a single deep neural network,\nthat is, with a single set of parameters,\nand the two loss functions are added together.","position":{"start":{"line":581,"column":1},"end":{"line":581,"column":1}},"key":"msSzeO93S9"}],"key":"aQwe599tTF"}],"enumerator":"8.4","html_id":"mcts-self-play","key":"mHfQoaZP3z"},{"type":"paragraph","position":{"start":{"line":587,"column":1},"end":{"line":587,"column":1}},"children":[{"type":"text","value":"This algorithm was brought to fame by AlphaGo Zero ","position":{"start":{"line":587,"column":1},"end":{"line":587,"column":1}},"key":"SQBMALrhKE"},{"type":"cite","kind":"narrative","label":"silver_mastering_2017","identifier":"silver_mastering_2017","children":[{"type":"text","value":"Silver ","key":"TnSXFUsHpz"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"RelYGHTDbV"}],"key":"WpGgLPksyx"},{"type":"text","value":" (2017)","key":"GbfmEKOdBB"}],"enumerator":"2","key":"uk9fxVA6yJ"},{"type":"text","value":".","position":{"start":{"line":587,"column":1},"end":{"line":587,"column":1}},"key":"kc9PBnuzJ5"}],"key":"hYXX5yy6SY"},{"type":"heading","depth":2,"position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":589,"column":1},"end":{"line":589,"column":1}},"key":"iRkG9rMCY2"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"8.6","key":"cxC9ZHTdpP"},{"type":"paragraph","position":{"start":{"line":591,"column":1},"end":{"line":598,"column":1}},"children":[{"type":"text","value":"In this chapter,\nwe explored tree search-based algorithms for deterministic, zero sum, fully observable two-player games.\nWe began with ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"RJeS3klJBa"},{"type":"crossReference","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"min-max search","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"zPhFP7jsxh"}],"identifier":"min-max-search","label":"min-max-search","kind":"heading","template":"Section %s","enumerator":"8.3","resolved":true,"html_id":"min-max-search","key":"JyKSAmx81t"},{"type":"text","value":",\nan algorithm for exactly solving the game value of 
every possible state.\nHowever, this is impossible to execute in practice,\nand so we must resort to various ways to reduce the number of states and actions that we must explore.\n","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"M2h2crDeTf"},{"type":"crossReference","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"Alpha-beta search","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"ttyznV8CrH"}],"identifier":"alpha-beta-search","label":"alpha-beta-search","kind":"heading","template":"Section %s","enumerator":"8.4","resolved":true,"html_id":"alpha-beta-search","key":"MFYA2YzgRp"},{"type":"text","value":" does this by ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"kkTXlev6oM"},{"type":"emphasis","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"pruning","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"wqSD0zp3eu"}],"key":"fLDv0SCc0B"},{"type":"text","value":" away states that we already know to be suboptimal,\nand ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"uewAUbnJGQ"},{"type":"crossReference","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"kvmDqMNMqT"}],"identifier":"monte-carlo-tree-search","label":"monte-carlo-tree-search","kind":"heading","template":"Section %s","enumerator":"8.5","resolved":true,"html_id":"monte-carlo-tree-search","key":"GBTL7rnVmW"},{"type":"text","value":" ","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"hXDkfz1Suj"},{"type":"emphasis","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"children":[{"type":"text","value":"approximates","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"JhphtQKiAC"}],"key":"JijyY9szTK"},{"type":"text","value":" the value of states instead of evaluating them exactly.","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"JHOhHySA2I"}],"key":"WMEpeRCtql"},{"type":"heading","depth":2,"position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"children":[{"type":"text","value":"References","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"eyXZR5i7C5"}],"identifier":"references","label":"References","html_id":"references","implicit":true,"enumerator":"8.7","key":"cd565GADLB"},{"type":"paragraph","position":{"start":{"line":603,"column":1},"end":{"line":611,"column":1}},"children":[{"type":"text","value":"Chapter 5 of ","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"JOdi0dL7Z1"},{"type":"cite","kind":"narrative","label":"russell_artificial_2021","identifier":"russell_artificial_2021","children":[{"type":"text","value":"Russell & Norvig (2021)","key":"QqdI9UQpln"}],"enumerator":"3","key":"KhO9rVRneN"},{"type":"text","value":" provides an excellent overview of search methods in games.\nThe original AlphaGo paper ","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"lnkToUpv9f"},{"type":"cite","kind":"narrative","label":"silver_mastering_2016","identifier":"silver_mastering_2016","children":[{"type":"text","value":"Silver 
","key":"s6ylqlPXUQ"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"YV1Rh9avTW"}],"key":"fa9IYWavPn"},{"type":"text","value":" (2016)","key":"hHr7tYln1S"}],"enumerator":"1","key":"xWns1upxMt"},{"type":"text","value":" was a groundbreaking application of these technologies.\n","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"b4EP2372qn"},{"type":"cite","kind":"narrative","label":"silver_mastering_2017","identifier":"silver_mastering_2017","children":[{"type":"text","value":"Silver ","key":"Yhjtycn6HM"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"YKu8sxhPN3"}],"key":"YbBQkGvCVO"},{"type":"text","value":" (2017)","key":"leCyxm6yTm"}],"enumerator":"2","key":"Kz6MJosUsf"},{"type":"text","value":" removed the imitation learning phase,\nlearning from scratch.\nAlphaZero ","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"KUdRAuN8fC"},{"type":"cite","kind":"narrative","label":"silver_general_2018","identifier":"silver_general_2018","children":[{"type":"text","value":"Silver ","key":"d3uG58KdmC"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"tViykXVPCG"}],"key":"texB8hbaec"},{"type":"text","value":" (2018)","key":"I8SdQCQ0fv"}],"enumerator":"4","key":"KYAA2Ip2QT"},{"type":"text","value":" then extended to other games beyond Go,\nnamely shogi and chess,\nalso learning from scratch.\nIn MuZero ","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"PnJxt0YUrp"},{"type":"cite","kind":"narrative","label":"schrittwieser_mastering_2020","identifier":"schrittwieser_mastering_2020","children":[{"type":"text","value":"Schrittwieser ","key":"ZeIvEkzXRu"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"zYTOvoWD4y"}],"key":"FvdadOSeqO"},{"type":"text","value":" (2020)","key":"TSFSIImeHh"}],"enumerator":"5","key":"kriGRNbRPk"},{"type":"text","value":",\nthis was further extended by learning a model of the game dynamics.","position":{"start":{"line":603,"column":1},"end":{"line":603,"column":1}},"key":"o4yL9RSzpI"}],"key":"iqoLfZkaLA"}],"key":"ZSlU4QZdZa"}],"key":"FqxwqZe177"},"references":{"cite":{"order":["silver_mastering_2016","silver_mastering_2017","russell_artificial_2021","silver_general_2018","schrittwieser_mastering_2020"],"data":{"silver_mastering_2016":{"label":"silver_mastering_2016","enumerator":"1","doi":"10.1038/nature16961","html":"Silver, D., Huang, A., Maddison, C. J., Guez, A., Sifre, L., van den Driessche, G., Schrittwieser, J., Antonoglou, I., Panneershelvam, V., Lanctot, M., Dieleman, S., Grewe, D., Nham, J., Kalchbrenner, N., Sutskever, I., Lillicrap, T., Leach, M., Kavukcuoglu, K., Graepel, T., & Hassabis, D. (2016). Mastering the Game of Go with Deep Neural Networks and Tree Search. Nature, 529(7587), 484–489. 10.1038/nature16961","url":"https://doi.org/10.1038/nature16961"},"silver_mastering_2017":{"label":"silver_mastering_2017","enumerator":"2","doi":"10.1038/nature24270","html":"Silver, D., Schrittwieser, J., Simonyan, K., Antonoglou, I., Huang, A., Guez, A., Hubert, T., Baker, L., Lai, M., Bolton, A., Chen, Y., Lillicrap, T., Hui, F., Sifre, L., van den Driessche, G., Graepel, T., & Hassabis, D. (2017). Mastering the Game of Go without Human Knowledge. Nature, 550(7676), 354–359. 10.1038/nature24270","url":"https://doi.org/10.1038/nature24270"},"russell_artificial_2021":{"label":"russell_artificial_2021","enumerator":"3","html":"Russell, S. J., & Norvig, P. (2021). 
Artificial Intelligence: A Modern Approach (Fourth edition). Pearson."},"silver_general_2018":{"label":"silver_general_2018","enumerator":"4","doi":"10.1126/science.aar6404","html":"Silver, D., Hubert, T., Schrittwieser, J., Antonoglou, I., Lai, M., Guez, A., Lanctot, M., Sifre, L., Kumaran, D., Graepel, T., Lillicrap, T., Simonyan, K., & Hassabis, D. (2018). A General Reinforcement Learning Algorithm That Masters Chess, Shogi, and Go through Self-Play. Science, 362(6419), 1140–1144. 10.1126/science.aar6404","url":"https://doi.org/10.1126/science.aar6404"},"schrittwieser_mastering_2020":{"label":"schrittwieser_mastering_2020","enumerator":"5","doi":"10.1038/s41586-020-03051-4","html":"Schrittwieser, J., Antonoglou, I., Hubert, T., Simonyan, K., Sifre, L., Schmitt, S., Guez, A., Lockhart, E., Hassabis, D., Graepel, T., Lillicrap, T., & Silver, D. (2020). Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model. Nature, 588(7839), 604–609. 10.1038/s41586-020-03051-4","url":"https://doi.org/10.1038/s41586-020-03051-4"}}}},"footer":{"navigation":{"prev":{"title":"7 Imitation Learning","url":"/imitation-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"9 Exploration in MDPs","url":"/exploration","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"a369bd1f8010f4fa8c1455e357896c0a07167e488f24f0a7a86087dac58b06e5","slug":"planning","location":"/planning.md","dependencies":[],"frontmatter":{"title":"8 Tree Search Methods","numbering":{"all":{"enabled":true},"enumerator":{"template":"8.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"thumbnail":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.png","thumbnailOptimized":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.webp","exports":[{"format":"md","filename":"planning.md","url":"/build/planning-887f75403e2b948135692cad33515828.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"MzeGNfkCft"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"8.1","key":"O5ajfHVOwS"},{"type":"paragraph","position":{"start":{"line":22,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"Have you ever lost a strategy game against a skilled opponent?\nIt probably seemed like they were ahead of you at every turn.\nThey might have been 
","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"VxzfzEpgno"},{"type":"emphasis","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"planning ahead","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"I37fRvwy4G"}],"key":"FxfukSvdZS"},{"type":"text","value":" and anticipating your actions,\nthen planning around them in order to win.\nIf this opponent was a computer,\nthey might have been using one of the strategies that we are about to explore.","position":{"start":{"line":22,"column":1},"end":{"line":22,"column":1}},"key":"zYvmEUnloG"}],"key":"cwqMluOj2R"},{"type":"heading","depth":2,"position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"Deterministic, zero sum, fully observable two-player games","position":{"start":{"line":29,"column":1},"end":{"line":29,"column":1}},"key":"Gr22e978Mq"}],"identifier":"deterministic-zero-sum-fully-observable-two-player-games","label":"Deterministic, zero sum, fully observable two-player games","html_id":"deterministic-zero-sum-fully-observable-two-player-games","implicit":true,"enumerator":"8.2","key":"aVun3rzCMz"},{"type":"paragraph","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"children":[{"type":"text","value":"In this chapter, we will focus on games that are:","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"dndHPDstAD"}],"key":"tAwHoJl0Ja"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":33,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"children":[{"type":"text","value":"deterministic,","position":{"start":{"line":33,"column":1},"end":{"line":33,"column":1}},"key":"AcyVcX7NxK"}],"key":"NMfhYt021q"}],"key":"uwTXmpJ46I"},{"type":"listItem","spread":true,"position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"children":[{"type":"text","value":"zero sum","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"IfwvwgG0SP"}],"key":"Spf8cqyiDK"},{"type":"text","value":" (one player wins and the other loses),","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"z2x9jVpWYh"}],"key":"NiUeRo1Ak0"},{"type":"listItem","spread":true,"position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"emphasis","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"text","value":"fully observable,","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"WNOFi1vRqY"}],"key":"NCPm7Jrqrn"},{"type":"text","value":" that is, the state of the game is perfectly known by both players,","position":{"start":{"line":35,"column":1},"end":{"line":35,"column":1}},"key":"J4dL5dVrD9"}],"key":"kIv3hjnNIU"},{"type":"listItem","spread":true,"position":{"start":{"line":36,"column":1},"end":{"line":37,"column":1}},"children":[{"type":"text","value":"for 
","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"aNENuucmT4"},{"type":"emphasis","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"children":[{"type":"text","value":"two players","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"Oybk6H85Sr"}],"key":"afQmsPe7AO"},{"type":"text","value":" that alternate turns,","position":{"start":{"line":36,"column":1},"end":{"line":36,"column":1}},"key":"MWPWpBKsM6"}],"key":"Tw5msxR8MU"}],"key":"hQcYXTK8bz"},{"type":"paragraph","position":{"start":{"line":38,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"We can represent such a game as a ","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"uy5bGyZL5w"},{"type":"emphasis","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"children":[{"type":"text","value":"complete game tree.","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"STTrywc1Iz"}],"key":"G7ME7lb5fn"},{"type":"text","value":"\nEach possible state is a node in the tree,\nand since we only consider deterministic games,\nwe can represent actions as edges leading from the current state to the next.\nEach path through the tree, from root to leaf, represents a single game.","position":{"start":{"line":38,"column":1},"end":{"line":38,"column":1}},"key":"AyPX2Qsmub"}],"key":"If2dEjq9jG"},{"type":"container","kind":"figure","children":[{"type":"image","url":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.png","alt":"The first two layers of the complete game tree of tic-tac-toe.\nFrom Wikimedia.","align":"center","data":{"altTextIsAutoGenerated":true},"key":"KfnZMyoeCT","urlSource":"shared/tic_tac_toe.png","urlOptimized":"/build/tic_tac_toe-a6b4190582d91cb90a4dd4ea91b55ed0.webp"},{"type":"caption","children":[{"type":"paragraph","position":{"start":{"line":47,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"The first two layers of the complete game tree of tic-tac-toe.\nFrom Wikimedia.","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"tM8jxpjlBP"}],"key":"daDn3fjq65"}],"key":"QXaUnFfHFS"}],"enumerator":"8.1","key":"msoMtcoUSo"},{"type":"paragraph","position":{"start":{"line":51,"column":1},"end":{"line":56,"column":1}},"children":[{"type":"text","value":"If you could store the complete game tree on a computer,\nyou would be able to win every potentially winnable game\nby searching all paths from your current state and taking a winning move.\nWe will see an explicit algorithm for this in ","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"f2gHrFHspC"},{"type":"crossReference","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"children":[{"type":"text","value":"the next section","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"mVhMogP85x"}],"identifier":"min-max-search","label":"min-max-search","kind":"heading","template":"Section %s","enumerator":"8.3","resolved":true,"html_id":"min-max-search","key":"bsB9U9tApA"},{"type":"text","value":".\nHowever, as games become more complex,\nit becomes computationally impossible to search every possible path.","position":{"start":{"line":51,"column":1},"end":{"line":51,"column":1}},"key":"bnUORM7xh2"}],"key":"BPUbc0V7b8"},{"type":"paragraph","position":{"start":{"line":58,"column":1},"end":{"line":66,"column":1}},"children":[{"type":"text","value":"For instance,\na chess player 
has roughly 30 actions to choose from at each turn,\nand each game takes roughly 40 moves per player,\nso trying to solve chess exactly using minimax\nwould take somewhere on the order of ","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"key":"lN5Ch6O2cQ"},{"type":"inlineMath","value":"30^{80} \\approx 10^{118}","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"html":"30801011830^{80} \\approx 10^{118}308010118","key":"WNlqXYWtLs"},{"type":"text","value":" operations.\nThat’s 10 billion billion billion billion billion billion billion billion billion billion billion billion billion operations.\nAs of the time of writing,\nthe fastest processor can achieve almost 10 GHz (10 billion operations per second),\nso to fully solve chess using minimax is many, many orders of magnitude out of reach.","position":{"start":{"line":58,"column":1},"end":{"line":58,"column":1}},"key":"JsQbRcovhY"}],"key":"zYDlWcIEIc"},{"type":"paragraph","position":{"start":{"line":68,"column":1},"end":{"line":74,"column":1}},"children":[{"type":"text","value":"It is thus intractable, in any realistic setting, to solve the complete game tree exactly.\nLuckily, only a small fraction of those games ever occur in reality;\nLater in this chapter,\nwe will explore ways to ","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"Q6G6oIR7cg"},{"type":"emphasis","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"prune away","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"SMx2ErIYfU"}],"key":"lxIh3pElpg"},{"type":"text","value":" parts of the tree that we know we can safely ignore.\nWe can also ","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"BrmKfZM2pr"},{"type":"emphasis","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"GujhVYGrSb"}],"key":"LaonbYgAEE"},{"type":"text","value":" the value of a state without fully evaluating it.\nUsing these approximations, we can no longer ","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"gAILNvg4c3"},{"type":"emphasis","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"children":[{"type":"text","value":"guarantee","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"KpmCulS8RO"}],"key":"wcoGNY5lYl"},{"type":"text","value":" winning the game,\nbut we can come up with strategies that will do well against most opponents.","position":{"start":{"line":68,"column":1},"end":{"line":68,"column":1}},"key":"EO75stT43N"}],"key":"OHhdh2uPeo"},{"type":"heading","depth":3,"position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"children":[{"type":"text","value":"Notation","position":{"start":{"line":76,"column":1},"end":{"line":76,"column":1}},"key":"gmpRPC36bg"}],"identifier":"notation","label":"Notation","html_id":"notation","implicit":true,"enumerator":"8.2.1","key":"bCXxiJV59B"},{"type":"paragraph","position":{"start":{"line":78,"column":1},"end":{"line":81,"column":1}},"children":[{"type":"text","value":"Let us now describe these games formally.\nWe’ll call the first player Max and the second player Min.\nMax seeks to maximize the final game score,\nwhile Min seeks to minimize the final game 
score.","position":{"start":{"line":78,"column":1},"end":{"line":78,"column":1}},"key":"KC5ArHLos8"}],"key":"gQ7Lgkfxf1"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":83,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"children":[{"type":"text","value":"We’ll use ","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"JweOoK4GHP"},{"type":"inlineMath","value":"\\mathcal{S}","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"html":"S\\mathcal{S}S","key":"M66VCvJ8hX"},{"type":"text","value":" to denote the set of all possible game states.","position":{"start":{"line":83,"column":1},"end":{"line":83,"column":1}},"key":"qUbccbl0H4"}],"key":"kpExEioswb"},{"type":"listItem","spread":true,"position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"text","value":"The game begins in some ","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"TPeTthJOsr"},{"type":"strong","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"text","value":"initial state","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"ULZEMhi9hp"}],"key":"O6vdNCy0An"},{"type":"text","value":" ","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"uoBZ5ZcNE9"},{"type":"inlineMath","value":"s_0 \\in \\mathcal{S}","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"html":"s0Ss_0 \\in \\mathcal{S}s0S","key":"F7VTra5r7f"},{"type":"text","value":".","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"pp6IHKbmIp"}],"key":"Sr5Ukdf9Yk"},{"type":"listItem","spread":true,"position":{"start":{"line":85,"column":1},"end":{"line":87,"column":1}},"children":[{"type":"text","value":"Max moves on even turn numbers ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"rjXSMwyVbY"},{"type":"inlineMath","value":"h = 2n","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"h=2nh = 2nh=2n","key":"fmPafNfnKS"},{"type":"text","value":",\nand Min moves on odd turn numbers ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"TodweiFhnB"},{"type":"inlineMath","value":"h = 2n+1","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"h=2n+1h = 2n+1h=2n+1","key":"hTQcuOfxAx"},{"type":"text","value":",\nwhere ","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"VF3nXEcCST"},{"type":"inlineMath","value":"n","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"html":"nnn","key":"IQyIfyAQKa"},{"type":"text","value":" is a natural number.","position":{"start":{"line":85,"column":1},"end":{"line":85,"column":1}},"key":"K4gAzok3If"}],"key":"dq2tyx0oXV"},{"type":"listItem","spread":true,"position":{"start":{"line":88,"column":1},"end":{"line":90,"column":1}},"children":[{"type":"text","value":"The space of possible actions, ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"jogpAEWdJ3"},{"type":"inlineMath","value":"\\mathcal{A}_h(s)","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"html":"Ah(s)\\mathcal{A}_h(s)Ah(s)","key":"lVZEI2QS61"},{"type":"text","value":",\ndepends on the state itself, as well as whose turn it is.\n(For example, in tic-tac-toe, Max can only 
play ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"XDyuVKArsb"},{"type":"inlineCode","value":"X","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"dWKVjVvraG"},{"type":"text","value":"s while Min can only play ","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"il0V9NHqa0"},{"type":"inlineCode","value":"O","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"vuL5JasaXY"},{"type":"text","value":"s.)","position":{"start":{"line":88,"column":1},"end":{"line":88,"column":1}},"key":"M5EPL5Fcqw"}],"key":"WfBMGDNBaI"},{"type":"listItem","spread":true,"position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"children":[{"type":"text","value":"The game ends after ","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"VJCm3TTjCY"},{"type":"inlineMath","value":"H","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"html":"HHH","key":"lZXSqxuoCk"},{"type":"text","value":" total moves (which might be even or odd). We call the final state a ","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"amLSlrOthx"},{"type":"strong","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"children":[{"type":"text","value":"terminal state","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"Zz2rubSqDV"}],"key":"w4vg19UhF9"},{"type":"text","value":".","position":{"start":{"line":91,"column":1},"end":{"line":91,"column":1}},"key":"KvejJMGUHk"}],"key":"k8eHCMvEPq"},{"type":"listItem","spread":true,"position":{"start":{"line":92,"column":1},"end":{"line":93,"column":1}},"children":[{"type":"inlineMath","value":"P","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"PPP","key":"wPhCGngM3h"},{"type":"text","value":" denotes the ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"mXoV6Y2WcN"},{"type":"strong","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"state transitions","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"Z4dLABNFUV"}],"key":"gnmiivSUpb"},{"type":"text","value":", that is,\n","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"o7bQQ0DI6A"},{"type":"inlineMath","value":"P(s, a)","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"P(s,a)P(s, a)P(s,a)","key":"S0ECvBqGMk"},{"type":"text","value":" denotes the resulting state when taking action ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"SheD5gEbhV"},{"type":"inlineMath","value":"a \\in \\mathcal{A}(s)","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"aA(s)a \\in \\mathcal{A}(s)aA(s)","key":"mfpOd8tEdV"},{"type":"text","value":" in state ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"e6vpnEyMqH"},{"type":"inlineMath","value":"s","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"sss","key":"S5rnPeCsMx"},{"type":"text","value":". We’ll assume that this function is time-homogeneous (a.k.a. 
stationary) and doesn’t change across timesteps.","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"oqmJhtx6Ci"}],"key":"wUxCHtACtx"},{"type":"listItem","spread":true,"position":{"start":{"line":94,"column":1},"end":{"line":97,"column":1}},"children":[{"type":"inlineMath","value":"r(s)","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"r(s)r(s)r(s)","key":"c02XVZiH03"},{"type":"text","value":" denotes the ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"wPieZr1Rjp"},{"type":"strong","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"game score","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"NX1thuJoFp"}],"key":"F7FSStD1xQ"},{"type":"text","value":" of the terminal state ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"lxpY5oTCIJ"},{"type":"inlineMath","value":"s","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"html":"sss","key":"IBv3MCNMJj"},{"type":"text","value":".\nNote that this is some positive or negative value seen by both players:\nA positive value indicates Max winning, a negative value indicates Min winning, and a value of ","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"wjmp2RpnJu"},{"type":"text","value":"0","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"TLFPRxiLBO"},{"type":"text","value":" indicates a tie.","position":{"start":{"line":94,"column":1},"end":{"line":94,"column":1}},"key":"s54nv1kNdm"}],"key":"YTAdVZF23b"}],"key":"hMxU6P48Jh"},{"type":"paragraph","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"children":[{"type":"text","value":"We also call the sequence of states and actions a ","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"key":"NybyJzwy9h"},{"type":"strong","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"children":[{"type":"text","value":"trajectory","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"key":"Q9622pXyNy"}],"key":"GuaUpQxXVu"},{"type":"text","value":".","position":{"start":{"line":98,"column":1},"end":{"line":98,"column":1}},"key":"vfxplyiEJ2"}],"key":"p7t6KkHkXo"},{"type":"admonition","kind":"attention","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Attention","key":"lPCbWOHYrh"}],"key":"GtHpvwXkmi"},{"type":"paragraph","position":{"start":{"line":101,"column":1},"end":{"line":103,"column":1}},"children":[{"type":"text","value":"Above, we suppose that the game ends after ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"TSokWFXagt"},{"type":"inlineMath","value":"H","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"html":"HHH","key":"cK7R4nrlhW"},{"type":"text","value":" total moves.\nBut most real games have a ","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"U1DXx3ITWO"},{"type":"emphasis","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"children":[{"type":"text","value":"variable","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"x6fefsi5D4"}],"key":"clJz1VB8De"},{"type":"text","value":" length.\nHow would you describe 
this?","position":{"start":{"line":101,"column":1},"end":{"line":101,"column":1}},"key":"xIbtjNj5SV"}],"key":"cc280AbRD0"}],"key":"KjBLvpq4YB"},{"type":"proof","kind":"example","label":"tic-tac-toe","identifier":"tic-tac-toe","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Tic-tac-toe","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"ETEdaAOEw2"}],"key":"ewPvADmYG9"},{"type":"paragraph","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"children":[{"type":"text","value":"Let us frame tic-tac-toe in this setting.","position":{"start":{"line":109,"column":1},"end":{"line":109,"column":1}},"key":"L90wBuSRQK"}],"key":"IB6xldJkHe"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":111,"column":1},"end":{"line":119,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":111,"column":1},"end":{"line":113,"column":1}},"children":[{"type":"text","value":"Each of the ","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"BJWYhvHMgM"},{"type":"text","value":"9","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"Exp2HpZHfT"},{"type":"text","value":" squares is either empty, marked X, or marked O.\nSo there are ","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"liOszqQzQO"},{"type":"inlineMath","value":"|\\mathcal{S}| = 3^9","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"html":"S=39|\\mathcal{S}| = 3^9S=39","key":"l9YkDkG3un"},{"type":"text","value":" potential states.\nNot all of these may be reachable!","position":{"start":{"line":111,"column":1},"end":{"line":111,"column":1}},"key":"ru43pLpBin"}],"key":"JDP811cPgz"},{"type":"listItem","spread":true,"position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"children":[{"type":"text","value":"The initial state ","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"MAdOG7lsFt"},{"type":"inlineMath","value":"s_0","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"html":"s0s_0s0","key":"AcuNtdaqzr"},{"type":"text","value":" is the empty board.","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"ZAnCOls4gn"}],"key":"J9u7S9rTWc"},{"type":"listItem","spread":true,"position":{"start":{"line":115,"column":1},"end":{"line":116,"column":1}},"children":[{"type":"text","value":"The set of possible actions for Max in state ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"NIpB8IJ1dJ"},{"type":"inlineMath","value":"s","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"sss","key":"A0jNYxnAW7"},{"type":"text","value":", ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"Cy06bng0iN"},{"type":"inlineMath","value":"\\mathcal{A}_{2n}(s)","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"A2n(s)\\mathcal{A}_{2n}(s)A2n(s)","key":"eMJ0qiZO8M"},{"type":"text","value":", is the set of tuples ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"StX1anWSS6"},{"type":"inlineMath","value":"(\\text{``X''}, i)","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"(“X”,i)(\\text{``X''}, i)(“X”,i)","key":"v5xm2R1yno"},{"type":"text","value":" where 
","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"el6upekBGs"},{"type":"inlineMath","value":"i","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"iii","key":"z4h2tZcu99"},{"type":"text","value":" refers to an empty square in ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"haiF3bW02P"},{"type":"inlineMath","value":"s","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"sss","key":"HXzgIwi3dR"},{"type":"text","value":".\nSimilarly, ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"tjuMVYrymu"},{"type":"inlineMath","value":"\\mathcal{A}_{2n+1}(s)","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"A2n+1(s)\\mathcal{A}_{2n+1}(s)A2n+1(s)","key":"P7k9YptDrp"},{"type":"text","value":" is the set of tuples ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"y5KpUgDUpy"},{"type":"inlineMath","value":"(\\text{``O''}, i)","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"(“O”,i)(\\text{``O''}, i)(“O”,i)","key":"MCNDdHKNQR"},{"type":"text","value":" where ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"pwtMiyi6MB"},{"type":"inlineMath","value":"i","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"iii","key":"Gm2nE80JSD"},{"type":"text","value":" refers to an empty square in ","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"WCHf4TOqWu"},{"type":"inlineMath","value":"s","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"html":"sss","key":"l2jYxixd73"},{"type":"text","value":".","position":{"start":{"line":115,"column":1},"end":{"line":115,"column":1}},"key":"WwQBwXPAS5"}],"key":"FZeTWqEeek"},{"type":"listItem","spread":true,"position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"children":[{"type":"text","value":"We can take ","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"buIJGgHGtg"},{"type":"inlineMath","value":"H = 9","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"html":"H=9H = 9H=9","key":"xwELfLQUpa"},{"type":"text","value":" as the longest possible game length.","position":{"start":{"line":117,"column":1},"end":{"line":117,"column":1}},"key":"dBHAKmOj6o"}],"key":"NkrSKjmRzm"},{"type":"listItem","spread":true,"position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"inlineMath","value":"P(s, a)","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"html":"P(s,a)P(s, a)P(s,a)","key":"uWX3kPFuRk"},{"type":"text","value":" for a ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"XtV7zjOpt9"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"nonterminal","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"nzrQ6uRP85"}],"key":"QZjPyRoBtT"},{"type":"text","value":" state ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"AvYI9wr5dT"},{"type":"inlineMath","value":"s","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"html":"sss","key":"lSPc08doLB"},{"type":"text","value":" is simply the board with the symbol and square specified by 
","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"mlpaQZtnbd"},{"type":"inlineMath","value":"a","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"html":"aaa","key":"n6eYTARvqL"},{"type":"text","value":" marked into ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"Pk1VGTmY9f"},{"type":"inlineMath","value":"s","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"html":"sss","key":"s7uyGUcmkp"},{"type":"text","value":". Otherwise, if ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"EFyK4HITCt"},{"type":"inlineMath","value":"s","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"html":"sss","key":"TTvWcXUz1q"},{"type":"text","value":" is a ","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"PYA4y4q8tf"},{"type":"emphasis","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"terminal","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"IPUC9F3KJc"}],"key":"QdatfV1NhY"},{"type":"text","value":" state, i.e. it already has three symbols in a row, the state no longer changes.","position":{"start":{"line":118,"column":1},"end":{"line":118,"column":1}},"key":"lRmqf82Gdf"}],"key":"ER8tggc6Qr"},{"type":"listItem","spread":true,"position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"children":[{"type":"inlineMath","value":"r(s)","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"html":"r(s)r(s)r(s)","key":"Tt5tbhWq60"},{"type":"text","value":" at a ","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"key":"LoNIi4hCvs"},{"type":"emphasis","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"children":[{"type":"text","value":"terminal","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"key":"Ee08iwQa61"}],"key":"oJafZZUofw"},{"type":"text","value":" state is ","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"key":"oACRwhw7N7"},{"type":"text","value":"+1","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"key":"SgpTN4Cfwd"},{"type":"text","value":" if there are three Xs in a row, ","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"key":"mU18c8hV4o"},{"type":"text","value":"-1","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"key":"tMLjr6Z95w"},{"type":"text","value":" if there are three Os in a row, and ","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"key":"ZDMZHMoiOT"},{"type":"text","value":"0","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"key":"HDMTbf1KMS"},{"type":"text","value":" otherwise.","position":{"start":{"line":119,"column":1},"end":{"line":119,"column":1}},"key":"VZEmRWVr16"}],"key":"wybQ0Ug5A9"}],"key":"Ck6CUXQuet"}],"enumerator":"8.1","html_id":"tic-tac-toe","key":"wT7AUqxy37"},{"type":"paragraph","position":{"start":{"line":122,"column":1},"end":{"line":129,"column":1}},"children":[{"type":"text","value":"Our notation may remind you of ","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"LuDiSMbpwo"},{"type":"link","url":"/mdps","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"children":[{"type":"text","value":"Markov decision 
processes","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"Y2asgVs1LO"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"nBqJgGqQmI"},{"type":"text","value":".\nGiven that these games also involve a sequence of states and actions,\ncan we formulate them as finite-horizon MDPs?\nThe two settings are not exactly analogous,\nsince in MDPs we only consider a ","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"SpjoixYaKS"},{"type":"emphasis","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"children":[{"type":"text","value":"single","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"DW8fM3aEUR"}],"key":"RCQHmMitgY"},{"type":"text","value":" policy,\nwhile these games involve two distinct players with opposite objectives.\nSince we want to analyze the behavior of ","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"YttNSfENel"},{"type":"emphasis","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"children":[{"type":"text","value":"both","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"uiRTtYuptk"}],"key":"XNKAUG1dmw"},{"type":"text","value":" players at the same time,\ndescribing such a game as an MDP is more trouble than it’s worth.","position":{"start":{"line":122,"column":1},"end":{"line":122,"column":1}},"key":"pddzG9tuo5"}],"key":"yj0yfsYRwt"},{"type":"heading","depth":2,"position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"children":[{"type":"text","value":"Min-max search *","position":{"start":{"line":132,"column":1},"end":{"line":132,"column":1}},"key":"xxGRNwt0Hg"}],"label":"min-max-search","identifier":"min-max-search","html_id":"min-max-search","enumerator":"8.3","key":"WJoxaJNzZ6"},{"type":"admonition","kind":"important","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Important","key":"VKoNECtgGZ"}],"key":"xftDxaFKfR"},{"type":"paragraph","position":{"start":{"line":135,"column":1},"end":{"line":136,"column":1}},"children":[{"type":"text","value":"The course (Fall 2024) does not cover min-max search.\nThis content is here to provide background on ","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"sO6JHnqFB1"},{"type":"emphasis","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"children":[{"type":"text","value":"optimally","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"sWwv5cW44z"}],"key":"T9qXECDwwh"},{"type":"text","value":" solving these deterministic, zero-sum, two-player games.","position":{"start":{"line":135,"column":1},"end":{"line":135,"column":1}},"key":"Tg89KNGsCv"}],"key":"mYQ6mPnoe8"}],"key":"l3fd3X85ac"},{"type":"paragraph","position":{"start":{"line":139,"column":1},"end":{"line":143,"column":1}},"children":[{"type":"text","value":"In the introduction,\nwe claimed that we could win any potentially winnable game by looking ahead and predicting the opponent’s actions.\nThis would mean that each ","position":{"start":{"line":139,"column":1},"end":{"line":139,"column":1}},"key":"F6ShL4p2ab"},{"type":"emphasis","position":{"start":{"line":139,"column":1},"end":{"line":139,"column":1}},"children":[{"type":"text","value":"nonterminal","position":{"start":{"line":139,"column":1},"end":{"line":139,"column":1}},"key":"LrVBvCOkOA"}],"key":"WQrIp1Zwxa"},{"type":"text","value":" state already has 
some predetermined game score,\nthat is, in each state,\nit is already “obvious” which player is going to win.","position":{"start":{"line":139,"column":1},"end":{"line":139,"column":1}},"key":"NmhTExd6cp"}],"key":"dAw9T03wLd"},{"type":"paragraph","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"children":[{"type":"text","value":"Let ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"uLTTzRtR26"},{"type":"inlineMath","value":"V_\\hi^\\star(s)","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"Vh(s)V_\\hi^\\star(s)Vh(s)","key":"e5uyDyPBpS"},{"type":"text","value":" denote the game score under optimal play from both players starting in state ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"t745WpjKO9"},{"type":"inlineMath","value":"s","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"sss","key":"umpPvtEC3z"},{"type":"text","value":" at time ","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"bzfqnjRNiP"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"html":"h\\hih","key":"LD1FuJFMIv"},{"type":"text","value":".","position":{"start":{"line":145,"column":1},"end":{"line":145,"column":1}},"key":"m6ssdHAEQw"}],"key":"HUkS4zbToX"},{"type":"proof","kind":"definition","label":"min-max-value","identifier":"min-max-value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Min-max search algorithm","position":{"start":{"line":147,"column":1},"end":{"line":147,"column":1}},"key":"VcjXwkfHSS"}],"key":"NeAMps3aT8"},{"type":"math","value":"V_\\hi^{\\star}(s) = \\begin{cases}\nr(s) & \\hi = \\hor \\\\\n\\max_{a \\in \\mathcal{A}_\\hi(s)} V_{\\hi+1}^{\\star}(P(s, a)) & \\hi \\text{ is even and } \\hi < H \\\\\n\\min_{a \\in \\mathcal{A}_\\hi(s)} V_{\\hi+1}^{\\star}(P(s, a)) & \\hi \\text{ is odd and } \\hi < H \\\\\n\\end{cases}","position":{"start":{"line":150,"column":1},"end":{"line":156,"column":1}},"html":"Vh(s)={r(s)h=HmaxaAh(s)Vh+1(P(s,a))h is even and h<HminaAh(s)Vh+1(P(s,a))h is odd and h<HV_\\hi^{\\star}(s) = \\begin{cases}\nr(s) & \\hi = \\hor \\\\\n\\max_{a \\in \\mathcal{A}_\\hi(s)} V_{\\hi+1}^{\\star}(P(s, a)) & \\hi \\text{ is even and } \\hi < H \\\\\n\\min_{a \\in \\mathcal{A}_\\hi(s)} V_{\\hi+1}^{\\star}(P(s, a)) & \\hi \\text{ is odd and } \\hi < H \\\\\n\\end{cases}Vh(s)=r(s)maxaAh(s)Vh+1(P(s,a))minaAh(s)Vh+1(P(s,a))h=Hh is even and h<Hh is odd and h<H","enumerator":"8.1","key":"i05NynM9VD"}],"enumerator":"8.1","html_id":"min-max-value","key":"XkFlNdhm1H"},{"type":"paragraph","position":{"start":{"line":159,"column":1},"end":{"line":163,"column":1}},"children":[{"type":"text","value":"We can compute this by starting at the terminal states,\nwhen the game’s outcome is known,\nand working backwards,\nassuming that Max chooses the action that leads to the highest score\nand Min chooses the action that leads to the lowest score.","position":{"start":{"line":159,"column":1},"end":{"line":159,"column":1}},"key":"Oh3lEDrgyW"}],"key":"YdAnG4wZHu"},{"type":"paragraph","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"children":[{"type":"text","value":"This translates directly into a recursive depth-first search algorithm for searching the complete game 
tree.","position":{"start":{"line":165,"column":1},"end":{"line":165,"column":1}},"key":"XkERNEBvkQ"}],"key":"Z4NafMMTWA"},{"type":"code","lang":"python","value":"def minimax_search(s, player) -> Tuple[\"Action\", \"Value\"]:\n \"\"\"Return the value of the state (for Max) and the best action for Max to take.\"\"\"\n if env.is_terminal(s):\n return None, env.winner(s)\n\n if player is max:\n a_max, v_max = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), min)\n if v > v_max:\n a_max, v_max = a, v\n return a_max, v_max\n else:\n a_min, v_min = None, None\n for a in actions:\n _, v = minimax_search(env.step(s, a), max)\n if v < v_min:\n a_min, v_min = a, v\n return a_min, v_min","position":{"start":{"line":167,"column":1},"end":{"line":187,"column":1}},"key":"iGWG6KzcE6"},{"type":"proof","kind":"example","label":"min-max-example","identifier":"min-max-example","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Min-max search for a simple game","position":{"start":{"line":189,"column":1},"end":{"line":189,"column":1}},"key":"Gm2PDdjpfh"}],"key":"dt1GMwI5UX"},{"type":"paragraph","position":{"start":{"line":192,"column":1},"end":{"line":195,"column":1}},"children":[{"type":"text","value":"Consider a simple game with just two steps: Max chooses one of three possible actions (A, B, C),\nand then Min chooses one of three possible actions (D, E, F).\nThe combination leads to a certain integer outcome,\nshown in the table below:","position":{"start":{"line":192,"column":1},"end":{"line":192,"column":1}},"key":"fU6CWb6tjT"}],"key":"dnUGFVDYgU"},{"type":"table","position":{"start":{"line":197,"column":1},"end":{"line":201,"column":1}},"children":[{"type":"tableRow","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"children":[{"type":"tableCell","header":true,"position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"children":[],"key":"AVu3C3Njqz"},{"type":"tableCell","header":true,"position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"children":[{"type":"text","value":"D","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"key":"vsTviXXDQQ"}],"key":"b7wBHPX4rW"},{"type":"tableCell","header":true,"position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"children":[{"type":"text","value":"E","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"key":"lVIWybVhfl"}],"key":"igRlUkXmF1"},{"type":"tableCell","header":true,"position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"children":[{"type":"text","value":"F","position":{"start":{"line":197,"column":1},"end":{"line":197,"column":1}},"key":"VGTfwXe1bN"}],"key":"Tupj20fjT0"}],"key":"eCdFlPvhQP"},{"type":"tableRow","position":{"start":{"line":199,"column":1},"end":{"line":199,"column":1}},"children":[{"type":"tableCell","position":{"start":{"line":199,"column":1},"end":{"line":199,"column":1}},"children":[{"type":"text","value":"A","position":{"start":{"line":199,"column":1},"end":{"line":199,"column":1}},"key":"HYiqFRzgPv"}],"key":"tgiwxBGAc8"},{"type":"tableCell","position":{"start":{"line":199,"column":1},"end":{"line":199,"column":1}},"children":[{"type":"text","value":"4","position":{"start":{"line":199,"column":1},"end":{"line":199,"column":1}},"key":"hxJjl3fq9K"}],"key":"K3QUy3c41c"},{"type":"tableCell","position":{"start":{"line":199,"column":1},"end":{"line":199,"column":1}},"children":[{"type":"text","value":"-2","position":{"start":{"line
":199,"column":1},"end":{"line":199,"column":1}},"key":"IVr3W4vRNc"}],"key":"SgU7NMUoXK"},{"type":"tableCell","position":{"start":{"line":199,"column":1},"end":{"line":199,"column":1}},"children":[{"type":"text","value":"5","position":{"start":{"line":199,"column":1},"end":{"line":199,"column":1}},"key":"cN3yVOylrL"}],"key":"AskMspIzD6"}],"key":"DObtTpclVB"},{"type":"tableRow","position":{"start":{"line":200,"column":1},"end":{"line":200,"column":1}},"children":[{"type":"tableCell","position":{"start":{"line":200,"column":1},"end":{"line":200,"column":1}},"children":[{"type":"text","value":"B","position":{"start":{"line":200,"column":1},"end":{"line":200,"column":1}},"key":"umUBWoiMBk"}],"key":"tyaHwV11OR"},{"type":"tableCell","position":{"start":{"line":200,"column":1},"end":{"line":200,"column":1}},"children":[{"type":"text","value":"-3","position":{"start":{"line":200,"column":1},"end":{"line":200,"column":1}},"key":"uzsGDHNtva"}],"key":"p5dusBi7gj"},{"type":"tableCell","position":{"start":{"line":200,"column":1},"end":{"line":200,"column":1}},"children":[{"type":"text","value":"3","position":{"start":{"line":200,"column":1},"end":{"line":200,"column":1}},"key":"q7W7tiNSNR"}],"key":"gjYH7iuU0d"},{"type":"tableCell","position":{"start":{"line":200,"column":1},"end":{"line":200,"column":1}},"children":[{"type":"text","value":"1","position":{"start":{"line":200,"column":1},"end":{"line":200,"column":1}},"key":"d0smmCiZcr"}],"key":"O2b2Wfy8JU"}],"key":"wFXktcuyTZ"},{"type":"tableRow","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"children":[{"type":"tableCell","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"children":[{"type":"text","value":"C","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"xyAi3k6Yd0"}],"key":"gI26koolIn"},{"type":"tableCell","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"children":[{"type":"text","value":"0","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"nFpX2wSD1t"}],"key":"EmM8iJKDfC"},{"type":"tableCell","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"children":[{"type":"text","value":"3","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"lztYL9HVqS"}],"key":"mPy6fLqt5w"},{"type":"tableCell","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"children":[{"type":"text","value":"-1","position":{"start":{"line":201,"column":1},"end":{"line":201,"column":1}},"key":"b44R966I8P"}],"key":"vRTFRryiMj"}],"key":"vMhG8bUOOu"}],"key":"Qzwp2F7Zlt"},{"type":"paragraph","position":{"start":{"line":203,"column":1},"end":{"line":205,"column":1}},"children":[{"type":"text","value":"We can visualize this as the following complete game tree,\nwhere each box contains the value ","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"FVjm21KLny"},{"type":"inlineMath","value":"V_\\hi^\\star(s)","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"html":"Vh(s)V_\\hi^\\star(s)Vh(s)","key":"y4HBk7q8oL"},{"type":"text","value":" of that node.\nThe min-max values of the terminal states are already 
known:","position":{"start":{"line":203,"column":1},"end":{"line":203,"column":1}},"key":"HLhx69Pmhl"}],"key":"l9cG5vFmYN"},{"type":"image","url":"/build/minmax-70b17e866836d498d3d814fd3fc3d9e3.png","position":{"start":{"line":207,"column":1},"end":{"line":207,"column":1}},"key":"Po3Jstjmsb","urlSource":"./shared/minmax.png","urlOptimized":"/build/minmax-70b17e866836d498d3d814fd3fc3d9e3.webp"},{"type":"paragraph","position":{"start":{"line":209,"column":1},"end":{"line":213,"column":1}},"children":[{"type":"text","value":"We begin min-max search at the root,\nexploring each of Max’s actions.\nSuppose Max chooses action A.\nThen Min will choose action E to minimize the game score,\nmaking the value of this game node ","position":{"start":{"line":209,"column":1},"end":{"line":209,"column":1}},"key":"Qby7awc1FF"},{"type":"inlineMath","value":"\\min(4, -2, 5) = -2","position":{"start":{"line":209,"column":1},"end":{"line":209,"column":1}},"html":"min(4,2,5)=2\\min(4, -2, 5) = -2min(4,2,5)=2","key":"ZYc2553VPP"},{"type":"text","value":".","position":{"start":{"line":209,"column":1},"end":{"line":209,"column":1}},"key":"k7AvPZWFkP"}],"key":"elvk0qhplF"},{"type":"image","url":"/build/minmax-2-d2c05b455ad2a4aef499542eadb0515d.png","position":{"start":{"line":215,"column":1},"end":{"line":215,"column":1}},"key":"b3hoigER28","urlSource":"./shared/minmax-2.png","urlOptimized":"/build/minmax-2-d2c05b455ad2a4aef499542eadb0515d.webp"},{"type":"paragraph","position":{"start":{"line":217,"column":1},"end":{"line":221,"column":1}},"children":[{"type":"text","value":"Similarly, if Max chooses action B,\nthen Min will choose action D,\nand if Max chooses action C,\nthen Min will choose action F.\nWe can fill in the values of these nodes accordingly:","position":{"start":{"line":217,"column":1},"end":{"line":217,"column":1}},"key":"na2IaToEbk"}],"key":"crkJLkNibS"},{"type":"image","url":"/build/minmax-3-f38c4f0467ce1216f1438052ec8a7d85.png","position":{"start":{"line":223,"column":1},"end":{"line":223,"column":1}},"key":"H6PLvMXKzA","urlSource":"./shared/minmax-3.png","urlOptimized":"/build/minmax-3-f38c4f0467ce1216f1438052ec8a7d85.webp"},{"type":"paragraph","position":{"start":{"line":225,"column":1},"end":{"line":226,"column":1}},"children":[{"type":"text","value":"Thus, Max’s best move is to take action C,\nresulting in a game score of ","position":{"start":{"line":225,"column":1},"end":{"line":225,"column":1}},"key":"XipizSjFY7"},{"type":"inlineMath","value":"\\max(-2, -3, -1) = -1","position":{"start":{"line":225,"column":1},"end":{"line":225,"column":1}},"html":"max(2,3,1)=1\\max(-2, -3, -1) = -1max(2,3,1)=1","key":"OizblGQd6Z"},{"type":"text","value":".","position":{"start":{"line":225,"column":1},"end":{"line":225,"column":1}},"key":"Z2k6Y9EYo2"}],"key":"A7dAhWkdVZ"},{"type":"image","url":"/build/minmax-4-013da4f214c0c822edc5b0e2b62d2f2a.png","position":{"start":{"line":228,"column":1},"end":{"line":228,"column":1}},"key":"r8LtB5BA5G","urlSource":"./shared/minmax-4.png","urlOptimized":"/build/minmax-4-013da4f214c0c822edc5b0e2b62d2f2a.webp"}],"enumerator":"8.2","html_id":"min-max-example","key":"F5dkflVXrg"},{"type":"heading","depth":3,"position":{"start":{"line":231,"column":1},"end":{"line":231,"column":1}},"children":[{"type":"text","value":"Complexity of min-max search","position":{"start":{"line":231,"column":1},"end":{"line":231,"column":1}},"key":"XGessI7cZx"}],"identifier":"complexity-of-min-max-search","label":"Complexity of min-max 
search","html_id":"complexity-of-min-max-search","implicit":true,"enumerator":"8.3.1","key":"kCaw7p5Wg8"},{"type":"paragraph","position":{"start":{"line":233,"column":1},"end":{"line":237,"column":1}},"children":[{"type":"text","value":"At each of the ","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"OaYCBj8wLS"},{"type":"inlineMath","value":"\\hor","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"html":"H\\horH","key":"qLtN39Hcyg"},{"type":"text","value":" timesteps,\nthis algorithm iterates through the entire action space at that state,\nand therefore has a time complexity of ","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"zOe8AElMKt"},{"type":"inlineMath","value":"\\hor^{n_A}","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"html":"HnA\\hor^{n_A}HnA","key":"Vc4oYHujO5"},{"type":"text","value":"\n(where ","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"zfzq6mmPbF"},{"type":"inlineMath","value":"n_A","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"html":"nAn_AnA","key":"RczanuSqs2"},{"type":"text","value":" is the largest number of actions possibly available at once).\nThis makes the min-max algorithm impractical for even moderately sized games.","position":{"start":{"line":233,"column":1},"end":{"line":233,"column":1}},"key":"v5u0tiEgdW"}],"key":"QI1VD6jT4v"},{"type":"paragraph","position":{"start":{"line":239,"column":1},"end":{"line":242,"column":1}},"children":[{"type":"text","value":"But do we need to compute the exact value of ","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"key":"qKph5WzbuL"},{"type":"emphasis","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"children":[{"type":"text","value":"every","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"key":"Z4aQSxpS6H"}],"key":"q0OzC1WZSi"},{"type":"text","value":" possible state?\nInstead, is there some way we could “ignore” certain actions and their subtrees\nif we already know of better options?\nThe ","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"key":"PPB1B3oDS0"},{"type":"strong","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"children":[{"type":"text","value":"alpha-beta search","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"key":"eL9XPhbK4r"}],"key":"FvESyucPfn"},{"type":"text","value":" makes use of this intuition.","position":{"start":{"line":239,"column":1},"end":{"line":239,"column":1}},"key":"epVnnHzTmk"}],"key":"eaTw2yb2RL"},{"type":"heading","depth":2,"position":{"start":{"line":245,"column":1},"end":{"line":245,"column":1}},"children":[{"type":"text","value":"Alpha-beta search","position":{"start":{"line":245,"column":1},"end":{"line":245,"column":1}},"key":"EQZ7P4Tw94"}],"label":"alpha-beta-search","identifier":"alpha-beta-search","html_id":"alpha-beta-search","enumerator":"8.4","key":"Es6wq7tQ06"},{"type":"paragraph","position":{"start":{"line":247,"column":1},"end":{"line":251,"column":1}},"children":[{"type":"text","value":"The intuition behind alpha-beta search is as follows:\nSuppose Max is in state ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"TFgrpnXr41"},{"type":"inlineMath","value":"s","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"sss","key":"XfndzzBoJT"},{"type":"text","value":",\nand 
considering whether to take action ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"p3Amc7CW6a"},{"type":"inlineMath","value":"a","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aaa","key":"HVruLpQyNq"},{"type":"text","value":" or ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"t09DB1mnEa"},{"type":"inlineMath","value":"a'","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aa'a","key":"QWH5PP3nSk"},{"type":"text","value":".\nIf at any point they find out that action ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"KzZcmWUemz"},{"type":"inlineMath","value":"a'","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aa'a","key":"mtsM2H5pmg"},{"type":"text","value":" is definitely worse than (or equal to) action ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"QKrrDxV4mO"},{"type":"inlineMath","value":"a","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aaa","key":"crRRXaGkem"},{"type":"text","value":",\nthey don’t need to evaluate action ","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"BWNkPPDQZh"},{"type":"inlineMath","value":"a'","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"html":"aa'a","key":"unuDCye0j3"},{"type":"text","value":" any further.","position":{"start":{"line":247,"column":1},"end":{"line":247,"column":1}},"key":"LuzmTMyKom"}],"key":"goovezn09M"},{"type":"paragraph","position":{"start":{"line":253,"column":1},"end":{"line":254,"column":1}},"children":[{"type":"text","value":"Concretely, we run min-max search as above,\nexcept now we keep track of two additional parameters ","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"key":"GmzcNyDjjd"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"AnSBAnPzjE"},{"type":"text","value":" and ","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"key":"zWUuGz4Ygs"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"AXSyFIyzkm"},{"type":"text","value":" while evaluating each state:","position":{"start":{"line":253,"column":1},"end":{"line":253,"column":1}},"key":"ezvO1tfkm6"}],"key":"ZhPLt8Xv6I"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":256,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"text","value":"Starting in state ","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"KJuKk88M0h"},{"type":"inlineMath","value":"s","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"html":"sss","key":"fMqdBUnnay"},{"type":"text","value":", Max can achieve a game score of ","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"HKKWPpzbZ2"},{"type":"emphasis","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"children":[{"type":"text","value":"at least","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"ZFkOGE2LQK"}],"key":"D69KN7AL8F"},{"type":"text","value":" 
","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"q9E454hRXr"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"iwWKM7HMq9"},{"type":"text","value":" assuming Min plays optimally. That is, ","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"CL9DidaiGP"},{"type":"inlineMath","value":"V^\\star_\\hi(s) \\ge \\alpha(s)","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"html":"Vh(s)α(s)V^\\star_\\hi(s) \\ge \\alpha(s)Vh(s)α(s)","key":"k3aBsIttH4"},{"type":"text","value":" at all points.","position":{"start":{"line":256,"column":1},"end":{"line":256,"column":1}},"key":"GNtgFdyF1n"}],"key":"seyBuMaliA"},{"type":"listItem","spread":true,"position":{"start":{"line":257,"column":1},"end":{"line":258,"column":1}},"children":[{"type":"text","value":"Analogously, starting in state ","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"HU3fHJ3ZcF"},{"type":"inlineMath","value":"s","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"sss","key":"RIN9M1L0gE"},{"type":"text","value":", Min can ensure a game score of ","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"OSU2w2FKAX"},{"type":"emphasis","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"children":[{"type":"text","value":"at most","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"XOGN5F7l4v"}],"key":"azBzv6atrb"},{"type":"text","value":" ","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"WDfPm3ep77"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"Ebd398ZjXp"},{"type":"text","value":" assuming Max plays optimally. 
That is, ","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"V1FDTtSLyZ"},{"type":"inlineMath","value":"V^\\star_\\hi(s) \\le \\beta(s)","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"html":"Vh(s)β(s)V^\\star_\\hi(s) \\le \\beta(s)Vh(s)β(s)","key":"x7vICdbV3V"},{"type":"text","value":" at all points.","position":{"start":{"line":257,"column":1},"end":{"line":257,"column":1}},"key":"EKoLnljiMR"}],"key":"dHP4KWnZjI"}],"key":"tgKPluE2gc"},{"type":"paragraph","position":{"start":{"line":259,"column":1},"end":{"line":274,"column":1}},"children":[{"type":"text","value":"Suppose we are evaluating ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"n8tUAcPz0Z"},{"type":"inlineMath","value":"V^\\star_\\hi(s)","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"Vh(s)V^\\star_\\hi(s)Vh(s)","key":"MuIVl3AtSr"},{"type":"text","value":",\nwhere it is Max’s turn (","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"lrFgpsUzMV"},{"type":"inlineMath","value":"\\hi","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"h\\hih","key":"gbXimpLLG4"},{"type":"text","value":" is even).\nWe update ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"kXKDpz31vA"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"qGq6imgwKB"},{"type":"text","value":" to be the ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"d5yXZgtW0a"},{"type":"emphasis","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"children":[{"type":"text","value":"highest","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"SL3M1qIJAx"}],"key":"GBdEed9bh9"},{"type":"text","value":" minimax value achievable from ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"KfOn25eeia"},{"type":"inlineMath","value":"s","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"sss","key":"ZtG3UjMgai"},{"type":"text","value":" so far.\nThat is, the value of ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"zIThsEgezY"},{"type":"inlineMath","value":"s","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"sss","key":"NRFZyzKSyL"},{"type":"text","value":" is ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"SWTILMdiB5"},{"type":"emphasis","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"children":[{"type":"text","value":"at least","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"imxRlv2DFh"}],"key":"h4xJJjAXdr"},{"type":"text","value":" ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"JP1LGhzoPK"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"QeH2Q0IfT7"},{"type":"text","value":".\nSuppose Max chooses action ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"EUvno3MbJj"},{"type":"inlineMath","value":"a","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"aaa","key":"m1CXwetrnX"},{"type":"text","value":", which leads to state 
","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"ZLnbmRchGF"},{"type":"inlineMath","value":"s'","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"ss's","key":"Cj1FN9aG9Y"},{"type":"text","value":", in which it is Min’s turn.\nIf any of Min’s actions in ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"J0ukLVeOmg"},{"type":"inlineMath","value":"s'","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"ss's","key":"dboynDandB"},{"type":"text","value":" achieve a value ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"vUup49mB6i"},{"type":"inlineMath","value":"V^\\star_{\\hi+1}(s') \\le \\alpha(s)","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"Vh+1(s)α(s)V^\\star_{\\hi+1}(s') \\le \\alpha(s)Vh+1(s)α(s)","key":"OZVqBoIvvn"},{"type":"text","value":",\nwe know that Max would not choose action ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"XgMsoltIYb"},{"type":"inlineMath","value":"a","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"aaa","key":"EqBGSWqBso"},{"type":"text","value":",\nsince they know that it is ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"BcbduRA76P"},{"type":"emphasis","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"children":[{"type":"text","value":"worse","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"csM4pOUHf0"}],"key":"HMIoHMp96R"},{"type":"text","value":" than whichever action gave the value ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"dM0mQc41tW"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"bFO9hSVrzX"},{"type":"text","value":".\nSimilarly, to evaluate a state on Min’s turn,\nwe update ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"Zd8DWdXNYS"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"Rs8WpeBWU1"},{"type":"text","value":" to be the ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"WMHEe0m9fs"},{"type":"emphasis","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"children":[{"type":"text","value":"lowest","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"SHZDsAsPAO"}],"key":"pJNjO9nO6M"},{"type":"text","value":" value achievable from ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"uQgzoSkJBz"},{"type":"inlineMath","value":"s","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"sss","key":"CN4Dqgsw7B"},{"type":"text","value":" so far.\nThat is, the value of ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"Mnsdw85C7h"},{"type":"inlineMath","value":"s","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"sss","key":"vNht1zqaVM"},{"type":"text","value":" is ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"wGUA553QVj"},{"type":"emphasis","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"children":[{"type":"text","value":"at 
most","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"VhXEpeoHZr"}],"key":"c7lqQXGxaQ"},{"type":"text","value":" ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"W2Uij5wE8e"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"Z6VxKTKWsQ"},{"type":"text","value":".\nSuppose Min chooses action ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"PMS2Gg2NYx"},{"type":"inlineMath","value":"a","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"aaa","key":"yvSm3wgYU5"},{"type":"text","value":",\nwhich leads to state ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"CKTLK9JVLr"},{"type":"inlineMath","value":"s'","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"ss's","key":"ZSDg8VpnXA"},{"type":"text","value":" for Max.\nIf Max has any actions that do ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"LUSwsHoZLN"},{"type":"emphasis","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"children":[{"type":"text","value":"better","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"NM9kzyms45"}],"key":"h8Nnn41cCY"},{"type":"text","value":" than ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"nSuuPmGwQ7"},{"type":"inlineMath","value":"\\beta(s)","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"β(s)\\beta(s)β(s)","key":"rtEahBb5Pi"},{"type":"text","value":",\nthey would take it,\nmaking action ","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"bZrlGu6BqZ"},{"type":"inlineMath","value":"a","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"html":"aaa","key":"nlAEk4UHo7"},{"type":"text","value":" a suboptimal choice for Min.","position":{"start":{"line":259,"column":1},"end":{"line":259,"column":1}},"key":"lRgp3sXQoj"}],"key":"RfzFmfjtUB"},{"type":"proof","kind":"example","label":"alpha-beta-example","identifier":"alpha-beta-example","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Alpha-beta search for a simple game","position":{"start":{"line":276,"column":1},"end":{"line":276,"column":1}},"key":"aQLGAMtwlw"}],"key":"o189BK1wur"},{"type":"paragraph","position":{"start":{"line":279,"column":1},"end":{"line":283,"column":1}},"children":[{"type":"text","value":"Let us use the same simple game from ","position":{"start":{"line":279,"column":1},"end":{"line":279,"column":1}},"key":"YZFMjd7Oho"},{"type":"crossReference","position":{"start":{"line":279,"column":1},"end":{"line":279,"column":1}},"children":[{"type":"text","value":"Example ","key":"StNasbTkex"},{"type":"text","value":"8.2","key":"ryMWv63N7L"}],"identifier":"min-max-example","label":"min-max-example","kind":"proof:example","template":"Example %s","enumerator":"8.2","resolved":true,"html_id":"min-max-example","key":"oLMhWLSMrm"},{"type":"text","value":".\nWe list the values of ","position":{"start":{"line":279,"column":1},"end":{"line":279,"column":1}},"key":"caIeyZ04CY"},{"type":"inlineMath","value":"\\alpha(s), \\beta(s)","position":{"start":{"line":279,"column":1},"end":{"line":279,"column":1}},"html":"α(s),β(s)\\alpha(s), \\beta(s)α(s),β(s)","key":"as1CuEqjTt"},{"type":"text","value":" in each node 
throughout the algorithm.\nThese values are initialized to ","position":{"start":{"line":279,"column":1},"end":{"line":279,"column":1}},"key":"ZgSE51bPLS"},{"type":"inlineMath","value":"-\\infty, +\\infty","position":{"start":{"line":279,"column":1},"end":{"line":279,"column":1}},"html":",+-\\infty, +\\infty,+","key":"lJJa2tfuJr"},{"type":"text","value":" respectively.\nWe shade any squares that have not been visited by the algorithm,\nand we assume that actions are evaluated from left to right.","position":{"start":{"line":279,"column":1},"end":{"line":279,"column":1}},"key":"VIqmYUXl5c"}],"key":"PlZw8Rs6S4"},{"type":"image","url":"/build/alpha-beta-0-7ad590b6317a7a6f64b4e368eda30e33.png","position":{"start":{"line":285,"column":1},"end":{"line":285,"column":1}},"key":"ZKBNahBwvu","urlSource":"./shared/alpha-beta-0.png","urlOptimized":"/build/alpha-beta-0-7ad590b6317a7a6f64b4e368eda30e33.webp"},{"type":"paragraph","position":{"start":{"line":287,"column":1},"end":{"line":290,"column":1}},"children":[{"type":"text","value":"Suppose Max takes action A. Let ","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"siqCL2BkuG"},{"type":"inlineMath","value":"s'","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"html":"ss's","key":"hHJtnxDflY"},{"type":"text","value":" be the resulting game state.\nThe values of ","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"DonXm6aCdE"},{"type":"inlineMath","value":"\\alpha(s')","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"html":"α(s)\\alpha(s')α(s)","key":"A27n0GYrpq"},{"type":"text","value":" and ","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"r61C7T3d5P"},{"type":"inlineMath","value":"\\beta(s')","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"html":"β(s)\\beta(s')β(s)","key":"xKUUUq8Z70"},{"type":"text","value":"\nare initialized at the same values as the root state,\nsince we want to prune a subtree if there exists a better action at any step higher in the tree.","position":{"start":{"line":287,"column":1},"end":{"line":287,"column":1}},"key":"moGFP4HNhu"}],"key":"JOlnzdumES"},{"type":"image","url":"/build/alpha-beta-1-b9d0c4a2b1ab3150a403c943682c4a80.png","position":{"start":{"line":292,"column":1},"end":{"line":292,"column":1}},"key":"zsvMcEkXSk","urlSource":"./shared/alpha-beta-1.png","urlOptimized":"/build/alpha-beta-1-b9d0c4a2b1ab3150a403c943682c4a80.webp"},{"type":"paragraph","position":{"start":{"line":294,"column":1},"end":{"line":295,"column":1}},"children":[{"type":"text","value":"Then we iterate through Min’s possible actions,\nupdating the value of ","position":{"start":{"line":294,"column":1},"end":{"line":294,"column":1}},"key":"KtjhjrgPvE"},{"type":"inlineMath","value":"\\beta(s')","position":{"start":{"line":294,"column":1},"end":{"line":294,"column":1}},"html":"β(s)\\beta(s')β(s)","key":"MZT6lGCKK7"},{"type":"text","value":" as we 
go.","position":{"start":{"line":294,"column":1},"end":{"line":294,"column":1}},"key":"wWSVcuq95D"}],"key":"HfJBs7oZtJ"},{"type":"paragraph","position":{"start":{"line":297,"column":1},"end":{"line":298,"column":1}},"children":[{"type":"image","url":"/build/alpha-beta-2-b0d0597f3562685a2759d1d56f661682.png","position":{"start":{"line":297,"column":1},"end":{"line":297,"column":1}},"key":"HAdp007EFl","urlSource":"./shared/alpha-beta-2.png","urlOptimized":"/build/alpha-beta-2-b0d0597f3562685a2759d1d56f661682.webp"},{"type":"text","value":"\n","position":{"start":{"line":297,"column":1},"end":{"line":297,"column":1}},"key":"oN0vZ6qdKJ"},{"type":"image","url":"/build/alpha-beta-3-fcd7a3fcb02f86c22e47c8168d151549.png","position":{"start":{"line":297,"column":1},"end":{"line":297,"column":1}},"key":"Xlbu9hEcCg","urlSource":"./shared/alpha-beta-3.png","urlOptimized":"/build/alpha-beta-3-fcd7a3fcb02f86c22e47c8168d151549.webp"}],"key":"FJTkiq4O6W"},{"type":"paragraph","position":{"start":{"line":300,"column":1},"end":{"line":302,"column":1}},"children":[{"type":"text","value":"Once the value of state ","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"YMtqJzqDZr"},{"type":"inlineMath","value":"s'","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"html":"ss's","key":"YdU8Q3tLjk"},{"type":"text","value":" is fully evaluated,\nwe know that Max can achieve a value of ","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"GVeqU6Be3I"},{"type":"emphasis","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"children":[{"type":"text","value":"at least","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"zKMxcA1WQC"}],"key":"cRFIefxbYu"},{"type":"text","value":" ","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"cHC0MQeJv1"},{"type":"text","value":"-2","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"xTMYSMfTPZ"},{"type":"text","value":" starting from the root,\nand so we update ","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"JmhxTxUt8z"},{"type":"inlineMath","value":"\\alpha(s)","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"html":"α(s)\\alpha(s)α(s)","key":"SqHwMwkfGQ"},{"type":"text","value":", where ","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"SNHz31MQEi"},{"type":"inlineMath","value":"s","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"html":"sss","key":"RzinPpnQOA"},{"type":"text","value":" is the root state:","position":{"start":{"line":300,"column":1},"end":{"line":300,"column":1}},"key":"MleYjXZufj"}],"key":"bO5v8awow5"},{"type":"image","url":"/build/alpha-beta-4-e3958ef0c8cbcb3b559e8a63d1cc1e6b.png","position":{"start":{"line":304,"column":1},"end":{"line":304,"column":1}},"key":"BDpd47OEi0","urlSource":"./shared/alpha-beta-4.png","urlOptimized":"/build/alpha-beta-4-e3958ef0c8cbcb3b559e8a63d1cc1e6b.webp"},{"type":"paragraph","position":{"start":{"line":306,"column":1},"end":{"line":307,"column":1}},"children":[{"type":"text","value":"Then Max imagines taking action B. 
Again, let ","position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"key":"OD78pRUM5o"},{"type":"inlineMath","value":"s'","position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"html":"ss's","key":"BDaRqkZvsJ"},{"type":"text","value":" denote the resulting game state.\nWe initialize ","position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"key":"YPLdFm63iu"},{"type":"inlineMath","value":"\\alpha(s')","position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"html":"α(s)\\alpha(s')α(s)","key":"H6iePH9hNZ"},{"type":"text","value":" and ","position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"key":"bPqWV91FJJ"},{"type":"inlineMath","value":"\\beta(s')","position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"html":"β(s)\\beta(s')β(s)","key":"KLaEurNukG"},{"type":"text","value":" from the root:","position":{"start":{"line":306,"column":1},"end":{"line":306,"column":1}},"key":"QCgCZVMLKD"}],"key":"wXjJMRBptC"},{"type":"image","url":"/build/alpha-beta-5-f16710428d22fbb7c1a5dbc054a71a7c.png","position":{"start":{"line":309,"column":1},"end":{"line":309,"column":1}},"key":"F47eiLPYAH","urlSource":"./shared/alpha-beta-5.png","urlOptimized":"/build/alpha-beta-5-f16710428d22fbb7c1a5dbc054a71a7c.webp"},{"type":"paragraph","position":{"start":{"line":311,"column":1},"end":{"line":319,"column":1}},"children":[{"type":"text","value":"Now suppose Min takes action D, resulting in a value of ","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"qVTxeGMWeU"},{"type":"text","value":"-3","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"EZFdqVZht7"},{"type":"text","value":".\nWe see that ","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"cq6ygQrrtN"},{"type":"inlineMath","value":"V^\\star_\\hi(s') = \\min(-3, x, y)","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"html":"Vh(s)=min(3,x,y)V^\\star_\\hi(s') = \\min(-3, x, y)Vh(s)=min(3,x,y)","key":"lW7rsEpY1O"},{"type":"text","value":",\nwhere ","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"aT2p5pkB5m"},{"type":"inlineMath","value":"x","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"html":"xxx","key":"RRftYtmAqj"},{"type":"text","value":" and ","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"YDhk6E1q6W"},{"type":"inlineMath","value":"y","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"html":"yyy","key":"Mrbzh9CTcW"},{"type":"text","value":" are the values of the remaining two actions.\nBut since ","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"nbeQOBI2ts"},{"type":"inlineMath","value":"\\min(-3, x, y) \\le -3","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"html":"min(3,x,y)3\\min(-3, x, y) \\le -3min(3,x,y)3","key":"t8uVev9Ubv"},{"type":"text","value":",\nwe know that the value of ","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"DWuOkrPrxz"},{"type":"inlineMath","value":"s'","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"html":"ss's","key":"SXISpoclKv"},{"type":"text","value":" is at most 
","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"Yz4rMrcn7g"},{"type":"text","value":"-3","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"WEpgkY7WGQ"},{"type":"text","value":".\nBut Max can achieve a better value of ","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"zTySoqARcc"},{"type":"inlineMath","value":"\\alpha(s') = -2","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"html":"α(s)=2\\alpha(s') = -2α(s)=2","key":"OmBPZFLOir"},{"type":"text","value":" by taking action A,\nand so Max will never take action B,\nand we can prune the search here.\nWe will use dotted lines to indicate states that have been ruled out from the search:","position":{"start":{"line":311,"column":1},"end":{"line":311,"column":1}},"key":"WimMC2fXtY"}],"key":"nDhYb3N55b"},{"type":"image","url":"/build/alpha-beta-6-1f7516f925d212dc9290ccf221a7d28e.png","position":{"start":{"line":321,"column":1},"end":{"line":321,"column":1}},"key":"dqYWPlFvA9","urlSource":"./shared/alpha-beta-6.png","urlOptimized":"/build/alpha-beta-6-1f7516f925d212dc9290ccf221a7d28e.webp"},{"type":"paragraph","position":{"start":{"line":323,"column":1},"end":{"line":326,"column":1}},"children":[{"type":"text","value":"Finally, suppose Max takes action C.\nFor Min’s actions D and E,\nthere is still a chance that action C might outperform action A,\nso we continue expanding:","position":{"start":{"line":323,"column":1},"end":{"line":323,"column":1}},"key":"Ycl7S2zdvQ"}],"key":"XxlflCwAjz"},{"type":"paragraph","position":{"start":{"line":328,"column":1},"end":{"line":329,"column":1}},"children":[{"type":"image","url":"/build/alpha-beta-7-648c7023e2fdb207fac5a83dbd8abd64.png","position":{"start":{"line":328,"column":1},"end":{"line":328,"column":1}},"key":"pLOLzr7znQ","urlSource":"./shared/alpha-beta-7.png","urlOptimized":"/build/alpha-beta-7-648c7023e2fdb207fac5a83dbd8abd64.webp"},{"type":"text","value":"\n","position":{"start":{"line":328,"column":1},"end":{"line":328,"column":1}},"key":"ROa5tbiRAl"},{"type":"image","url":"/build/alpha-beta-8-fb8654bf1f1f361f3098f7a2c0ace9bd.png","position":{"start":{"line":328,"column":1},"end":{"line":328,"column":1}},"key":"DTttfMvHo5","urlSource":"./shared/alpha-beta-8.png","urlOptimized":"/build/alpha-beta-8-fb8654bf1f1f361f3098f7a2c0ace9bd.webp"}],"key":"ib5A8QwNst"},{"type":"paragraph","position":{"start":{"line":331,"column":1},"end":{"line":333,"column":1}},"children":[{"type":"text","value":"Finally, we see that Min taking action F achieves the minimum value at this state.\nThis shows that optimal play is for Max to take action C,\nand Min to take action F.","position":{"start":{"line":331,"column":1},"end":{"line":331,"column":1}},"key":"Omnta9vSE9"}],"key":"YJBpGZ8JCx"},{"type":"image","url":"/build/alpha-beta-9-f7d61365563b59cdcecc22ca3e301bc6.png","position":{"start":{"line":335,"column":1},"end":{"line":335,"column":1}},"key":"aHGBVpFm5M","urlSource":"./shared/alpha-beta-9.png","urlOptimized":"/build/alpha-beta-9-f7d61365563b59cdcecc22ca3e301bc6.webp"}],"enumerator":"8.3","html_id":"alpha-beta-example","key":"MBNp5ET4AB"},{"type":"code","lang":"python","value":"def alpha_beta_search(s, player, alpha, beta) -> Tuple[\"Action\", \"Value\"]:\n \"\"\"Return the value of the state (for Max) and the best action for Max to take.\"\"\"\n if env.is_terminal(s):\n return None, env.winner(s)\n\n if player is max:\n a_max, v_max = None, None\n for a in actions:\n _, v = 
alpha_beta_search(env.step(s, a), min, alpha, beta)\n            if v_max is None or v > v_max:\n                a_max, v_max = a, v\n                alpha = max(alpha, v)\n            if v_max >= beta:\n                # we know Min will not choose the action that leads to this state\n                return a_max, v_max\n        return a_max, v_max\n\n    else:\n        a_min, v_min = None, None\n        for a in actions:\n            _, v = alpha_beta_search(env.step(s, a), max, alpha, beta)\n            if v_min is None or v < v_min:\n                a_min, v_min = a, v\n                beta = min(beta, v)\n            if v_min <= alpha:\n                # we know Max will not choose the action that leads to this state\n                return a_min, v_min\n        return a_min, v_min","position":{"start":{"line":339,"column":1},"end":{"line":368,"column":1}},"key":"OjnqRC5qBy"},{"type":"paragraph","position":{"start":{"line":370,"column":1},"end":{"line":378,"column":1}},"children":[{"type":"text","value":"How do we choose what ","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"cC0wAWampS"},{"type":"emphasis","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"children":[{"type":"text","value":"order","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"KHTqHT5itk"}],"key":"w39gV8Ysp0"},{"type":"text","value":" to explore the branches?\nAs you can tell, this significantly affects the efficiency of the pruning algorithm.\nIf Max explores the possible actions in order from worst to best,\nthey will not be able to prune any branches at all!\nAdditionally, to verify that an action is suboptimal,\nwe must run the search recursively from that action,\nwhich ultimately requires traversing the tree all the way to a leaf node.\nThe longer the game might possibly last,\nthe more computation we have to run.","position":{"start":{"line":370,"column":1},"end":{"line":370,"column":1}},"key":"ocbjNsES5Q"}],"key":"v7WDRNfLjY"},{"type":"paragraph","position":{"start":{"line":380,"column":1},"end":{"line":383,"column":1}},"children":[{"type":"text","value":"In practice, we can often use background information about the game to develop a ","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"zwukOn3ggM"},{"type":"strong","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"children":[{"type":"text","value":"heuristic","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"Z4rjY56aoU"}],"key":"vxzIGxEt6F"},{"type":"text","value":" for evaluating possible actions.\nIf a technique is based on background information or intuition,\nespecially if it isn’t rigorously justified,\nwe call it a heuristic.","position":{"start":{"line":380,"column":1},"end":{"line":380,"column":1}},"key":"tmE952P2QY"}],"key":"ifuWBaPM98"},{"type":"paragraph","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"Can we develop ","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"dP437gNt7o"},{"type":"emphasis","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"children":[{"type":"text","value":"heuristic methods","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"NGZaCSYMz1"}],"key":"OWBtO2wmqO"},{"type":"text","value":" for tree exploration that work for all sorts of games?","position":{"start":{"line":385,"column":1},"end":{"line":385,"column":1}},"key":"gp0Qt3mb1K"}],"key":"HUgv3rHHNX"},{"type":"comment","value":" Here's where we can incorporate the _reinforcement learning_\u0020
","key":"xFbjoYQnjE"},{"type":"heading","depth":2,"position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":389,"column":1},"end":{"line":389,"column":1}},"key":"NjEsK1U5t9"}],"label":"monte-carlo-tree-search","identifier":"monte-carlo-tree-search","html_id":"monte-carlo-tree-search","enumerator":"8.5","key":"dpbmsXhk2I"},{"type":"paragraph","position":{"start":{"line":391,"column":1},"end":{"line":393,"column":1}},"children":[{"type":"text","value":"The task of evaluating actions in a complex environment might seem familiar.\nWe’ve encountered this problem before in both the ","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"DTqLkKzHfM"},{"type":"link","url":"/bandits","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"children":[{"type":"text","value":"multi-armed bandits","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"T12RFI7MPK"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"oPvxkP5IZD"},{"type":"text","value":" setting and the ","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"wJrjPLx4HR"},{"type":"link","url":"/mdps","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"children":[{"type":"text","value":"Markov decision process","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"Nvub9SCoDa"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"vobMhUCCKL"},{"type":"text","value":" setting.\nNow we’ll see how to combine concepts from these to form a more general and efficient tree search heuristic called ","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"FV5txUpq0w"},{"type":"strong","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"XfL0XAntWe"}],"key":"BCuuovW9eg"},{"type":"text","value":" (MCTS).","position":{"start":{"line":391,"column":1},"end":{"line":391,"column":1}},"key":"KDR14HnMJg"}],"key":"fJ0GpijWKZ"},{"type":"paragraph","position":{"start":{"line":395,"column":1},"end":{"line":400,"column":1}},"children":[{"type":"text","value":"When a problem is intractable to solve ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"ejUDe3MeuM"},{"type":"emphasis","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"exactly","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"toLaP8oKF6"}],"key":"YoqiAiLKJq"},{"type":"text","value":",\nwe often turn to ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"B41SzTCMcb"},{"type":"emphasis","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"zSJdaHC1qd"}],"key":"d7lZB31T5I"},{"type":"text","value":" algorithms that sacrifice some accuracy in exchange for computational efficiency.\nMCTS also improves on alpha-beta search in this sense.\nAs the name suggests,\nMCTS uses 
","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"vhWh1KAQeW"},{"type":"emphasis","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"Monte Carlo","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"t25zfAXJli"}],"key":"DQnvZVsWnO"},{"type":"text","value":" simulation, that is, collecting random samples and computing the sample statistics,\nin order to ","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"DTpxzG54ff"},{"type":"emphasis","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"children":[{"type":"text","value":"approximate","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"MYMf2MQDtI"}],"key":"yeyehZNEcM"},{"type":"text","value":" the value of each action.","position":{"start":{"line":395,"column":1},"end":{"line":395,"column":1}},"key":"OLlZBPPiNO"}],"key":"gtJug7XNY7"},{"type":"paragraph","position":{"start":{"line":402,"column":1},"end":{"line":408,"column":1}},"children":[{"type":"text","value":"As before, we imagine a complete game tree in which each path represents an ","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"key":"aM5BX3klCw"},{"type":"emphasis","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"children":[{"type":"text","value":"entire game","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"key":"T9GyY7NwgJ"}],"key":"sxOFlQgzAW"},{"type":"text","value":".\nThe goal of MCTS is to assign values to only the game states that are ","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"key":"z2unNwT50n"},{"type":"emphasis","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"children":[{"type":"text","value":"relevant","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"key":"ZZdq5luvhv"}],"key":"nIabdxuveY"},{"type":"text","value":" to the ","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"key":"zOGVb0Jyzw"},{"type":"emphasis","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"children":[{"type":"text","value":"current game","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"key":"uKOKeleMJv"}],"key":"P96CB0VvPN"},{"type":"text","value":";\nWe gradually expand the tree at each move.\nFor comparison, in alpha-beta search,\nthe entire tree only needs to be solved ","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"key":"Vlj7TOonvB"},{"type":"emphasis","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"children":[{"type":"text","value":"once","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"key":"hyQQMGPHDQ"}],"key":"YfASxZHWyT"},{"type":"text","value":",\nand from then on,\nchoosing an action is as simple as taking a maximum over the previously computed values.","position":{"start":{"line":402,"column":1},"end":{"line":402,"column":1}},"key":"iZQUCaxQup"}],"key":"znPu9tQD6K"},{"type":"paragraph","position":{"start":{"line":410,"column":1},"end":{"line":414,"column":1}},"children":[{"type":"text","value":"The crux of MCTS is approximating the win probability of a state by a 
","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"tjQCA4JX7e"},{"type":"emphasis","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"children":[{"type":"text","value":"sample probability","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"vFfH1rbeEZ"}],"key":"HkxMVYdKX4"},{"type":"text","value":".\nIn practice, MCTS is used for games with ","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"GTRBtnjHpB"},{"type":"emphasis","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"children":[{"type":"text","value":"binary outcomes","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"QmfsbDtnjA"}],"key":"DzTJFvyJEc"},{"type":"text","value":" where ","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"V0s8rlBY2A"},{"type":"inlineMath","value":"r(s) \\in \\{ +1, -1 \\}","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"html":"r(s){+1,1}r(s) \\in \\{ +1, -1 \\}r(s){+1,1}","key":"s8meh9keIJ"},{"type":"text","value":",\nand so this is equivalent to approximating the final game score.\nTo approximate the win probability from state ","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"XY5gqDUUAe"},{"type":"inlineMath","value":"s","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"html":"sss","key":"dauZEEumzF"},{"type":"text","value":",\nMCTS samples random games starting in ","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"BBzDZXoLQk"},{"type":"inlineMath","value":"s","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"html":"sss","key":"WGe56Ht0YZ"},{"type":"text","value":" and computes the sample proportion of those that the player wins.","position":{"start":{"line":410,"column":1},"end":{"line":410,"column":1}},"key":"vZchevPaf0"}],"key":"V224xCvyfi"},{"type":"paragraph","position":{"start":{"line":416,"column":1},"end":{"line":420,"column":1}},"children":[{"type":"text","value":"Note that, for a given state ","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"AP11SVsr99"},{"type":"inlineMath","value":"s","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"html":"sss","key":"FhqN2zybuq"},{"type":"text","value":",\nchoosing the best action ","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"AopioJWhIi"},{"type":"inlineMath","value":"a","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"html":"aaa","key":"dcxqt70zoU"},{"type":"text","value":" can be framed as a ","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"xncjfGn7zv"},{"type":"link","url":"/bandits","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"children":[{"type":"text","value":"multi-armed bandits","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"MjL8y1gd6t"}],"urlSource":"./bandits.md","dataUrl":"/bandits.json","internal":true,"protocol":"file","key":"s5OK6ujBHK"},{"type":"text","value":" problem,\nwhere each action corresponds to an arm,\nand the reward distribution of arm 
","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"XvodirOCWO"},{"type":"inlineMath","value":"k","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"html":"kkk","key":"ZdcxYzscrE"},{"type":"text","value":" is the distribution of the game score over random games after choosing that arm.\nThe most commonly used bandit algorithm in practice for MCTS is the ","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"ar6pKEuoDs"},{"type":"crossReference","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"children":[{"type":"text","value":"Upper Confidence Bound (UCB)","position":{"start":{"line":500,"column":1},"end":{"line":500,"column":1}},"key":"wqQa2VVBTR"}],"identifier":"ucb","label":"ucb","kind":"heading","template":"Section %s","enumerator":"3.6","resolved":true,"html_id":"ucb","remote":true,"url":"/bandits","dataUrl":"/bandits.json","key":"LHQ2iMQHGL"},{"type":"text","value":" algorithm.","position":{"start":{"line":416,"column":1},"end":{"line":416,"column":1}},"key":"vVEVYPO64I"}],"key":"eP8JDqLz8G"},{"type":"admonition","kind":"note","children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Summary of UCB","position":{"start":{"line":422,"column":1},"end":{"line":422,"column":1}},"key":"pYf3eQvDps"}],"key":"gzBQWAPk18"},{"type":"paragraph","position":{"start":{"line":423,"column":1},"end":{"line":435,"column":1}},"children":[{"type":"text","value":"Let us quickly review the UCB bandit algorithm.\nFor each arm ","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"JH4eRHsawL"},{"type":"inlineMath","value":"k","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"html":"kkk","key":"I73BpbPKt4"},{"type":"text","value":", we track the sample mean","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"VmdZTxgYKQ"}],"key":"lpQfUOwxCt"},{"type":"math","value":"\\hat \\mu^k_t = \\frac{1}{N_t^k} \\sum_{\\tau=0}^{t-1} \\ind{a_\\tau = k} r_\\tau","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"tight":true,"html":"μ^tk=1Ntkτ=0t11{aτ=k}rτ\\hat \\mu^k_t = \\frac{1}{N_t^k} \\sum_{\\tau=0}^{t-1} \\ind{a_\\tau = k} r_\\tauμ^tk=Ntk1τ=0t11{aτ=k}rτ","enumerator":"8.2","key":"TeKSsOQDHb"},{"type":"paragraph","position":{"start":{"line":423,"column":1},"end":{"line":435,"column":1}},"children":[{"type":"text","value":"of all rewards from that arm up to time ","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"riIHGQL9fL"},{"type":"inlineMath","value":"t","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"html":"ttt","key":"R7aznyAxpy"},{"type":"text","value":".\nThen we construct a ","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"h36tfVFSXe"},{"type":"emphasis","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"children":[{"type":"text","value":"confidence interval","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"iUPwqZ8EUu"}],"key":"IGhyuMbpin"},{"type":"text","value":"","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"wnt7kirkFR"}],"key":"K020445llR"},{"type":"math","value":"C_t^k = [\\hat \\mu^k_t - B_t^k, \\hat \\mu^k_t + B_t^k],","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"tight":true,"html":"Ctk=[μ^tkBtk,μ^tk+Btk],C_t^k = [\\hat \\mu^k_t - B_t^k, \\hat \\mu^k_t + 
B_t^k],Ctk=[μ^tkBtk,μ^tk+Btk],","enumerator":"8.3","key":"atNneOTgjI"},{"type":"paragraph","position":{"start":{"line":423,"column":1},"end":{"line":435,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"zGNWGwo6VL"},{"type":"inlineMath","value":"B_t^k = \\sqrt{\\frac{\\ln(2 t / \\delta)}{2 N_t^k}}","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"html":"Btk=ln(2t/δ)2NtkB_t^k = \\sqrt{\\frac{\\ln(2 t / \\delta)}{2 N_t^k}}Btk=2Ntkln(2t/δ)","key":"XyzcqI1hJ3"},{"type":"text","value":" is given by Hoeffding’s inequality,\nso that with probability at least 1 - ","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"E3W2uS3J46"},{"type":"text","value":"δ","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"kSjuKqPyaW"},{"type":"text","value":" (some fixed parameter we choose),\nthe true mean ","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"c3Jm0P8m1W"},{"type":"inlineMath","value":"\\mu^k","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"html":"μk\\mu^kμk","key":"hgQrzcnOyk"},{"type":"text","value":" lies within ","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"CEvDoQ2DpP"},{"type":"inlineMath","value":"C_t^k","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"html":"CtkC_t^kCtk","key":"qwHwIbXt4b"},{"type":"text","value":".\nNote that ","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"lrPY3nCK0l"},{"type":"inlineMath","value":"B_t^k","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"html":"BtkB_t^kBtk","key":"Re3lgZeylQ"},{"type":"text","value":" scales like ","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"oPKeTU2d2J"},{"type":"inlineMath","value":"\\sqrt{1/N^k_t}","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"html":"1/Ntk\\sqrt{1/N^k_t}1/Ntk","key":"SW82cF3mhY"},{"type":"text","value":",\ni.e.\u0020
the more we have visited that arm,\nthe more confident we get about it,\nand the narrower the confidence interval.","position":{"start":{"line":423,"column":1},"end":{"line":423,"column":1}},"key":"besegtjVMJ"}],"key":"yTg9F3ezBn"},{"type":"paragraph","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"children":[{"type":"text","value":"To select an arm, we pick the arm with the highest ","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"vO1GnbTjHs"},{"type":"emphasis","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"children":[{"type":"text","value":"upper confidence bound","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"oaaasCamnn"}],"key":"xsfeuyQAkR"},{"type":"text","value":".","position":{"start":{"line":437,"column":1},"end":{"line":437,"column":1}},"key":"Knix3bXako"}],"key":"rup7SKhX0Q"}],"key":"vNCAWAZ5pi"},{"type":"paragraph","position":{"start":{"line":440,"column":1},"end":{"line":441,"column":1}},"children":[{"type":"text","value":"This means that, for each edge (corresponding to a state-action pair ","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"key":"IFALcoZds0"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"vIgtfBJGfS"},{"type":"text","value":") in the game tree,\nwe keep track of the statistics required to compute its UCB:","position":{"start":{"line":440,"column":1},"end":{"line":440,"column":1}},"key":"TQQ72bhlGv"}],"key":"t9Ch3mZDiH"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":443,"column":1},"end":{"line":446,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":443,"column":1},"end":{"line":443,"column":1}},"children":[{"type":"text","value":"How many times it has been “visited” (","position":{"start":{"line":443,"column":1},"end":{"line":443,"column":1}},"key":"c2pfHFgSqe"},{"type":"inlineMath","value":"N_t^{s, a}","position":{"start":{"line":443,"column":1},"end":{"line":443,"column":1}},"html":"Nts,aN_t^{s, a}Nts,a","key":"XRYaJLbaMl"},{"type":"text","value":")","position":{"start":{"line":443,"column":1},"end":{"line":443,"column":1}},"key":"WcAbzOpGfO"}],"key":"Vmpz9dYD1I"},{"type":"listItem","spread":true,"position":{"start":{"line":444,"column":1},"end":{"line":446,"column":1}},"children":[{"type":"text","value":"How many of those visits resulted in victory (","position":{"start":{"line":444,"column":1},"end":{"line":444,"column":1}},"key":"v5sGgblWtD"},{"type":"inlineMath","value":"\\sum_{\\tau=0}^{t-1} \\ind{(s_\\tau, a_\\tau) = (s, a)} r_\\tau","position":{"start":{"line":444,"column":1},"end":{"line":444,"column":1}},"html":"τ=0t11{(sτ,aτ)=(s,a)}rτ\\sum_{\\tau=0}^{t-1} \\ind{(s_\\tau, a_\\tau) = (s, a)} r_\\tauτ=0t11{(sτ,aτ)=(s,a)}rτ","key":"VelBmZ0YFy"},{"type":"text","value":").\nLet us call this latter value ","position":{"start":{"line":444,"column":1},"end":{"line":444,"column":1}},"key":"RQxAxgWwaA"},{"type":"inlineMath","value":"W^{s, a}_t","position":{"start":{"line":444,"column":1},"end":{"line":444,"column":1}},"html":"Wts,aW^{s, a}_tWts,a","key":"ixFwiMGB6h"},{"type":"text","value":" (for number of 
“wins”).","position":{"start":{"line":444,"column":1},"end":{"line":444,"column":1}},"key":"ZQmTSirNne"}],"key":"eS5FmIWjzc"}],"key":"zBS4xF4akO"},{"type":"paragraph","position":{"start":{"line":447,"column":1},"end":{"line":454,"column":1}},"children":[{"type":"text","value":"What does ","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"GT4atr4vvH"},{"type":"inlineMath","value":"t","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"html":"ttt","key":"usJQAZ9A8G"},{"type":"text","value":" refer to in the above expressions?\nRecall ","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"L6fBafFkAW"},{"type":"inlineMath","value":"t","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"html":"ttt","key":"Hio2KWH8m6"},{"type":"text","value":" refers to the number of time steps elapsed in the ","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"kH2dVyA8lb"},{"type":"emphasis","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"children":[{"type":"text","value":"bandit environment","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"YFc3RyQy8q"}],"key":"XtANzEzVBP"},{"type":"text","value":".\nAs mentioned above,\neach state ","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"KZ4SGY8KUj"},{"type":"inlineMath","value":"s","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"html":"sss","key":"JCC8gd4eIO"},{"type":"text","value":" corresponds to its own bandit environment,\nand so ","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"EosPlSqrdh"},{"type":"inlineMath","value":"t","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"html":"ttt","key":"p9MFl0kCfV"},{"type":"text","value":" refers to ","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"VfC10efWTo"},{"type":"inlineMath","value":"N^s","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"html":"NsN^sNs","key":"Oe1XTZvhTT"},{"type":"text","value":", that is,\nhow many actions have been taken from state ","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"Lt1zNI1QDQ"},{"type":"inlineMath","value":"s","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"html":"sss","key":"onMYfyP7Ps"},{"type":"text","value":".\nThis term, ","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"D1HuAMfPkA"},{"type":"inlineMath","value":"N^s","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"html":"NsN^sNs","key":"FITnZMWLA7"},{"type":"text","value":", gets incremented as the algorithm runs;\nfor simplicity, we won’t introduce another index to track how it changes.","position":{"start":{"line":447,"column":1},"end":{"line":447,"column":1}},"key":"Ms4AmpEefl"}],"key":"Wf0IAHzVwy"},{"type":"proof","kind":"algorithm","label":"mcts-algorithm","identifier":"mcts-algorithm","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Monte Carlo tree search 
algorithm","position":{"start":{"line":456,"column":1},"end":{"line":456,"column":1}},"key":"ZZVwwqxSWl"}],"key":"jpDxCCsh7R"},{"type":"paragraph","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"children":[{"type":"text","value":"Inputs:","position":{"start":{"line":459,"column":1},"end":{"line":459,"column":1}},"key":"zDjze9n6e9"}],"key":"Fv3WlPlelQ"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":460,"column":1},"end":{"line":463,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"children":[{"type":"inlineMath","value":"T","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"html":"TTT","key":"WGRmIzIzgV"},{"type":"text","value":", the number of iterations per move","position":{"start":{"line":460,"column":1},"end":{"line":460,"column":1}},"key":"KsKQPrYnhm"}],"key":"cmWfgpAfkF"},{"type":"listItem","spread":true,"position":{"start":{"line":461,"column":1},"end":{"line":461,"column":1}},"children":[{"type":"inlineMath","value":"\\pi_{\\text{rollout}}","position":{"start":{"line":461,"column":1},"end":{"line":461,"column":1}},"html":"πrollout\\pi_{\\text{rollout}}πrollout","key":"UxLPEU5tFY"},{"type":"text","value":", the ","position":{"start":{"line":461,"column":1},"end":{"line":461,"column":1}},"key":"cFPTjMWmdZ"},{"type":"strong","position":{"start":{"line":461,"column":1},"end":{"line":461,"column":1}},"children":[{"type":"text","value":"rollout policy","position":{"start":{"line":461,"column":1},"end":{"line":461,"column":1}},"key":"Qjxs12d3DG"}],"key":"hrTsN0Gql4"},{"type":"text","value":" for randomly sampling games","position":{"start":{"line":461,"column":1},"end":{"line":461,"column":1}},"key":"MhIJcqWudL"}],"key":"OW467ly0cN"},{"type":"listItem","spread":true,"position":{"start":{"line":462,"column":1},"end":{"line":463,"column":1}},"children":[{"type":"inlineMath","value":"c","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"html":"ccc","key":"jfWWPLMV4n"},{"type":"text","value":", a positive value that encourages exploration","position":{"start":{"line":462,"column":1},"end":{"line":462,"column":1}},"key":"F4rpTEBAOD"}],"key":"JEbDnS8Eq7"}],"key":"HBi8Hl4ppK"},{"type":"paragraph","position":{"start":{"line":464,"column":1},"end":{"line":468,"column":1}},"children":[{"type":"text","value":"To choose a single move starting at state ","position":{"start":{"line":464,"column":1},"end":{"line":464,"column":1}},"key":"n5kckwvmom"},{"type":"inlineMath","value":"s_{\\text{start}}","position":{"start":{"line":464,"column":1},"end":{"line":464,"column":1}},"html":"sstarts_{\\text{start}}sstart","key":"XouOT9ds1e"},{"type":"text","value":",\nMCTS first tries to estimate the UCB values for each of the possible actions ","position":{"start":{"line":464,"column":1},"end":{"line":464,"column":1}},"key":"oP59YA7UXj"},{"type":"inlineMath","value":"\\mathcal{A}(s_\\text{start})","position":{"start":{"line":464,"column":1},"end":{"line":464,"column":1}},"html":"A(sstart)\\mathcal{A}(s_\\text{start})A(sstart)","key":"kOImNLAvgw"},{"type":"text","value":",\nand then chooses the best one.\nTo estimate the UCB values,\nit repeats the following four steps ","position":{"start":{"line":464,"column":1},"end":{"line":464,"column":1}},"key":"Jcmw2pHrkD"},{"type":"inlineMath","value":"T","position":{"start":{"line":464,"column":1},"end":{"line":464,"column":1}},"html":"TTT","key":"OT16j5fBlV"},{"type":"text","value":" 
times:","position":{"start":{"line":464,"column":1},"end":{"line":464,"column":1}},"key":"SxKFRY32Ij"}],"key":"x4kZUinrG8"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":470,"column":1},"end":{"line":487,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":470,"column":1},"end":{"line":478,"column":1}},"children":[{"type":"strong","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"children":[{"type":"text","value":"Selection","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"GY5YmslbIq"}],"key":"oCdEXV25rl"},{"type":"text","value":": We start at ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"DfWT5Yqdmx"},{"type":"inlineMath","value":"s = s_{\\text{start}}","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"html":"s=sstarts = s_{\\text{start}}s=sstart","key":"hxeFSY7uhh"},{"type":"text","value":". Let ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"YWIMHxrYbB"},{"type":"text","value":"τ","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"iyRqTtvfwY"},{"type":"text","value":" be an empty list that we will use to track states and actions.","position":{"start":{"line":470,"column":1},"end":{"line":470,"column":1}},"key":"TCcQDYe3nO"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":471,"column":1},"end":{"line":478,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":471,"column":1},"end":{"line":478,"column":1}},"children":[{"type":"text","value":"Until ","position":{"start":{"line":471,"column":1},"end":{"line":471,"column":1}},"key":"MWrbRm5jUy"},{"type":"inlineMath","value":"s","position":{"start":{"line":471,"column":1},"end":{"line":471,"column":1}},"html":"sss","key":"gl9SYo7lIC"},{"type":"text","value":" has at least one action that hasn’t been taken:","position":{"start":{"line":471,"column":1},"end":{"line":471,"column":1}},"key":"Imj3jozuwr"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":472,"column":1},"end":{"line":478,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":472,"column":1},"end":{"line":476,"column":1}},"children":[{"type":"text","value":"Choose ","position":{"start":{"line":472,"column":1},"end":{"line":472,"column":1}},"key":"tSZBoXzd1V"},{"type":"inlineMath","value":"a \\gets \\argmax_k \\text{UCB}^{s, k}","position":{"start":{"line":472,"column":1},"end":{"line":472,"column":1}},"html":"aarg maxkUCBs,ka \\gets \\argmax_k \\text{UCB}^{s, k}aargmaxkUCBs,k","key":"aE83B9sCUQ"},{"type":"text","value":", where\n","position":{"start":{"line":472,"column":1},"end":{"line":472,"column":1}},"key":"j67R77srCU"},{"type":"math","value":"\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}","position":{"start":{"line":472,"column":1},"end":{"line":472,"column":1}},"identifier":"ucb-tree","label":"ucb-tree","html_id":"ucb-tree","html":"UCBs,a=Ws,aNs+clnNsNs,a\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}UCBs,a=NsWs,a+cNs,alnNs","enumerator":"8.4","key":"Fy6izhKQkU"}],"key":"Pzh6qLGymV"},{"type":"listItem","spread":true,"position":{"start":{"line":477,"column":1},"end":{"line":477,"column":1}},"children":[{"type":"text","value":"Append 
","position":{"start":{"line":477,"column":1},"end":{"line":477,"column":1}},"key":"LjMkdck2So"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":477,"column":1},"end":{"line":477,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"BVrJr93Exm"},{"type":"text","value":" to ","position":{"start":{"line":477,"column":1},"end":{"line":477,"column":1}},"key":"xxhxdFUPrT"},{"type":"text","value":"τ","position":{"start":{"line":477,"column":1},"end":{"line":477,"column":1}},"key":"e3XzmcYBmM"}],"key":"Hk79PqI9QT"},{"type":"listItem","spread":true,"position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"key":"oHDCY2Vdc4"},{"type":"inlineMath","value":"s \\gets P(s, a)","position":{"start":{"line":478,"column":1},"end":{"line":478,"column":1}},"html":"sP(s,a)s \\gets P(s, a)sP(s,a)","key":"HFaX7J6tzY"}],"key":"PzdkdhpIil"}],"key":"BcoGHKg6n7"}],"key":"yR6mtF1gZc"}],"key":"t05KZzUCjS"}],"key":"ssx291tFms"},{"type":"listItem","spread":true,"position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"children":[{"type":"strong","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"children":[{"type":"text","value":"Expansion","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"KbNhwUHQer"}],"key":"DevQ9f5tQv"},{"type":"text","value":": Let ","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"qpxUDu8N0x"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"html":"snews_\\text{new}snew","key":"umHAnFHIfm"},{"type":"text","value":" denote the final state in ","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"k4UINgg7j9"},{"type":"text","value":"τ","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"cKOHEwggzr"},{"type":"text","value":" (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from ","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"Mva3EyTIPl"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"html":"snews_\\text{new}snew","key":"n2nkjVT8u1"},{"type":"text","value":". Call it ","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"uQc7bPR497"},{"type":"inlineMath","value":"a_{\\text{new}}","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"html":"anewa_{\\text{new}}anew","key":"UeTEkMpczb"},{"type":"text","value":". 
Add it to ","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"GFTSvpHQlT"},{"type":"text","value":"τ","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"LFKUDNeUR6"},{"type":"text","value":".","position":{"start":{"line":479,"column":1},"end":{"line":479,"column":1}},"key":"Uhr4HxnJyD"}],"key":"fXYgZV2Dux"},{"type":"listItem","spread":true,"position":{"start":{"line":480,"column":1},"end":{"line":482,"column":1}},"children":[{"type":"strong","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"children":[{"type":"text","value":"Simulation","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"key":"NnGnznoqls"}],"key":"modydEsfTv"},{"type":"text","value":": Simulate a complete game episode by starting with the action ","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"key":"Tq9vb9kFsL"},{"type":"inlineMath","value":"a_{\\text{new}}","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"html":"anewa_{\\text{new}}anew","key":"Cbm4htWf14"},{"type":"text","value":"\nand then playing according to ","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"key":"dIfd6ZqD6B"},{"type":"inlineMath","value":"\\pi_\\text{rollout}","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"html":"πrollout\\pi_\\text{rollout}πrollout","key":"JTtXezslzL"},{"type":"text","value":".\nThis results in the outcome ","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"key":"rfbcfzNXMX"},{"type":"inlineMath","value":"r \\in \\{ +1, -1 \\}","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"html":"r{+1,1}r \\in \\{ +1, -1 \\}r{+1,1}","key":"hJ2yOIPGDu"},{"type":"text","value":".","position":{"start":{"line":480,"column":1},"end":{"line":480,"column":1}},"key":"GzL36pu5M3"}],"key":"zKdZmaOAHi"},{"type":"listItem","spread":true,"position":{"start":{"line":483,"column":1},"end":{"line":487,"column":1}},"children":[{"type":"strong","position":{"start":{"line":483,"column":1},"end":{"line":483,"column":1}},"children":[{"type":"text","value":"Backup","position":{"start":{"line":483,"column":1},"end":{"line":483,"column":1}},"key":"GKgjBskw5y"}],"key":"Rl9bEtsahX"},{"type":"text","value":": For each ","position":{"start":{"line":483,"column":1},"end":{"line":483,"column":1}},"key":"zGoZ1MHxB2"},{"type":"inlineMath","value":"(s, a) \\in \\tau","position":{"start":{"line":483,"column":1},"end":{"line":483,"column":1}},"html":"(s,a)τ(s, a) \\in \\tau(s,a)τ","key":"RVmVn05ZgH"},{"type":"text","value":":","position":{"start":{"line":483,"column":1},"end":{"line":483,"column":1}},"key":"PRjA5j7mjL"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":484,"column":1},"end":{"line":487,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":484,"column":1},"end":{"line":484,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":484,"column":1},"end":{"line":484,"column":1}},"key":"k29Pgsnif1"},{"type":"inlineMath","value":"N^{s, a} \\gets N^{s, a} + 1","position":{"start":{"line":484,"column":1},"end":{"line":484,"column":1}},"html":"Ns,aNs,a+1N^{s, a} \\gets N^{s, a} + 1Ns,aNs,a+1","key":"ETMkTIw91a"}],"key":"OxkTP4sh4A"},{"type":"listItem","spread":true,"position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"children":[{"type":"inlineMath","value":"W^{s, a} \\gets W^{s, a} + 
r","position":{"start":{"line":485,"column":1},"end":{"line":485,"column":1}},"html":"Ws,aWs,a+rW^{s, a} \\gets W^{s, a} + rWs,aWs,a+r","key":"ZFvSFjq0aj"}],"key":"INHY8Oi3sv"},{"type":"listItem","spread":true,"position":{"start":{"line":486,"column":1},"end":{"line":487,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"key":"vQsxygMcVu"},{"type":"inlineMath","value":"N^s \\gets N^s + 1","position":{"start":{"line":486,"column":1},"end":{"line":486,"column":1}},"html":"NsNs+1N^s \\gets N^s + 1NsNs+1","key":"yvBeh7fUB7"}],"key":"YZP6ZGNCRR"}],"key":"z9PchlPWkZ"}],"key":"jDV4QOZWl2"}],"key":"zbYdwTL17R"},{"type":"paragraph","position":{"start":{"line":488,"column":1},"end":{"line":490,"column":1}},"children":[{"type":"text","value":"After ","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"vaQe3qPU3T"},{"type":"inlineMath","value":"T","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"html":"TTT","key":"nhBmhj7EWB"},{"type":"text","value":" repeats of the above,\nwe return the action with the highest UCB value ","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"GNesHGgUil"},{"type":"crossReference","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"children":[{"type":"text","value":"(","key":"hg1hgBC3o5"},{"type":"text","value":"8.4","key":"g1SKCOzqvB"},{"type":"text","value":")","key":"RO3P3q6Xol"}],"identifier":"ucb-tree","label":"ucb-tree","kind":"equation","template":"(%s)","enumerator":"8.4","resolved":true,"html_id":"ucb-tree","key":"ZGKLnkzYRB"},{"type":"text","value":".\nThen play continues.","position":{"start":{"line":488,"column":1},"end":{"line":488,"column":1}},"key":"Lo2q4GKzwZ"}],"key":"E95c94BFpu"},{"type":"paragraph","position":{"start":{"line":492,"column":1},"end":{"line":493,"column":1}},"children":[{"type":"text","value":"Between turns, we can keep the subtree whose statistics we have visited so far.\nHowever, the rest of the tree for the actions we did ","position":{"start":{"line":492,"column":1},"end":{"line":492,"column":1}},"key":"LMAvJVhLuQ"},{"type":"emphasis","position":{"start":{"line":492,"column":1},"end":{"line":492,"column":1}},"children":[{"type":"text","value":"not","position":{"start":{"line":492,"column":1},"end":{"line":492,"column":1}},"key":"UnCyFEVBpY"}],"key":"lHiRcBKpRB"},{"type":"text","value":" end up taking gets discarded.","position":{"start":{"line":492,"column":1},"end":{"line":492,"column":1}},"key":"ByzfWzONKO"}],"key":"w0lnkMyfNT"}],"enumerator":"8.1","html_id":"mcts-algorithm","key":"wReh2OfZid"},{"type":"paragraph","position":{"start":{"line":496,"column":1},"end":{"line":497,"column":1}},"children":[{"type":"text","value":"The application which brought the MCTS algorithm to fame was DeepMind’s ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"dzfHIjJAQW"},{"type":"strong","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"children":[{"type":"text","value":"AlphaGo","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"grwpsXC9he"}],"key":"Myyv3ktIN5"},{"type":"text","value":" ","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"DRRjVrjrmP"},{"type":"cite","kind":"narrative","label":"silver_mastering_2016","identifier":"silver_mastering_2016","children":[{"type":"text","value":"Silver 
","key":"PeGMcaJ9Cq"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"PAwDcJ5rYJ"}],"key":"CmU6Kbhw2l"},{"type":"text","value":" (2016)","key":"z8Uf6xAhps"}],"enumerator":"1","key":"kxEWlYDWEQ"},{"type":"text","value":".\nSince then, it has been used in numerous applications ranging from games to automated theorem proving.","position":{"start":{"line":496,"column":1},"end":{"line":496,"column":1}},"key":"R3dxe4Lv1a"}],"key":"cCBvo4Xfkm"},{"type":"paragraph","position":{"start":{"line":499,"column":1},"end":{"line":502,"column":1}},"children":[{"type":"text","value":"How accurate is this Monte Carlo estimation?\nIt depends heavily on the rollout policy ","position":{"start":{"line":499,"column":1},"end":{"line":499,"column":1}},"key":"xB6fnEq5K7"},{"type":"inlineMath","value":"\\pi_\\text{rollout}","position":{"start":{"line":499,"column":1},"end":{"line":499,"column":1}},"html":"πrollout\\pi_\\text{rollout}πrollout","key":"MV6aX6a3Mk"},{"type":"text","value":".\nIf the distribution ","position":{"start":{"line":499,"column":1},"end":{"line":499,"column":1}},"key":"zyzHbOaPuj"},{"type":"inlineMath","value":"\\pi_\\text{rollout}","position":{"start":{"line":499,"column":1},"end":{"line":499,"column":1}},"html":"πrollout\\pi_\\text{rollout}πrollout","key":"cut5PAsw4T"},{"type":"text","value":" induces over games is very different from the distribution seen during real gameplay,\nwe might end up with a poor value approximation.","position":{"start":{"line":499,"column":1},"end":{"line":499,"column":1}},"key":"WgXnOQxSea"}],"key":"G7oHyhQT75"},{"type":"heading","depth":3,"position":{"start":{"line":504,"column":1},"end":{"line":504,"column":1}},"children":[{"type":"text","value":"Incorporating value functions and policies","position":{"start":{"line":504,"column":1},"end":{"line":504,"column":1}},"key":"UI4UILQrBh"}],"identifier":"incorporating-value-functions-and-policies","label":"Incorporating value functions and policies","html_id":"incorporating-value-functions-and-policies","implicit":true,"enumerator":"8.5.1","key":"UTuOLzs15X"},{"type":"paragraph","position":{"start":{"line":506,"column":1},"end":{"line":508,"column":1}},"children":[{"type":"text","value":"To remedy this,\nwe might make use of a value function ","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"rmOzWIBhzk"},{"type":"inlineMath","value":"v : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"html":"v:SRv : \\mathcal{S} \\to \\mathbb{R}v:SR","key":"fPouNEslQL"},{"type":"text","value":" that more efficiently approximates the value of a state.\nThen, we can replace the simulation step of ","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"BCysyBkf4b"},{"type":"crossReference","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"children":[{"type":"text","value":"MCTS","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"MfoimqoDSC"}],"identifier":"mcts-algorithm","label":"mcts-algorithm","kind":"proof:algorithm","template":"Algorithm %s","enumerator":"8.1","resolved":true,"html_id":"mcts-algorithm","key":"KwyWdqIFBy"},{"type":"text","value":" with evaluating ","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"cyWq6x8je8"},{"type":"inlineMath","value":"r = v(s_\\text{next})","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"html":"r=v(snext)r = 
v(s_\\text{next})r=v(snext)","key":"DaQyHtMJA5"},{"type":"text","value":", where ","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"jkldp2bqzn"},{"type":"inlineMath","value":"s_\\text{next} = P(s_\\text{new}, a_\\text{new})","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"html":"snext=P(snew,anew)s_\\text{next} = P(s_\\text{new}, a_\\text{new})snext=P(snew,anew)","key":"oxVo02L5fP"},{"type":"text","value":".","position":{"start":{"line":506,"column":1},"end":{"line":506,"column":1}},"key":"eRimKChwDu"}],"key":"xyBniXLrsl"},{"type":"paragraph","position":{"start":{"line":510,"column":1},"end":{"line":511,"column":1}},"children":[{"type":"text","value":"We might also make use of a ","position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"key":"jHWrATCWiv"},{"type":"strong","position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"children":[{"type":"text","value":"“guiding” policy","position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"key":"bx2AGrtlFE"}],"key":"pkYFlqlxh5"},{"type":"text","value":" ","position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"key":"it559iFFDl"},{"type":"inlineMath","value":"\\pi_\\text{guide} : \\mathcal{S} \\to \\triangle(\\mathcal{A})","position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"html":"πguide:S(A)\\pi_\\text{guide} : \\mathcal{S} \\to \\triangle(\\mathcal{A})πguide:S(A)","key":"cVce3wekSV"},{"type":"text","value":" that provides “intuition” as to which actions are more valuable in a given state.\nWe can scale the exploration term of ","position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"key":"EhrVEYD18i"},{"type":"crossReference","position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"children":[{"type":"text","value":"(","key":"qU2HTvK9mI"},{"type":"text","value":"8.4","key":"RUOPKDOXKv"},{"type":"text","value":")","key":"TjqKltwDVd"}],"identifier":"ucb-tree","label":"ucb-tree","kind":"equation","template":"(%s)","enumerator":"8.4","resolved":true,"html_id":"ucb-tree","key":"xyM6E4Qw1h"},{"type":"text","value":" according to the policy’s outputs.","position":{"start":{"line":510,"column":1},"end":{"line":510,"column":1}},"key":"vt0lrk2OVL"}],"key":"xwbbbyY1I4"},{"type":"paragraph","position":{"start":{"line":513,"column":1},"end":{"line":514,"column":1}},"children":[{"type":"text","value":"Putting these together,\nwe can describe an updated version of MCTS that makes use of these value functions and policy:","position":{"start":{"line":513,"column":1},"end":{"line":513,"column":1}},"key":"xVtksYmBCY"}],"key":"z6RBUuVJ8E"},{"type":"proof","kind":"algorithm","label":"mcts-policy-value","identifier":"mcts-policy-value","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Monte Carlo tree search with policy and value 
functions","position":{"start":{"line":516,"column":1},"end":{"line":516,"column":1}},"key":"Qe90eg8JIh"}],"key":"vsWryZ7XyZ"},{"type":"paragraph","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"children":[{"type":"text","value":"Inputs:","position":{"start":{"line":519,"column":1},"end":{"line":519,"column":1}},"key":"IAx2FjSNTF"}],"key":"XARZpyzqQK"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":520,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":520,"column":1},"end":{"line":520,"column":1}},"children":[{"type":"inlineMath","value":"T","position":{"start":{"line":520,"column":1},"end":{"line":520,"column":1}},"html":"TTT","key":"ZQMlPKYyRJ"},{"type":"text","value":", the number of iterations per move","position":{"start":{"line":520,"column":1},"end":{"line":520,"column":1}},"key":"W1stSGmOn4"}],"key":"mLFu3JsxSj"},{"type":"listItem","spread":true,"position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"children":[{"type":"inlineMath","value":"v","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"html":"vvv","key":"OVOhZeSsMt"},{"type":"text","value":", a value function that evaluates how good a state is","position":{"start":{"line":521,"column":1},"end":{"line":521,"column":1}},"key":"FBlDvsQQV3"}],"key":"ogFJ0qGcei"},{"type":"listItem","spread":true,"position":{"start":{"line":522,"column":1},"end":{"line":522,"column":1}},"children":[{"type":"inlineMath","value":"\\pi_\\text{guide}","position":{"start":{"line":522,"column":1},"end":{"line":522,"column":1}},"html":"πguide\\pi_\\text{guide}πguide","key":"CIohosbuf4"},{"type":"text","value":", a guiding policy that encourages certain actions","position":{"start":{"line":522,"column":1},"end":{"line":522,"column":1}},"key":"ctIuBgDu2a"}],"key":"pt7Q8nhrL7"},{"type":"listItem","spread":true,"position":{"start":{"line":523,"column":1},"end":{"line":524,"column":1}},"children":[{"type":"inlineMath","value":"c","position":{"start":{"line":523,"column":1},"end":{"line":523,"column":1}},"html":"ccc","key":"zWCnNmAwUA"},{"type":"text","value":", a positive value that encourages exploration","position":{"start":{"line":523,"column":1},"end":{"line":523,"column":1}},"key":"PsFLBvEypF"}],"key":"WdDJj1ciY0"}],"key":"Mgi1Y7A6TI"},{"type":"paragraph","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"children":[{"type":"text","value":"To select a move in state ","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"ISt7RT1CSt"},{"type":"inlineMath","value":"s_\\text{start}","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"html":"sstarts_\\text{start}sstart","key":"GkcfHD9wvt"},{"type":"text","value":", we repeat the following four steps ","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"ZpwnKJq8Ja"},{"type":"inlineMath","value":"T","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"html":"TTT","key":"aEEXShc3Jg"},{"type":"text","value":" 
times:","position":{"start":{"line":525,"column":1},"end":{"line":525,"column":1}},"key":"qzbRXu7HgJ"}],"key":"EEL9CWTpwM"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":527,"column":1},"end":{"line":542,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":527,"column":1},"end":{"line":535,"column":1}},"children":[{"type":"strong","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"children":[{"type":"text","value":"Selection","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"E1B2WAeUpD"}],"key":"lUodEX89NL"},{"type":"text","value":": We start at ","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"vuBeusaCHr"},{"type":"inlineMath","value":"s = s_{\\text{start}}","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"html":"s=sstarts = s_{\\text{start}}s=sstart","key":"vQ4nKazK4c"},{"type":"text","value":". Let ","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"urSBvwtvIE"},{"type":"text","value":"τ","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"eWbGPjxYL3"},{"type":"text","value":" be an empty list that we will use to track states and actions.","position":{"start":{"line":527,"column":1},"end":{"line":527,"column":1}},"key":"o6sFPdnJiR"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":528,"column":1},"end":{"line":535,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":528,"column":1},"end":{"line":535,"column":1}},"children":[{"type":"text","value":"Until ","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"XKQ4pjvsvy"},{"type":"inlineMath","value":"s","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"html":"sss","key":"zD4Uxyla6c"},{"type":"text","value":" has at least one action that hasn’t been taken:","position":{"start":{"line":528,"column":1},"end":{"line":528,"column":1}},"key":"FU7rC756a2"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":529,"column":1},"end":{"line":535,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":529,"column":1},"end":{"line":533,"column":1}},"children":[{"type":"text","value":"Choose ","position":{"start":{"line":529,"column":1},"end":{"line":529,"column":1}},"key":"j3G1dfjr88"},{"type":"inlineMath","value":"a \\gets \\argmax_k \\text{UCB}^{s, k}","position":{"start":{"line":529,"column":1},"end":{"line":529,"column":1}},"html":"aarg maxkUCBs,ka \\gets \\argmax_k \\text{UCB}^{s, k}aargmaxkUCBs,k","key":"c0hvBIK0nb"},{"type":"text","value":", where\n","position":{"start":{"line":529,"column":1},"end":{"line":529,"column":1}},"key":"Dqxx2cje62"},{"type":"math","value":"\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\cdot \\pi_\\text{guide}(a \\mid s) \\sqrt{\\frac{\\ln N^s}{N^{s, a}}}","position":{"start":{"line":529,"column":1},"end":{"line":529,"column":1}},"identifier":"ucb-tree-policy","label":"ucb-tree-policy","html_id":"ucb-tree-policy","html":"UCBs,a=Ws,aNs+cπguide(as)lnNsNs,a\\text{UCB}^{s, a} = \\frac{W^{s, a}}{N^s} + c \\cdot \\pi_\\text{guide}(a \\mid s) \\sqrt{\\frac{\\ln N^s}{N^{s, 
a}}}UCBs,a=NsWs,a+cπguide(as)Ns,alnNs","enumerator":"8.5","key":"nCVGWKVWXd"}],"key":"GykwoBsnxS"},{"type":"listItem","spread":true,"position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"children":[{"type":"text","value":"Append ","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"key":"rwESrj8os4"},{"type":"inlineMath","value":"(s, a)","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"html":"(s,a)(s, a)(s,a)","key":"W1bJwEif7u"},{"type":"text","value":" to ","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"key":"DHzdwBseEv"},{"type":"text","value":"τ","position":{"start":{"line":534,"column":1},"end":{"line":534,"column":1}},"key":"Gc68EA3ro5"}],"key":"lXGvb08EMi"},{"type":"listItem","spread":true,"position":{"start":{"line":535,"column":1},"end":{"line":535,"column":1}},"children":[{"type":"text","value":"Set ","position":{"start":{"line":535,"column":1},"end":{"line":535,"column":1}},"key":"z3XzFCVojM"},{"type":"inlineMath","value":"s \\gets P(s, a)","position":{"start":{"line":535,"column":1},"end":{"line":535,"column":1}},"html":"sP(s,a)s \\gets P(s, a)sP(s,a)","key":"C7zHb9od1B"}],"key":"DCVsJabQtx"}],"key":"u3BKPUcd3I"}],"key":"jt5SZYXvAm"}],"key":"i02VRN4FZh"}],"key":"PY6ro9eFOD"},{"type":"listItem","spread":true,"position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"children":[{"type":"strong","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"children":[{"type":"text","value":"Expansion","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"giP461wlrs"}],"key":"XxnrZv2gk4"},{"type":"text","value":": Let ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"Qcxw9KUbJq"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"snews_\\text{new}snew","key":"lQEiqgfGzJ"},{"type":"text","value":" denote the final state in ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"zcl0WF1jnY"},{"type":"text","value":"τ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"dcTqX8EJvR"},{"type":"text","value":" (that has at least one action that hasn’t been taken). Choose one of these unexplored actions from ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"La4gT56AKN"},{"type":"inlineMath","value":"s_\\text{new}","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"snews_\\text{new}snew","key":"udsjVbXmR0"},{"type":"text","value":". Call it ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"RRQ7uACBM1"},{"type":"inlineMath","value":"a_{\\text{new}}","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"html":"anewa_{\\text{new}}anew","key":"yJaeg5IYJ9"},{"type":"text","value":". 
Add it to ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"uPu6NuYJqX"},{"type":"text","value":"τ","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"Zpif9kiSYN"},{"type":"text","value":".","position":{"start":{"line":536,"column":1},"end":{"line":536,"column":1}},"key":"N9JLxi97mZ"}],"key":"JM3iHUhRkK"},{"type":"listItem","spread":true,"position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"children":[{"type":"strong","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"children":[{"type":"text","value":"Simulation","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"eecginK5S4"}],"key":"WbhV0YZWal"},{"type":"text","value":": Let ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"hpQUB6HyYw"},{"type":"inlineMath","value":"s_\\text{next} = P(s_\\text{new}, a_\\text{new})","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"snext=P(snew,anew)s_\\text{next} = P(s_\\text{new}, a_\\text{new})snext=P(snew,anew)","key":"MesuLVR4QX"},{"type":"text","value":". Evaluate ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"hUs7qSEFXs"},{"type":"inlineMath","value":"r = v(s_\\text{next})","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"r=v(snext)r = v(s_\\text{next})r=v(snext)","key":"BZCSs6qnNv"},{"type":"text","value":". This approximates the value of the game after taking the action ","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"qVEoSuaiZA"},{"type":"inlineMath","value":"a_\\text{new}","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"html":"anewa_\\text{new}anew","key":"kTFzdQwoXD"},{"type":"text","value":".","position":{"start":{"line":537,"column":1},"end":{"line":537,"column":1}},"key":"A3dPj8dSTh"}],"key":"YYCw3YdKN5"},{"type":"listItem","spread":true,"position":{"start":{"line":538,"column":1},"end":{"line":542,"column":1}},"children":[{"type":"strong","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"children":[{"type":"text","value":"Backup","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"key":"nTcccbRlsn"}],"key":"iQGmPbPyrE"},{"type":"text","value":": For each ","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"key":"wcM5obA7iG"},{"type":"inlineMath","value":"(s, a) \\in \\tau","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"html":"(s,a)τ(s, a) \\in \\tau(s,a)τ","key":"tsu8D9rvGV"},{"type":"text","value":":","position":{"start":{"line":538,"column":1},"end":{"line":538,"column":1}},"key":"xCitzujC9q"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":539,"column":1},"end":{"line":542,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"children":[{"type":"inlineMath","value":"N^{s, a} \\gets N^{s, a} + 1","position":{"start":{"line":539,"column":1},"end":{"line":539,"column":1}},"html":"Ns,aNs,a+1N^{s, a} \\gets N^{s, a} + 1Ns,aNs,a+1","key":"tYDvosAvFn"}],"key":"IgdpQnUGuD"},{"type":"listItem","spread":true,"position":{"start":{"line":540,"column":1},"end":{"line":540,"column":1}},"children":[{"type":"inlineMath","value":"W^{s, a} \\gets W^{s, a} + r","position":{"start":{"line":540,"column":1},"end":{"line":540,"column":1}},"html":"Ws,aWs,a+rW^{s, a} 
\\gets W^{s, a} + rWs,aWs,a+r","key":"lXrrs7bOw3"}],"key":"PzxVInIKHl"},{"type":"listItem","spread":true,"position":{"start":{"line":541,"column":1},"end":{"line":542,"column":1}},"children":[{"type":"inlineMath","value":"N^s \\gets N^s + 1","position":{"start":{"line":541,"column":1},"end":{"line":541,"column":1}},"html":"NsNs+1N^s \\gets N^s + 1NsNs+1","key":"yxdMEJY9R1"}],"key":"xUAkvXAq8x"}],"key":"isbZdBabCp"}],"key":"GkM7JMFeqj"}],"key":"mTQv13OKBf"},{"type":"paragraph","position":{"start":{"line":543,"column":1},"end":{"line":544,"column":1}},"children":[{"type":"text","value":"We finally return the action with the highest UCB value ","position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"key":"lAVsskep2Z"},{"type":"crossReference","position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"children":[{"type":"text","value":"(","key":"naVKjq10Nh"},{"type":"text","value":"8.5","key":"QKMPu3th21"},{"type":"text","value":")","key":"jIKHXYVL0Z"}],"identifier":"ucb-tree-policy","label":"ucb-tree-policy","kind":"equation","template":"(%s)","enumerator":"8.5","resolved":true,"html_id":"ucb-tree-policy","key":"AboNRnRN4e"},{"type":"text","value":".\nThen play continues. As before, we can reuse the tree across timesteps.","position":{"start":{"line":543,"column":1},"end":{"line":543,"column":1}},"key":"uFZnjYQ3Qh"}],"key":"VpMjwdnEIQ"}],"enumerator":"8.2","html_id":"mcts-policy-value","key":"kDNzgoRsds"},{"type":"paragraph","position":{"start":{"line":547,"column":1},"end":{"line":553,"column":1}},"children":[{"type":"text","value":"How do we actually compute a useful ","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"SkmKgNyIEK"},{"type":"inlineMath","value":"\\pi_\\text{guide}","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"html":"πguide\\pi_\\text{guide}πguide","key":"SQ1cW5N5xK"},{"type":"text","value":" and ","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"nDxsil8R5Q"},{"type":"inlineMath","value":"v","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"html":"vvv","key":"JVd9TejnuL"},{"type":"text","value":"?\nIf we have some existing dataset of trajectories,\nwe could use ","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"MrBO1UWn5x"},{"type":"link","url":"/imitation-learning","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"children":[{"type":"text","value":"supervised learning","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"amLx3KzXGM"}],"urlSource":"./imitation_learning.md","dataUrl":"/imitation-learning.json","internal":true,"protocol":"file","key":"fqRQvQCHIo"},{"type":"text","value":" (that is, imitation learning)\nto generate a policy ","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"HPjWUOTkRT"},{"type":"inlineMath","value":"\\pi_\\text{guide}","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"html":"πguide\\pi_\\text{guide}πguide","key":"pRm8Ux0Hzo"},{"type":"text","value":" via behavioral cloning\nand learn ","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"v9TtUejVG5"},{"type":"inlineMath","value":"v","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"html":"vvv","key":"IdR9XJSWfc"},{"type":"text","value":" by regressing the game outcomes onto states.\nThen, plugging these into 
","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"MZR74RgZDy"},{"type":"crossReference","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"children":[{"type":"text","value":"the above algorithm","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"b9dMWyFhop"}],"identifier":"mcts-policy-value","label":"mcts-policy-value","kind":"proof:algorithm","template":"Algorithm %s","enumerator":"8.2","resolved":true,"html_id":"mcts-policy-value","key":"LMDe8uE4NE"},{"type":"text","value":"\nresults in a stronger policy by using tree search to “think ahead”.","position":{"start":{"line":547,"column":1},"end":{"line":547,"column":1}},"key":"s1rDClPASm"}],"key":"RMKes2L4W2"},{"type":"paragraph","position":{"start":{"line":555,"column":1},"end":{"line":556,"column":1}},"children":[{"type":"text","value":"But we don’t have to stop at just one improvement step;\nwe could iterate this process via ","position":{"start":{"line":555,"column":1},"end":{"line":555,"column":1}},"key":"c8wGUY00vG"},{"type":"strong","position":{"start":{"line":555,"column":1},"end":{"line":555,"column":1}},"children":[{"type":"text","value":"self-play","position":{"start":{"line":555,"column":1},"end":{"line":555,"column":1}},"key":"EMYsNnhXmb"}],"key":"UejngwKfPT"},{"type":"text","value":".","position":{"start":{"line":555,"column":1},"end":{"line":555,"column":1}},"key":"J5MXlTTbPa"}],"key":"Xf54sP74bR"},{"type":"heading","depth":3,"position":{"start":{"line":558,"column":1},"end":{"line":558,"column":1}},"children":[{"type":"text","value":"Self-play","position":{"start":{"line":558,"column":1},"end":{"line":558,"column":1}},"key":"ITrLXhBuEs"}],"identifier":"self-play","label":"Self-play","html_id":"self-play","implicit":true,"enumerator":"8.5.2","key":"cZuRypoKzF"},{"type":"paragraph","position":{"start":{"line":560,"column":1},"end":{"line":570,"column":1}},"children":[{"type":"text","value":"Recall the ","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"nEbwpUzMop"},{"type":"crossReference","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"children":[{"type":"text","value":"policy iteration","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"GTi3PBfymy"}],"identifier":"policy_iteration","label":"policy_iteration","kind":"heading","template":"Section %s","enumerator":"1.5.3.2","resolved":true,"html_id":"policy-iteration","remote":true,"url":"/mdps","dataUrl":"/mdps.json","key":"ZtgFtGEapo"},{"type":"text","value":" algorithm from the ","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"nqRIWxMLtA"},{"type":"link","url":"/mdps","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"children":[{"type":"text","value":"MDPs","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"JZoYoV34Ma"}],"urlSource":"./mdps.md","dataUrl":"/mdps.json","internal":true,"protocol":"file","key":"hFw4m5RESA"},{"type":"text","value":" chapter.\nPolicy iteration alternates between ","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"V9QO5yq3Oa"},{"type":"strong","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"children":[{"type":"text","value":"policy evaluation","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"c2Z7jl698T"}],"key":"FaUe7UG2e0"},{"type":"text","value":" (taking 
","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"Sc190b6bLv"},{"type":"text","value":"π","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"Vr9xMQO0Mb"},{"type":"text","value":" and computing ","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"GBpQi2L4OR"},{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"html":"VπV^\\piVπ","key":"B1cYWDJXxl"},{"type":"text","value":")\nand ","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"vePPZReeno"},{"type":"strong","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"children":[{"type":"text","value":"policy improvement","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"dWSxvtxuNL"}],"key":"MmlSzWu3pG"},{"type":"text","value":" (setting ","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"NK15O0hvAn"},{"type":"text","value":"π","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"esZd1ZS8m0"},{"type":"text","value":" to be greedy with respect to ","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"F8QtT6anx1"},{"type":"inlineMath","value":"V^\\pi","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"html":"VπV^\\piVπ","key":"hepPJlsmvH"},{"type":"text","value":").\nAbove, we saw how MCTS can be thought of as a “policy improvement” operation:\nfor a given policy ","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"tlYCpR6da2"},{"type":"inlineMath","value":"\\pi^0","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"html":"π0\\pi^0π0","key":"NPIih6jZ2q"},{"type":"text","value":",\nwe can use it to guide MCTS,\nresulting in an algorithm that is itself a policy ","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"XANe5vDeu0"},{"type":"inlineMath","value":"\\pi^0_\\text{MCTS}","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"html":"πMCTS0\\pi^0_\\text{MCTS}πMCTS0","key":"UEOpxzjcmt"},{"type":"text","value":" that maps from states to actions.\nNow, we can use ","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"yF8308cRGN"},{"type":"link","url":"/imitation-learning","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"children":[{"type":"text","value":"behavioral cloning","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"Qbm3aayKHJ"}],"urlSource":"./imitation_learning.md","dataUrl":"/imitation-learning.json","internal":true,"protocol":"file","key":"qJlSFxGBLO"},{"type":"text","value":"\nto obtain a new policy ","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"re0y2oIH9m"},{"type":"inlineMath","value":"\\pi^1","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"html":"π1\\pi^1π1","key":"SAg9lfppJv"},{"type":"text","value":" that imitates ","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"p2KRYvz4rX"},{"type":"inlineMath","value":"\\pi^0_\\text{MCTS}","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"html":"πMCTS0\\pi^0_\\text{MCTS}πMCTS0","key":"QuK9L5M52B"},{"type":"text","value":".\nWe can now use 
","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"nYXX5PyoRh"},{"type":"inlineMath","value":"\\pi^1","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"html":"π1\\pi^1π1","key":"gtSi9gFqXn"},{"type":"text","value":" to guide MCTS,\nand repeat.","position":{"start":{"line":560,"column":1},"end":{"line":560,"column":1}},"key":"IjQ5ckrxkO"}],"key":"ua746GqkQ5"},{"type":"proof","kind":"algorithm","label":"mcts-self-play","identifier":"mcts-self-play","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"MCTS with self-play","position":{"start":{"line":572,"column":1},"end":{"line":572,"column":1}},"key":"z1jA5xyjeE"}],"key":"QdYJvI82vw"},{"type":"paragraph","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"children":[{"type":"text","value":"Input:","position":{"start":{"line":575,"column":1},"end":{"line":575,"column":1}},"key":"R7KBoLFRTR"}],"key":"wn12RbLZdM"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":577,"column":1},"end":{"line":581,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":577,"column":1},"end":{"line":577,"column":1}},"children":[{"type":"text","value":"A parameterized policy class ","position":{"start":{"line":577,"column":1},"end":{"line":577,"column":1}},"key":"QlznzJzBur"},{"type":"inlineMath","value":"\\pi_\\theta : \\mathcal{S} \\to \\triangle(\\mathcal{A})","position":{"start":{"line":577,"column":1},"end":{"line":577,"column":1}},"html":"πθ:S(A)\\pi_\\theta : \\mathcal{S} \\to \\triangle(\\mathcal{A})πθ:S(A)","key":"BdNNkVAsDw"}],"key":"E1Z1yhLPYR"},{"type":"listItem","spread":true,"position":{"start":{"line":578,"column":1},"end":{"line":578,"column":1}},"children":[{"type":"text","value":"A parameterized value function class ","position":{"start":{"line":578,"column":1},"end":{"line":578,"column":1}},"key":"VTAFAzlbfJ"},{"type":"inlineMath","value":"v_\\lambda : \\mathcal{S} \\to \\mathbb{R}","position":{"start":{"line":578,"column":1},"end":{"line":578,"column":1}},"html":"vλ:SRv_\\lambda : \\mathcal{S} \\to \\mathbb{R}vλ:SR","key":"gp0Kk2DPmC"}],"key":"Aj3QIxLWZ3"},{"type":"listItem","spread":true,"position":{"start":{"line":579,"column":1},"end":{"line":579,"column":1}},"children":[{"type":"text","value":"A number of trajectories ","position":{"start":{"line":579,"column":1},"end":{"line":579,"column":1}},"key":"OOGvsAlTCz"},{"type":"inlineMath","value":"M","position":{"start":{"line":579,"column":1},"end":{"line":579,"column":1}},"html":"MMM","key":"idnBHyJp4h"},{"type":"text","value":" to generate","position":{"start":{"line":579,"column":1},"end":{"line":579,"column":1}},"key":"pzsYPRYwmh"}],"key":"PjlmK9VhX9"},{"type":"listItem","spread":true,"position":{"start":{"line":580,"column":1},"end":{"line":581,"column":1}},"children":[{"type":"text","value":"The initial parameters ","position":{"start":{"line":580,"column":1},"end":{"line":580,"column":1}},"key":"ArUjwL59Ra"},{"type":"inlineMath","value":"\\theta^0, \\lambda^0","position":{"start":{"line":580,"column":1},"end":{"line":580,"column":1}},"html":"θ0,λ0\\theta^0, \\lambda^0θ0,λ0","key":"kl0fIf8pLW"}],"key":"rMBahpWZB2"}],"key":"ZEYdyItlG8"},{"type":"paragraph","position":{"start":{"line":582,"column":1},"end":{"line":582,"column":1}},"children":[{"type":"text","value":"For ","position":{"start":{"line":582,"column":1},"end":{"line":582,"column":1}},"key":"DPmDhD8nVu"},{"type":"inlineMath","value":"t = 0, \\dots, 
T-1","position":{"start":{"line":582,"column":1},"end":{"line":582,"column":1}},"html":"t=0,,T1t = 0, \\dots, T-1t=0,,T1","key":"rebssUi2To"},{"type":"text","value":":","position":{"start":{"line":582,"column":1},"end":{"line":582,"column":1}},"key":"fsTSj9TZiF"}],"key":"fhB7okhdlv"},{"type":"list","ordered":false,"spread":false,"position":{"start":{"line":584,"column":1},"end":{"line":590,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"children":[{"type":"strong","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"children":[{"type":"text","value":"Policy improvement","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"key":"wPbVuiSXXt"}],"key":"lcJ7lz6VYE"},{"type":"text","value":": Let ","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"key":"Oqqo1xD4fF"},{"type":"inlineMath","value":"\\pi^t_\\text{MCTS}","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"html":"πMCTSt\\pi^t_\\text{MCTS}πMCTSt","key":"gnTNEhZaqN"},{"type":"text","value":" denote the policy obtained by ","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"key":"E2icA7aPiA"},{"type":"crossReference","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"children":[{"type":"text","value":"Algorithm ","key":"GiWjdU1rsN"},{"type":"text","value":"8.2","key":"IbHWG8hMME"}],"identifier":"mcts-policy-value","label":"mcts-policy-value","kind":"proof:algorithm","template":"Algorithm %s","enumerator":"8.2","resolved":true,"html_id":"mcts-policy-value","key":"rKkciOJZvJ"},{"type":"text","value":" with ","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"key":"Tj2fk4YS2X"},{"type":"inlineMath","value":"\\pi_{\\theta^t}","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"html":"πθt\\pi_{\\theta^t}πθt","key":"eMBowDXAa7"},{"type":"text","value":" and ","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"key":"Vok4ZYGLPc"},{"type":"inlineMath","value":"v_{\\lambda^t}","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"html":"vλtv_{\\lambda^t}vλt","key":"hPrVK41zrN"},{"type":"text","value":". We use ","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"key":"CkjT611v0E"},{"type":"inlineMath","value":"\\pi^t_\\text{MCTS}","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"html":"πMCTSt\\pi^t_\\text{MCTS}πMCTSt","key":"aj8CSSMywd"},{"type":"text","value":" to play against itself ","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"key":"FRBhc2Dl5K"},{"type":"inlineMath","value":"M","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"html":"MMM","key":"APSt1tPLsF"},{"type":"text","value":" times. 
This generates ","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"key":"Krn3K4cZaC"},{"type":"inlineMath","value":"M","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"html":"MMM","key":"G19ZrQu4Xo"},{"type":"text","value":" trajectories ","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"key":"gzErhDLUxV"},{"type":"inlineMath","value":"\\tau_0, \\dots, \\tau_{M-1}","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"html":"τ0,,τM1\\tau_0, \\dots, \\tau_{M-1}τ0,,τM1","key":"eyAN1zqsGu"},{"type":"text","value":".","position":{"start":{"line":584,"column":1},"end":{"line":584,"column":1}},"key":"yGKIAXzpy3"}],"key":"jGoqIieHyK"},{"type":"listItem","spread":true,"position":{"start":{"line":585,"column":1},"end":{"line":590,"column":1}},"children":[{"type":"strong","position":{"start":{"line":585,"column":1},"end":{"line":585,"column":1}},"children":[{"type":"text","value":"Policy evaluation","position":{"start":{"line":585,"column":1},"end":{"line":585,"column":1}},"key":"PMUIfIu6Q1"}],"key":"WIKyhuK5zj"},{"type":"text","value":": Use behavioral cloning to find a set of policy parameters ","position":{"start":{"line":585,"column":1},"end":{"line":585,"column":1}},"key":"C3MRk0HHhI"},{"type":"inlineMath","value":"\\theta^{t+1}","position":{"start":{"line":585,"column":1},"end":{"line":585,"column":1}},"html":"θt+1\\theta^{t+1}θt+1","key":"AfL7ejbnue"},{"type":"text","value":" that mimic the behavior of ","position":{"start":{"line":585,"column":1},"end":{"line":585,"column":1}},"key":"tUhazQ7t00"},{"type":"inlineMath","value":"\\pi^t_\\text{MCTS}","position":{"start":{"line":585,"column":1},"end":{"line":585,"column":1}},"html":"πMCTSt\\pi^t_\\text{MCTS}πMCTSt","key":"B2UUcVBWqF"},{"type":"text","value":" and a set of value function parameters ","position":{"start":{"line":585,"column":1},"end":{"line":585,"column":1}},"key":"d37opKeMXE"},{"type":"inlineMath","value":"\\lambda^{t+1}","position":{"start":{"line":585,"column":1},"end":{"line":585,"column":1}},"html":"λt+1\\lambda^{t+1}λt+1","key":"m8B0IfbR4U"},{"type":"text","value":" that approximate its value function. 
That is,","position":{"start":{"line":585,"column":1},"end":{"line":585,"column":1}},"key":"MGc2R5qACe"},{"type":"math","tight":"before","value":"\\begin{align*}\n \\theta^{t+1} &\\gets \\argmin_\\theta \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} - \\log \\pi_\\theta(a^m_\\hi \\mid s^m_\\hi) \\\\\n \\lambda^{t+1} &\\gets \\argmin_\\lambda \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} (v_\\lambda(s^m_\\hi) - R(\\tau_m))^2\n \\end{align*}","position":{"start":{"line":586,"column":1},"end":{"line":588,"column":1}},"html":"θt+1arg minθm=0M1h=0H1logπθ(ahmshm)λt+1arg minλm=0M1h=0H1(vλ(shm)R(τm))2\\begin{align*}\n \\theta^{t+1} &\\gets \\argmin_\\theta \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} - \\log \\pi_\\theta(a^m_\\hi \\mid s^m_\\hi) \\\\\n \\lambda^{t+1} &\\gets \\argmin_\\lambda \\sum_{m=0}^{M-1} \\sum_{\\hi=0}^{H-1} (v_\\lambda(s^m_\\hi) - R(\\tau_m))^2\n \\end{align*}θt+1λt+1θargminm=0M1h=0H1logπθ(ahmshm)λargminm=0M1h=0H1(vλ(shm)R(τm))2","enumerator":"8.6","key":"kBJNmJXDSu"}],"key":"G83s0AFJXu"}],"key":"gnXol4SaGz"},{"type":"paragraph","position":{"start":{"line":591,"column":1},"end":{"line":594,"column":1}},"children":[{"type":"text","value":"Note that in implementation,\nthe policy and value are typically both returned by a single deep neural network,\nthat is, with a single set of parameters,\nand the two loss functions are added together.","position":{"start":{"line":591,"column":1},"end":{"line":591,"column":1}},"key":"XXDK1FFORO"}],"key":"gvkYuCer3i"}],"enumerator":"8.3","html_id":"mcts-self-play","key":"lVv5PuXbci"},{"type":"paragraph","position":{"start":{"line":597,"column":1},"end":{"line":597,"column":1}},"children":[{"type":"text","value":"This algorithm was brought to fame by AlphaGo Zero ","position":{"start":{"line":597,"column":1},"end":{"line":597,"column":1}},"key":"r3UjRvhAlr"},{"type":"cite","kind":"narrative","label":"silver_mastering_2017","identifier":"silver_mastering_2017","children":[{"type":"text","value":"Silver ","key":"dinZ3Jz54Y"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"Hs4VW1b5it"}],"key":"VNzBd5mrMC"},{"type":"text","value":" (2017)","key":"z5qjmSupQ0"}],"enumerator":"2","key":"VqxrAMZh2E"},{"type":"text","value":".","position":{"start":{"line":597,"column":1},"end":{"line":597,"column":1}},"key":"mARbqUxSJU"}],"key":"bpu3kFRiyQ"},{"type":"heading","depth":2,"position":{"start":{"line":599,"column":1},"end":{"line":599,"column":1}},"children":[{"type":"text","value":"Summary","position":{"start":{"line":599,"column":1},"end":{"line":599,"column":1}},"key":"iIfg2wtdna"}],"identifier":"summary","label":"Summary","html_id":"summary","implicit":true,"enumerator":"8.6","key":"NHXVlvL6JI"},{"type":"paragraph","position":{"start":{"line":601,"column":1},"end":{"line":608,"column":1}},"children":[{"type":"text","value":"In this chapter,\nwe explored tree search-based algorithms for deterministic, zero sum, fully observable two-player games.\nWe began with ","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"rN0DTTaenM"},{"type":"crossReference","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"children":[{"type":"text","value":"min-max search","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"qFja8y3zRj"}],"identifier":"min-max-search","label":"min-max-search","kind":"heading","template":"Section %s","enumerator":"8.3","resolved":true,"html_id":"min-max-search","key":"RohyMYdUXK"},{"type":"text","value":",\nan algorithm for exactly solving the game value of 
every possible state.\nHowever, this is impossible to execute in practice,\nand so we must resort to various ways to reduce the number of states and actions that we must explore.\n","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"g79rTel1Pt"},{"type":"crossReference","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"children":[{"type":"text","value":"Alpha-beta search","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"FQwPGoXESj"}],"identifier":"alpha-beta-search","label":"alpha-beta-search","kind":"heading","template":"Section %s","enumerator":"8.4","resolved":true,"html_id":"alpha-beta-search","key":"NWWqoF10ll"},{"type":"text","value":" does this by ","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"WpFHqMg1oF"},{"type":"emphasis","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"children":[{"type":"text","value":"pruning","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"f7X5Tno0Hf"}],"key":"e5qkyV7f2g"},{"type":"text","value":" away states that we already know to be suboptimal,\nand ","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"wXiKtK0h7Q"},{"type":"crossReference","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"children":[{"type":"text","value":"Monte Carlo Tree Search","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"ZacbLQxiFt"}],"identifier":"monte-carlo-tree-search","label":"monte-carlo-tree-search","kind":"heading","template":"Section %s","enumerator":"8.5","resolved":true,"html_id":"monte-carlo-tree-search","key":"Rxftq8RSaV"},{"type":"text","value":" ","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"CJBiK7HVoE"},{"type":"emphasis","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"children":[{"type":"text","value":"approximates","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"xQBmBCDxDR"}],"key":"LPdHA4fwTi"},{"type":"text","value":" the value of states instead of evaluating them exactly.","position":{"start":{"line":601,"column":1},"end":{"line":601,"column":1}},"key":"gVwqVMEbtY"}],"key":"rWpPpwbHkE"},{"type":"heading","depth":2,"position":{"start":{"line":611,"column":1},"end":{"line":611,"column":1}},"children":[{"type":"text","value":"References","position":{"start":{"line":611,"column":1},"end":{"line":611,"column":1}},"key":"hRBoBFgEw1"}],"identifier":"references","label":"References","html_id":"references","implicit":true,"enumerator":"8.7","key":"pvORoYUG3d"},{"type":"paragraph","position":{"start":{"line":613,"column":1},"end":{"line":621,"column":1}},"children":[{"type":"text","value":"Chapter 5 of ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"R4F5ALwGf8"},{"type":"cite","kind":"narrative","label":"russell_artificial_2021","identifier":"russell_artificial_2021","children":[{"type":"text","value":"Russell & Norvig (2021)","key":"dyP8c6FEIY"}],"enumerator":"3","key":"XemVYuOdma"},{"type":"text","value":" provides an excellent overview of search methods in games.\nThe original AlphaGo paper ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"x27wvsyrPC"},{"type":"cite","kind":"narrative","label":"silver_mastering_2016","identifier":"silver_mastering_2016","children":[{"type":"text","value":"Silver 
","key":"u0WGOVbqKy"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"BMYIsWzhRg"}],"key":"VaXlaOb3bF"},{"type":"text","value":" (2016)","key":"HyJcHhkbOy"}],"enumerator":"1","key":"JyMdIb8xrJ"},{"type":"text","value":" was a groundbreaking application of these technologies.\n","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"BlOjC33gPX"},{"type":"cite","kind":"narrative","label":"silver_mastering_2017","identifier":"silver_mastering_2017","children":[{"type":"text","value":"Silver ","key":"QmHKLx1wRb"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"Y7hanWy4jM"}],"key":"saRVaDM4Ye"},{"type":"text","value":" (2017)","key":"BnTs83Xsjk"}],"enumerator":"2","key":"Y4TP771jNC"},{"type":"text","value":" removed the imitation learning phase,\nlearning from scratch.\nAlphaZero ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"UOrrpqkgoi"},{"type":"cite","kind":"narrative","label":"silver_general_2018","identifier":"silver_general_2018","children":[{"type":"text","value":"Silver ","key":"sD1jB3VLX9"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"DIVZp0mN9W"}],"key":"LrBvJ8JlqD"},{"type":"text","value":" (2018)","key":"Ycd0dJUGo4"}],"enumerator":"4","key":"OjvPMINj2V"},{"type":"text","value":" then extended to other games beyond Go,\nnamely shogi and chess,\nalso learning from scratch.\nIn MuZero ","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"BRkb7BCxcm"},{"type":"cite","kind":"narrative","label":"schrittwieser_mastering_2020","identifier":"schrittwieser_mastering_2020","children":[{"type":"text","value":"Schrittwieser ","key":"nnabeeLbli"},{"type":"emphasis","children":[{"type":"text","value":"et al.","key":"Be9Q1dgbKW"}],"key":"NsH13MewY3"},{"type":"text","value":" (2020)","key":"sxXi8zYecX"}],"enumerator":"5","key":"KchTKTqZG8"},{"type":"text","value":",\nthis was further extended by learning a model of the game dynamics.","position":{"start":{"line":613,"column":1},"end":{"line":613,"column":1}},"key":"s4dSgcyC5k"}],"key":"ttuNxkfhFb"}],"key":"alWkkhdEsS"}],"key":"wNnRa56F0O"},"references":{"cite":{"order":["silver_mastering_2016","silver_mastering_2017","russell_artificial_2021","silver_general_2018","schrittwieser_mastering_2020"],"data":{"silver_mastering_2016":{"label":"silver_mastering_2016","enumerator":"1","doi":"10.1038/nature16961","html":"Silver, D., Huang, A., Maddison, C. J., Guez, A., Sifre, L., van den Driessche, G., Schrittwieser, J., Antonoglou, I., Panneershelvam, V., Lanctot, M., Dieleman, S., Grewe, D., Nham, J., Kalchbrenner, N., Sutskever, I., Lillicrap, T., Leach, M., Kavukcuoglu, K., Graepel, T., & Hassabis, D. (2016). Mastering the Game of Go with Deep Neural Networks and Tree Search. Nature, 529(7587), 484–489. 10.1038/nature16961","url":"https://doi.org/10.1038/nature16961"},"silver_mastering_2017":{"label":"silver_mastering_2017","enumerator":"2","doi":"10.1038/nature24270","html":"Silver, D., Schrittwieser, J., Simonyan, K., Antonoglou, I., Huang, A., Guez, A., Hubert, T., Baker, L., Lai, M., Bolton, A., Chen, Y., Lillicrap, T., Hui, F., Sifre, L., van den Driessche, G., Graepel, T., & Hassabis, D. (2017). Mastering the Game of Go without Human Knowledge. Nature, 550(7676), 354–359. 10.1038/nature24270","url":"https://doi.org/10.1038/nature24270"},"russell_artificial_2021":{"label":"russell_artificial_2021","enumerator":"3","html":"Russell, S. J., & Norvig, P. (2021). 
Artificial Intelligence: A Modern Approach (Fourth edition). Pearson."},"silver_general_2018":{"label":"silver_general_2018","enumerator":"4","doi":"10.1126/science.aar6404","html":"Silver, D., Hubert, T., Schrittwieser, J., Antonoglou, I., Lai, M., Guez, A., Lanctot, M., Sifre, L., Kumaran, D., Graepel, T., Lillicrap, T., Simonyan, K., & Hassabis, D. (2018). A General Reinforcement Learning Algorithm That Masters Chess, Shogi, and Go through Self-Play. Science, 362(6419), 1140–1144. 10.1126/science.aar6404","url":"https://doi.org/10.1126/science.aar6404"},"schrittwieser_mastering_2020":{"label":"schrittwieser_mastering_2020","enumerator":"5","doi":"10.1038/s41586-020-03051-4","html":"Schrittwieser, J., Antonoglou, I., Hubert, T., Simonyan, K., Sifre, L., Schmitt, S., Guez, A., Lockhart, E., Hassabis, D., Graepel, T., Lillicrap, T., & Silver, D. (2020). Mastering Atari, Go, Chess and Shogi by Planning with a Learned Model. Nature, 588(7839), 604–609. 10.1038/s41586-020-03051-4","url":"https://doi.org/10.1038/s41586-020-03051-4"}}}},"footer":{"navigation":{"prev":{"title":"7 Imitation Learning","url":"/imitation-learning","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"9 Exploration in MDPs","url":"/exploration","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file diff --git a/supervised-learning.html b/supervised-learning.html index f71ebbd..d560df7 100644 --- a/supervised-learning.html +++ b/supervised-learning.html @@ -1,4 +1,4 @@ -4 Supervised learning - CS/STAT 184: Introduction to Reinforcement Learning


    4 Supervised learning

    4.1 Introduction

    This section will cover the details of implementing the fit function above: That is, how to use a dataset of labelled samples $(x_1, y_1), \dots, (x_N, y_N)$ to find a function $f$ that minimizes the empirical risk. This requires two ingredients:

    1. A function class $\mathcal{F}$ to search over
    2. A fitting method for minimizing the empirical risk over this class

    The two main function classes we will cover are linear models and neural networks. Both of these function classes are parameterized by some parameters θ, and the fitting method will search over these parameters to minimize the empirical risk:

    Definition 4.1 (Parameterized empirical risk minimization)

    Given a dataset of samples $(x_1, y_1), \dots, (x_N, y_N)$ and a class of functions $\mathcal{F}$ parameterized by $\theta$, we want to find a parameter (vector) $\hat \theta$ that minimizes the empirical risk:

    $$\hat \theta = \arg\min_{\theta} \frac{1}{N} \sum_{i=1}^N (y_i - f_\theta(x_i))^2$$
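    For concreteness, the empirical risk can be written directly as a function of $\theta$, which is the form the fitting code below expects. This is a small illustrative sketch, not from the original notes; the helper name make_empirical_risk and the model signature f(θ, x) are hypothetical.

    import jax.numpy as jnp
    from jax import vmap


    def make_empirical_risk(f, xs, ys):
        """Return L(θ) = (1/N) Σᵢ (yᵢ - f(θ, xᵢ))² for a parameterized model f(θ, x)."""
        def L(θ):
            preds = vmap(lambda x: f(θ, x))(xs)  # evaluate the model on every sample
            return jnp.mean((ys - preds) ** 2)
        return L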

    The most common fitting method for parameterized models is gradient descent.

    Definition 4.2 (Gradient descent)

    Letting $L(\theta) \in \mathbb{R}$ denote the empirical risk in terms of the parameters, the gradient descent algorithm updates the parameters according to the rule

    $$\theta^{t+1} = \theta^t - \eta \nabla_\theta L(\theta^t)$$

    where $\eta > 0$ is the learning rate.

    from collections.abc import Callable
    from jax import grad  # assumed import: grad is JAX's gradient transform, consistent with the jaxtyping usage
    from jaxtyping import Float, Array

    Params = Float[Array, " D"]


    def gradient_descent(
        loss: Callable[[Params], float],
        θ_init: Params,
        η: float,
        epochs: int,
    ):
        """Run gradient descent to minimize the given loss function
        (expressed in terms of the parameters)."""
        θ = θ_init
        for _ in range(epochs):
            θ = θ - η * grad(loss)(θ)
        return θ
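    As a usage sketch (not from the original notes), one might call gradient_descent on a toy quadratic loss; the data and the jax.numpy import here are purely illustrative.

    import jax.numpy as jnp

    # Hypothetical observations; fit their mean by minimizing L(θ) = mean((y_i - θ)²).
    y_obs = jnp.array([1.0, 2.0, 3.0, 4.0])

    def toy_loss(θ):
        return jnp.mean((y_obs - θ[0]) ** 2)

    θ_hat = gradient_descent(toy_loss, θ_init=jnp.zeros(1), η=0.1, epochs=100)
    # θ_hat approaches the sample mean, 2.5.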


    4.2 Linear regression

    In linear regression, we assume that the function $f$ is linear in the parameters:

    $$\mathcal{F} = \{ x \mapsto \theta^\top x \mid \theta \in \mathbb{R}^D \}$$

    This function class is extremely simple and only contains linear functions. To expand its expressivity, we can transform the input $x$ using some feature function $\phi$, i.e. $\widetilde x = \phi(x)$, and then fit a linear model in the transformed space instead.

    from jax import vmap       # assumed import: vmap vectorizes φ over the dataset
    import numpy as np         # assumed: np refers to NumPy here (the notes may use jax.numpy instead)
    # Float and Array come from the jaxtyping import in the snippet above.


    def fit_linear(X: Float[Array, "N D"], y: Float[Array, " N"], φ=lambda x: x):
        """Fit a linear model to the given dataset using ordinary least squares."""
        X = vmap(φ)(X)
        θ = np.linalg.lstsq(X, y, rcond=None)[0]
        return lambda x: np.dot(φ(x), θ)
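    To illustrate the role of the feature function $\phi$, here is a hypothetical usage sketch that fits a quadratic trend with a polynomial feature map. The data are made up for illustration, and φ is written with jax.numpy so that the vmap call inside fit_linear can trace it.

    import numpy as np
    import jax.numpy as jnp

    # Hypothetical data following y ≈ 2x² + 1; each input is a length-1 vector.
    X = np.linspace(-1, 1, 50).reshape(-1, 1)
    y = 2 * X[:, 0] ** 2 + 1

    φ_poly = lambda x: jnp.concatenate([jnp.ones(1), x, x ** 2])  # x ↦ (1, x, x²)

    predict = fit_linear(X, y, φ=φ_poly)
    print(predict(np.array([0.5])))  # roughly 2 · 0.25 + 1 = 1.5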


    4.3 Neural networks

    In neural networks, we assume that the function $f$ is a composition of linear functions (represented by matrices $W_i$) and non-linear activation functions (denoted by $\sigma$):

    $$\mathcal{F} = \{ x \mapsto \sigma(W_L \sigma(W_{L-1} \dots \sigma(W_1 x + b_1) \dots + b_{L-1}) + b_L) \}$$

    where $W_i \in \mathbb{R}^{D_{i+1} \times D_i}$ and $b_i \in \mathbb{R}^{D_{i+1}}$ are the parameters of the $i$-th layer, and $\sigma$ is the activation function.
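    To make the composition concrete, here is a minimal sketch (not from the original notes) of the forward pass for this function class with randomly initialized parameters; the layer sizes and the helper names init_mlp and mlp are hypothetical.

    import jax.numpy as jnp
    from jax import random


    def init_mlp(key, sizes):
        """Randomly initialize (W_i, b_i) for layer widths D_1, ..., D_{L+1}."""
        params = []
        for D_in, D_out in zip(sizes[:-1], sizes[1:]):
            key, subkey = random.split(key)
            params.append((random.normal(subkey, (D_out, D_in)) / jnp.sqrt(D_in), jnp.zeros(D_out)))
        return params


    def mlp(params, x, σ=jnp.tanh):
        """Compute σ(W_L σ(... σ(W_1 x + b_1) ...) + b_L)."""
        for W, b in params:
            x = σ(W @ x + b)
        return x


    params = init_mlp(random.PRNGKey(0), sizes=[3, 16, 16, 1])
    print(mlp(params, jnp.ones(3)))  # output of shape (1,)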

    This function class is much more expressive and contains many more parameters. This makes it more susceptible to overfitting on smaller datasets, but also allows it to represent more complex functions. In practice, however, neural networks exhibit interesting phenomena during training, and are often able to generalize well even with many parameters.

    Another reason for their popularity is the efficient backpropagation algorithm for computing the gradient of the empirical risk with respect to the parameters. Essentially, the hierarchical structure of the neural network, i.e. computing the output of the network as a composition of functions, allows us to use the chain rule to compute the gradient of the output with respect to the parameters of each layer.
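    In modern libraries this layer-by-layer chain rule is carried out by reverse-mode automatic differentiation. The sketch below is hypothetical and reuses the init_mlp and mlp helpers from the previous snippet, with jax.grad playing the role of backpropagation over the empirical risk.

    import jax.numpy as jnp
    from jax import grad, random, vmap

    # Hypothetical regression data.
    key = random.PRNGKey(1)
    X = random.normal(key, (32, 3))
    y = jnp.sin(X[:, 0])


    def empirical_risk(params):
        preds = vmap(lambda x: mlp(params, x)[0])(X)
        return jnp.mean((y - preds) ** 2)


    params = init_mlp(random.PRNGKey(0), sizes=[3, 16, 1])
    grads = grad(empirical_risk)(params)  # one (∇W_i, ∇b_i) pair per layer, computed via the chain rule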

    Nielsen (2015) provides a comprehensive introduction to neural networks and backpropagation.

    References
    1. Nielsen, M. A. (2015). Neural Networks and Deep Learning. Determination Press.
    \ No newline at end of file diff --git a/supervised-learning.json b/supervised-learning.json index 3d3bc92..3d2f5fc 100644 --- a/supervised-learning.json +++ b/supervised-learning.json @@ -1 +1 @@ -{"kind":"Notebook","sha256":"e56ff69c011ee78674304db47cc4e85c51d95181fd2f1ca46bac12965fb5e8ee","slug":"supervised-learning","location":"/supervised_learning.md","dependencies":[],"frontmatter":{"title":"4 Supervised learning","numbering":{"all":{"enabled":true},"enumerator":{"template":"4.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"supervised_learning.md","url":"/build/supervised_learning-350bcacee6e0c7c9985fcefbbc20f999.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"KD1iZgJ5vg"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"4.1","key":"pebQqo8HZD"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"This section will cover the details of implementing the ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"zA4wO8ch9l"},{"type":"inlineCode","value":"fit","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"ODNmkwQUtI"},{"type":"text","value":" function above:\nThat is, how to use a dataset of labelled samples ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"LylV3btd5G"},{"type":"inlineMath","value":"(x_1, y_1), \\dots, (x_N, y_N)","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"html":"(x1,y1),,(xN,yN)(x_1, y_1), \\dots, (x_N, y_N)(x1,y1),,(xN,yN)","key":"VBq3lYaj2L"},{"type":"text","value":" to find a function ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"Vh1mMtiSYm"},{"type":"inlineMath","value":"f","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"html":"fff","key":"MV1ZyJhB86"},{"type":"text","value":" that minimizes the empirical risk.\nThis requires two ingredients:","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"G6VQM7GD4o"}],"key":"O2EAmbuJhl"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":24,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"A 
","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"V2v5HGPsOE"},{"type":"strong","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"function class","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"Ei7YgCnN7Y"}],"key":"sL3HSZh7h6"},{"type":"text","value":" ","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"h5PO1UFqT4"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"html":"F\\mathcal{F}F","key":"LAOFkXgjni"},{"type":"text","value":" to search over","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"T0CGelH1wN"}],"key":"pEqRF7Xf5R"},{"type":"listItem","spread":true,"position":{"start":{"line":25,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"text","value":"A ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"KFRuvJI8vc"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"fitting method","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"yklpYpC13J"}],"key":"wwxufNKvnb"},{"type":"text","value":" for minimizing the empirical risk over this class","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"nL29nQWsBi"}],"key":"Ypnzx5ARiS"}],"key":"EkeckTRHez"},{"type":"paragraph","position":{"start":{"line":27,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"The two main function classes we will cover are ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"wEgrdHG0qs"},{"type":"strong","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"linear models","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"QE3fISrtjV"}],"key":"gn5wkDN4MA"},{"type":"text","value":" and ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"alTOrrC0Nl"},{"type":"strong","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"neural networks","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"A7bIpXFfZl"}],"key":"bClu4CDnZb"},{"type":"text","value":".\nBoth of these function classes are ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"Gk76XO3FBb"},{"type":"emphasis","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"parameterized","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"xTzWSGbbIn"}],"key":"YyrccUpKsH"},{"type":"text","value":" by some parameters ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"oVTnE3Ktdg"},{"type":"text","value":"θ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"wwUxz9J4ra"},{"type":"text","value":",\nand the fitting method will search over these parameters to minimize the empirical 
risk:","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"TS8Wijegzw"}],"key":"kxtmyCJXkb"},{"type":"proof","kind":"definition","label":"parameterized_empirical_risk_minimization","identifier":"parameterized_empirical_risk_minimization","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Parameterized empirical risk minimization","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"yu9nXesKL4"}],"key":"DxGd2PHTCY"},{"type":"paragraph","position":{"start":{"line":34,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"text","value":"Given a dataset of samples ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"Ye9L27kVId"},{"type":"inlineMath","value":"(x_1, y_1), \\dots, (x_N, y_N)","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"(x1,y1),,(xN,yN)(x_1, y_1), \\dots, (x_N, y_N)(x1,y1),,(xN,yN)","key":"SNlNsPWdRm"},{"type":"text","value":" and a class of functions ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"p6S1Ds5fN3"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"F\\mathcal{F}F","key":"nivVJ54weK"},{"type":"text","value":" parameterized by ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"X9roMHlmOu"},{"type":"text","value":"θ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"gjEnyikAD7"},{"type":"text","value":",\nwe to find a parameter (vector) ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"NK9yrFSw3Z"},{"type":"inlineMath","value":"\\hat \\theta","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"θ^\\hat \\thetaθ^","key":"UnhKKNWb2w"},{"type":"text","value":" that minimizes the empirical risk:","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"DoujyfzkE6"}],"key":"DMcZFdYkpM"},{"type":"math","value":"\\hat \\theta = \\arg\\min_{\\theta} \\frac{1}{N} \\sum_{i=1}^N (y_i - f_\\theta(x_i))^2","position":{"start":{"line":37,"column":1},"end":{"line":39,"column":1}},"html":"θ^=argminθ1Ni=1N(yifθ(xi))2\\hat \\theta = \\arg\\min_{\\theta} \\frac{1}{N} \\sum_{i=1}^N (y_i - f_\\theta(x_i))^2θ^=argθminN1i=1N(yifθ(xi))2","enumerator":"4.1","key":"BEFH03QLqJ"}],"enumerator":"4.1","html_id":"parameterized-empirical-risk-minimization","key":"nJ4vn8bQyQ"},{"type":"paragraph","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"The most common fitting method for parameterized models is ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"sxbkUynbFo"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"gradient descent","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"hr38c6c8Av"}],"key":"A7IIRqQC8D"},{"type":"text","value":".","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"l5yX9IaxUM"}],"key":"BRsa570YwW"},{"type":"proof","kind":"definition","label":"gd_def","identifier":"gd_def","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Gradient 
descent","position":{"start":{"line":44,"column":1},"end":{"line":44,"column":1}},"key":"fxfQC4hU2q"}],"key":"fzAkyZH0ut"},{"type":"paragraph","position":{"start":{"line":47,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"Letting ","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"d3BoljxG3I"},{"type":"inlineMath","value":"L(\\theta) \\in \\mathbb{R}","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"html":"L(θ)RL(\\theta) \\in \\mathbb{R}L(θ)R","key":"jnJVjgUPpd"},{"type":"text","value":" denote the empirical risk in terms of the parameters,\nthe gradient descent algorithm updates the parameters according to the rule","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"CHvniuFkjW"}],"key":"LR63ZkhYeK"},{"type":"math","value":"\\theta^{t+1} = \\theta^t - \\eta \\nabla_\\theta L(\\theta^t)","position":{"start":{"line":50,"column":1},"end":{"line":52,"column":1}},"html":"θt+1=θtηθL(θt)\\theta^{t+1} = \\theta^t - \\eta \\nabla_\\theta L(\\theta^t)θt+1=θtηθL(θt)","enumerator":"4.2","key":"lrYFXjQf8U"},{"type":"paragraph","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"EgPPRQxfPE"},{"type":"inlineMath","value":"\\eta > 0","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"html":"η>0\\eta > 0η>0","key":"Bk5f4DLgZ3"},{"type":"text","value":" is the ","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"p5BHjk3n4n"},{"type":"strong","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"children":[{"type":"text","value":"learning rate","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"StW4wRaClA"}],"key":"CZQwwvvzkc"},{"type":"text","value":".","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"ZotNUNgq9G"}],"key":"uo1Pj8o7wE"}],"enumerator":"4.2","html_id":"gd-def","key":"GblmsYnDxo"}],"key":"vPmMg3cnOH"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from jaxtyping import Float, Array\nfrom collections.abc import Callable","visibility":"hide","key":"FKNfaxNGVz"},{"type":"output","id":"BqAqQcLQ4CcDWuEd00PDF","data":[],"visibility":"show","key":"OBNLsfAxwv"}],"data":{"tags":[]},"visibility":"show","key":"NJ9oUs87lZ"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"Params = Float[Array, \" D\"]\n\n\ndef gradient_descent(\n loss: Callable[[Params], float],\n θ_init: Params,\n η: float,\n epochs: int,\n):\n \"\"\"\n Run gradient descent to minimize the given loss function\n (expressed in terms of the parameters).\n \"\"\"\n θ = θ_init\n for _ in range(epochs):\n θ = θ - η * grad(loss)(θ)\n return θ","key":"WsT0xrl4X7"},{"type":"output","id":"7Jstr4NGR0mKGJP88uOhw","data":[],"key":"LNTNtSGo4M"}],"data":{},"key":"vLcXA3GWU0"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"text","value":"Linear regression","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"DECjYrMxE3"}],"identifier":"linear-regression","label":"Linear 
regression","html_id":"linear-regression","implicit":true,"enumerator":"4.2","key":"Wia8dezJUQ"},{"type":"paragraph","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"children":[{"type":"text","value":"In linear regression, we assume that the function ","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"j7JirY01n1"},{"type":"inlineMath","value":"f","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"html":"fff","key":"GeFBC7t1FZ"},{"type":"text","value":" is linear in the parameters:","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"pbqR7gCOSr"}],"key":"v86Qadv1ia"},{"type":"math","value":"\\mathcal{F} = \\{ x \\mapsto \\theta^\\top x \\mid \\theta \\in \\mathbb{R}^D \\}","position":{"start":{"line":88,"column":1},"end":{"line":90,"column":1}},"html":"F={xθxθRD}\\mathcal{F} = \\{ x \\mapsto \\theta^\\top x \\mid \\theta \\in \\mathbb{R}^D \\}F={xθxθRD}","enumerator":"4.3","key":"r1nbGPNSDR"},{"type":"paragraph","position":{"start":{"line":92,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"This function class is extremely simple and only contains linear functions.\nTo expand its expressivity, we can ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"mKinbYPIti"},{"type":"emphasis","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"transform","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"uPOyVxk5oR"}],"key":"ho5mVUfhXV"},{"type":"text","value":" the input ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"DH8ch3IdEi"},{"type":"inlineMath","value":"x","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"xxx","key":"BNCAIciMht"},{"type":"text","value":" using some feature function ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"NyXYjAMutx"},{"type":"text","value":"ϕ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"DZxxxkMgL5"},{"type":"text","value":",\ni.e. 
","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"eg7BGVsW3z"},{"type":"inlineMath","value":"\\widetilde x = \\phi(x)","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"x~=ϕ(x)\\widetilde x = \\phi(x)x=ϕ(x)","key":"eiFOsFYUqf"},{"type":"text","value":", and then fit a linear model in the transformed space instead.","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"Sbs9TXmxm9"}],"key":"AJNcS5WLXB"}],"key":"a0tCqMAtet"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fit_linear(X: Float[Array, \"N D\"], y: Float[Array, \" N\"], φ=lambda x: x):\n \"\"\"Fit a linear model to the given dataset using ordinary least squares.\"\"\"\n X = vmap(φ)(X)\n θ = np.linalg.lstsq(X, y, rcond=None)[0]\n return lambda x: np.dot(φ(x), θ)","key":"iwDc5j1hCF"},{"type":"output","id":"dAgcGS3_T-pNb4KJwR9-N","data":[],"key":"knQduzvK9q"}],"data":{},"key":"hasPo7pBKp"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"children":[{"type":"text","value":"Neural networks","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"vJVUPSCGxd"}],"identifier":"neural-networks","label":"Neural networks","html_id":"neural-networks","implicit":true,"enumerator":"4.3","key":"muMSTjfhuI"},{"type":"paragraph","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"children":[{"type":"text","value":"In neural networks, we assume that the function ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"nXJpDDRUQq"},{"type":"inlineMath","value":"f","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"html":"fff","key":"NOMRIa5Trz"},{"type":"text","value":" is a composition of linear functions (represented by matrices ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"Y86v66uhOA"},{"type":"inlineMath","value":"W_i","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"html":"WiW_iWi","key":"b5jrt25QDf"},{"type":"text","value":") and non-linear activation functions (denoted by ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"RyaR8Gza3S"},{"type":"text","value":"σ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"njSjMZnVWP"},{"type":"text","value":"):","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"arVWrzgeqW"}],"key":"f1HVvfQgG3"},{"type":"math","value":"\\mathcal{F} = \\{ x \\mapsto \\sigma(W_L \\sigma(W_{L-1} \\dots \\sigma(W_1 x + b_1) \\dots + b_{L-1}) + b_L) \\}","position":{"start":{"line":108,"column":1},"end":{"line":110,"column":1}},"html":"F={xσ(WLσ(WL1σ(W1x+b1)+bL1)+bL)}\\mathcal{F} = \\{ x \\mapsto \\sigma(W_L \\sigma(W_{L-1} \\dots \\sigma(W_1 x + b_1) \\dots + b_{L-1}) + b_L) \\}F={xσ(WLσ(WL1σ(W1x+b1)+bL1)+bL)}","enumerator":"4.4","key":"kDgvlP8JFy"},{"type":"paragraph","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"EvGpnVVQN6"},{"type":"inlineMath","value":"W_i \\in \\mathbb{R}^{D_{i+1} \\times D_i}","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"WiRDi+1×DiW_i \\in \\mathbb{R}^{D_{i+1} \\times 
D_i}WiRDi+1×Di","key":"jb02pl9uiB"},{"type":"text","value":" and ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"WchxlwAKUu"},{"type":"inlineMath","value":"b_i \\in \\mathbb{R}^{D_{i+1}}","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"biRDi+1b_i \\in \\mathbb{R}^{D_{i+1}}biRDi+1","key":"VA6LoQ4ndl"},{"type":"text","value":" are the parameters of the ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"LPVmx7pjJR"},{"type":"inlineMath","value":"i","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"iii","key":"iEXE9yTOoG"},{"type":"text","value":"-th layer, and ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"ajs9ucUe6Z"},{"type":"text","value":"σ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"vyOl0SWDux"},{"type":"text","value":" is the activation function.","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"UVbVcryVrw"}],"key":"LQpaNw6FqF"},{"type":"paragraph","position":{"start":{"line":114,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"This function class is much more expressive and contains many more parameters.\nThis makes it more susceptible to overfitting on smaller datasets,\nbut also allows it to represent more complex functions.\nIn practice, however, neural networks exhibit interesting phenomena during training,\nand are often able to generalize well even with many parameters.","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"GxvFR2x0dT"}],"key":"igBd6gTFXE"},{"type":"paragraph","position":{"start":{"line":120,"column":1},"end":{"line":123,"column":1}},"children":[{"type":"text","value":"Another reason for their popularity is the efficient ","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"NRMG6cYDNn"},{"type":"strong","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"children":[{"type":"text","value":"backpropagation","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"q93KDRlXfB"}],"key":"aCVELtHlbA"},{"type":"text","value":" algorithm for computing the gradient of the empirical risk with respect to the parameters.\nEssentially, the hierarchical structure of the neural network,\ni.e. computing the output of the network as a composition of functions,\nallows us to use the chain rule to compute the gradient of the output with respect to the parameters of each layer.","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"XGVHP3v4cR"}],"key":"KbBiKo0Zt9"},{"type":"paragraph","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"cite","kind":"narrative","label":"nielsen_neural_2015","identifier":"nielsen_neural_2015","children":[{"type":"text","value":"Nielsen (2015)","key":"LT4pPadDPN"}],"enumerator":"1","key":"AqLgREm4LX"},{"type":"text","value":" provides a comprehensive introduction to neural networks and backpropagation.","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"key":"q7eHBesTv9"}],"key":"iPMKoNAMlQ"}],"key":"HtRhzYXIsI"}],"key":"a3iaY0WL4C"},"references":{"cite":{"order":["nielsen_neural_2015"],"data":{"nielsen_neural_2015":{"label":"nielsen_neural_2015","enumerator":"1","html":"Nielsen, M. A. (2015). Neural Networks and Deep Learning. 
Determination Press."}}}},"footer":{"navigation":{"prev":{"title":"3 Multi-Armed Bandits","url":"/bandits","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"5 Fitted Dynamic Programming Algorithms","url":"/fitted-dp","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file +{"kind":"Notebook","sha256":"e56ff69c011ee78674304db47cc4e85c51d95181fd2f1ca46bac12965fb5e8ee","slug":"supervised-learning","location":"/supervised_learning.md","dependencies":[],"frontmatter":{"title":"4 Supervised learning","numbering":{"all":{"enabled":true},"enumerator":{"template":"4.%s"}},"kernelspec":{"name":"python3","display_name":"Python 3 (ipykernel)","language":"python"},"jupytext":{"text_representation":{"extension":".md","format_name":"myst","format_version":"0.13","jupytext_version":"1.16.2"}},"content_includes_title":false,"authors":[{"nameParsed":{"literal":"Fall 2024","given":"Fall","family":"2024"},"name":"Fall 2024","id":"contributors-myst-generated-uid-0"}],"github":"https://github.com/adzcai/cs-stat-184-notes","math":{"\\E":{"macro":"\\mathop{\\mathbb{E}}"},"\\pr":{"macro":"\\mathop{\\mathbb{P}}"},"\\kl":{"macro":"\\mathrm{KL}\\left(#1\\parallel#2\\right)"},"\\ind":{"macro":"\\mathbf{1}\\left\\{#1\\right\\}"},"\\hi":{"macro":"h"},"\\hor":{"macro":"H"},"\\st":{"macro":"s"},"\\act":{"macro":"a"}},"exports":[{"format":"md","filename":"supervised_learning.md","url":"/build/supervised_learning-350bcacee6e0c7c9985fcefbbc20f999.md"}]},"mdast":{"type":"root","children":[{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"children":[{"type":"text","value":"Introduction","position":{"start":{"line":18,"column":1},"end":{"line":18,"column":1}},"key":"vX1zLLvEdP"}],"identifier":"introduction","label":"Introduction","html_id":"introduction","implicit":true,"enumerator":"4.1","key":"vXrexbxjd5"},{"type":"paragraph","position":{"start":{"line":20,"column":1},"end":{"line":22,"column":1}},"children":[{"type":"text","value":"This section will cover the details of implementing the ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"BIKJHqL3ye"},{"type":"inlineCode","value":"fit","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"qQQQGMKbbS"},{"type":"text","value":" function above:\nThat is, how to use a dataset of labelled samples ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"DoDp1njNLj"},{"type":"inlineMath","value":"(x_1, y_1), \\dots, (x_N, y_N)","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"html":"(x1,y1),,(xN,yN)(x_1, y_1), \\dots, (x_N, y_N)(x1,y1),,(xN,yN)","key":"fgPIwnxZKN"},{"type":"text","value":" to find a function ","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"U38FgQ1rGn"},{"type":"inlineMath","value":"f","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"html":"fff","key":"fN9LNqrQod"},{"type":"text","value":" that minimizes the empirical risk.\nThis requires two 
ingredients:","position":{"start":{"line":20,"column":1},"end":{"line":20,"column":1}},"key":"Cmj54vhMAZ"}],"key":"nLLUfebN9i"},{"type":"list","ordered":true,"start":1,"spread":false,"position":{"start":{"line":24,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"listItem","spread":true,"position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"A ","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"yLW1j0k76J"},{"type":"strong","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"children":[{"type":"text","value":"function class","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"ZOlyVynwqA"}],"key":"y8PfSeEncg"},{"type":"text","value":" ","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"IukB1XgxQ1"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"html":"F\\mathcal{F}F","key":"ltYmE31mak"},{"type":"text","value":" to search over","position":{"start":{"line":24,"column":1},"end":{"line":24,"column":1}},"key":"uofXWM6Xn4"}],"key":"AE6cvR4En9"},{"type":"listItem","spread":true,"position":{"start":{"line":25,"column":1},"end":{"line":26,"column":1}},"children":[{"type":"text","value":"A ","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"WatxgX6cWR"},{"type":"strong","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"children":[{"type":"text","value":"fitting method","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"wzV6MMu5zm"}],"key":"YimlY5oVac"},{"type":"text","value":" for minimizing the empirical risk over this class","position":{"start":{"line":25,"column":1},"end":{"line":25,"column":1}},"key":"zamCpTfmct"}],"key":"ujXWrnSWdX"}],"key":"PQLgUZ3ja4"},{"type":"paragraph","position":{"start":{"line":27,"column":1},"end":{"line":29,"column":1}},"children":[{"type":"text","value":"The two main function classes we will cover are ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"VsA98OYvjI"},{"type":"strong","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"linear models","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"uRR3zMGblL"}],"key":"WK2i8FHlpK"},{"type":"text","value":" and ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"aaTJNJDtVV"},{"type":"strong","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"neural networks","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"OZpt2xQDmJ"}],"key":"yJIhxqcgM8"},{"type":"text","value":".\nBoth of these function classes are ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"S1AIKl3Nxn"},{"type":"emphasis","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"children":[{"type":"text","value":"parameterized","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"vby3bnxcsX"}],"key":"EhVniAR9l3"},{"type":"text","value":" by some parameters ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"Trbkb3Fqem"},{"type":"text","value":"θ","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"hyMn2ebc1I"},{"type":"text","value":",\nand the fitting method will search over 
these parameters to minimize the empirical risk:","position":{"start":{"line":27,"column":1},"end":{"line":27,"column":1}},"key":"aSP6XyKYpK"}],"key":"bcbNfMWJ3F"},{"type":"proof","kind":"definition","label":"parameterized_empirical_risk_minimization","identifier":"parameterized_empirical_risk_minimization","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Parameterized empirical risk minimization","position":{"start":{"line":31,"column":1},"end":{"line":31,"column":1}},"key":"NrqwnDx9fM"}],"key":"USu63Mb3iZ"},{"type":"paragraph","position":{"start":{"line":34,"column":1},"end":{"line":35,"column":1}},"children":[{"type":"text","value":"Given a dataset of samples ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"dRfqZcutf3"},{"type":"inlineMath","value":"(x_1, y_1), \\dots, (x_N, y_N)","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"(x1,y1),,(xN,yN)(x_1, y_1), \\dots, (x_N, y_N)(x1,y1),,(xN,yN)","key":"eo5KL8ab4G"},{"type":"text","value":" and a class of functions ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"sqkxWCxmKp"},{"type":"inlineMath","value":"\\mathcal{F}","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"F\\mathcal{F}F","key":"Iahv70qr1L"},{"type":"text","value":" parameterized by ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"y9CetTeiqS"},{"type":"text","value":"θ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"B0yQjWMjX6"},{"type":"text","value":",\nwe to find a parameter (vector) ","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"f2qiYMW9ay"},{"type":"inlineMath","value":"\\hat \\theta","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"html":"θ^\\hat \\thetaθ^","key":"ELjoxnGgL9"},{"type":"text","value":" that minimizes the empirical risk:","position":{"start":{"line":34,"column":1},"end":{"line":34,"column":1}},"key":"Hh5IJc17rU"}],"key":"tIo97VgHRW"},{"type":"math","value":"\\hat \\theta = \\arg\\min_{\\theta} \\frac{1}{N} \\sum_{i=1}^N (y_i - f_\\theta(x_i))^2","position":{"start":{"line":37,"column":1},"end":{"line":39,"column":1}},"html":"θ^=argminθ1Ni=1N(yifθ(xi))2\\hat \\theta = \\arg\\min_{\\theta} \\frac{1}{N} \\sum_{i=1}^N (y_i - f_\\theta(x_i))^2θ^=argθminN1i=1N(yifθ(xi))2","enumerator":"4.1","key":"R77xFpwvWQ"}],"enumerator":"4.1","html_id":"parameterized-empirical-risk-minimization","key":"wZLjtQorsS"},{"type":"paragraph","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"The most common fitting method for parameterized models is ","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"AJzt7rDYGb"},{"type":"strong","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"children":[{"type":"text","value":"gradient descent","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"DpNeXfKLpp"}],"key":"VphAWlQ9o5"},{"type":"text","value":".","position":{"start":{"line":42,"column":1},"end":{"line":42,"column":1}},"key":"HeicwPDXhx"}],"key":"xwuPPQ2eiY"},{"type":"proof","kind":"definition","label":"gd_def","identifier":"gd_def","enumerated":true,"children":[{"type":"admonitionTitle","children":[{"type":"text","value":"Gradient 
descent","position":{"start":{"line":44,"column":1},"end":{"line":44,"column":1}},"key":"NVMmWVQnnv"}],"key":"Yb35zL98pm"},{"type":"paragraph","position":{"start":{"line":47,"column":1},"end":{"line":48,"column":1}},"children":[{"type":"text","value":"Letting ","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"PbxrCc8e6Y"},{"type":"inlineMath","value":"L(\\theta) \\in \\mathbb{R}","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"html":"L(θ)RL(\\theta) \\in \\mathbb{R}L(θ)R","key":"eZqNHGUlAz"},{"type":"text","value":" denote the empirical risk in terms of the parameters,\nthe gradient descent algorithm updates the parameters according to the rule","position":{"start":{"line":47,"column":1},"end":{"line":47,"column":1}},"key":"IG4iE5t1uC"}],"key":"HU1iWqi0kp"},{"type":"math","value":"\\theta^{t+1} = \\theta^t - \\eta \\nabla_\\theta L(\\theta^t)","position":{"start":{"line":50,"column":1},"end":{"line":52,"column":1}},"html":"θt+1=θtηθL(θt)\\theta^{t+1} = \\theta^t - \\eta \\nabla_\\theta L(\\theta^t)θt+1=θtηθL(θt)","enumerator":"4.2","key":"JNTnlvU7vO"},{"type":"paragraph","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"d9X6cEAf2K"},{"type":"inlineMath","value":"\\eta > 0","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"html":"η>0\\eta > 0η>0","key":"u3xxypamOr"},{"type":"text","value":" is the ","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"pREsxRAipG"},{"type":"strong","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"children":[{"type":"text","value":"learning rate","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"smfiKdIVr7"}],"key":"Dg4zYzMsEy"},{"type":"text","value":".","position":{"start":{"line":54,"column":1},"end":{"line":54,"column":1}},"key":"wgM03B5dpb"}],"key":"c2aKzf1jfS"}],"enumerator":"4.2","html_id":"gd-def","key":"Hcf96AwBU3"}],"key":"XJmnmuoR7u"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"from jaxtyping import Float, Array\nfrom collections.abc import Callable","visibility":"hide","key":"UlfBNseLmZ"},{"type":"output","id":"YxNbfahCk9TotaUod6aiy","data":[],"visibility":"show","key":"zRQ2h3RyZY"}],"data":{"tags":[]},"visibility":"show","key":"TDTxnVEXpI"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"Params = Float[Array, \" D\"]\n\n\ndef gradient_descent(\n loss: Callable[[Params], float],\n θ_init: Params,\n η: float,\n epochs: int,\n):\n \"\"\"\n Run gradient descent to minimize the given loss function\n (expressed in terms of the parameters).\n \"\"\"\n θ = θ_init\n for _ in range(epochs):\n θ = θ - η * grad(loss)(θ)\n return θ","key":"lwxIfdU2KH"},{"type":"output","id":"7wK-0pfcsM20bkl6N8oDt","data":[],"key":"TVkkjjz4xg"}],"data":{},"key":"XXXX0Vczgv"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"children":[{"type":"text","value":"Linear regression","position":{"start":{"line":84,"column":1},"end":{"line":84,"column":1}},"key":"Rc8DQydicb"}],"identifier":"linear-regression","label":"Linear 
regression","html_id":"linear-regression","implicit":true,"enumerator":"4.2","key":"WJAPd3b4Fm"},{"type":"paragraph","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"children":[{"type":"text","value":"In linear regression, we assume that the function ","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"fs1XewBj4S"},{"type":"inlineMath","value":"f","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"html":"fff","key":"kDFZ0YKdry"},{"type":"text","value":" is linear in the parameters:","position":{"start":{"line":86,"column":1},"end":{"line":86,"column":1}},"key":"jkrQ6SCa8t"}],"key":"arvYO5Ylyr"},{"type":"math","value":"\\mathcal{F} = \\{ x \\mapsto \\theta^\\top x \\mid \\theta \\in \\mathbb{R}^D \\}","position":{"start":{"line":88,"column":1},"end":{"line":90,"column":1}},"html":"F={xθxθRD}\\mathcal{F} = \\{ x \\mapsto \\theta^\\top x \\mid \\theta \\in \\mathbb{R}^D \\}F={xθxθRD}","enumerator":"4.3","key":"r2oI7i0c6p"},{"type":"paragraph","position":{"start":{"line":92,"column":1},"end":{"line":94,"column":1}},"children":[{"type":"text","value":"This function class is extremely simple and only contains linear functions.\nTo expand its expressivity, we can ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"X2LnSjO2TK"},{"type":"emphasis","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"children":[{"type":"text","value":"transform","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"D3cAG2ZrA6"}],"key":"cGtPQvGutM"},{"type":"text","value":" the input ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"TgvrUn7Ll3"},{"type":"inlineMath","value":"x","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"xxx","key":"m5shBuZ5WP"},{"type":"text","value":" using some feature function ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"Z0WI2UJvKx"},{"type":"text","value":"ϕ","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"gxXtkJDLKe"},{"type":"text","value":",\ni.e. 
","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"Tc1GaDJ8yS"},{"type":"inlineMath","value":"\\widetilde x = \\phi(x)","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"html":"x~=ϕ(x)\\widetilde x = \\phi(x)x=ϕ(x)","key":"ktz62o88w0"},{"type":"text","value":", and then fit a linear model in the transformed space instead.","position":{"start":{"line":92,"column":1},"end":{"line":92,"column":1}},"key":"PUybsaLCpD"}],"key":"cv7hiN9pW7"}],"key":"BYftHXaqRw"},{"type":"block","kind":"notebook-code","children":[{"type":"code","lang":"python","executable":true,"value":"def fit_linear(X: Float[Array, \"N D\"], y: Float[Array, \" N\"], φ=lambda x: x):\n \"\"\"Fit a linear model to the given dataset using ordinary least squares.\"\"\"\n X = vmap(φ)(X)\n θ = np.linalg.lstsq(X, y, rcond=None)[0]\n return lambda x: np.dot(φ(x), θ)","key":"YudE6b4brM"},{"type":"output","id":"tGUvIRiW03SO8LRHCc80t","data":[],"key":"o8UscifzIN"}],"data":{},"key":"CfjapOyPLJ"},{"type":"block","children":[{"type":"heading","depth":2,"position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"children":[{"type":"text","value":"Neural networks","position":{"start":{"line":104,"column":1},"end":{"line":104,"column":1}},"key":"w3DAamXikV"}],"identifier":"neural-networks","label":"Neural networks","html_id":"neural-networks","implicit":true,"enumerator":"4.3","key":"PiQwD8CI2A"},{"type":"paragraph","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"children":[{"type":"text","value":"In neural networks, we assume that the function ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"MhYp3DYnNa"},{"type":"inlineMath","value":"f","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"html":"fff","key":"EbREQugrys"},{"type":"text","value":" is a composition of linear functions (represented by matrices ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"QEG7W6vJei"},{"type":"inlineMath","value":"W_i","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"html":"WiW_iWi","key":"QWRyGTmBY6"},{"type":"text","value":") and non-linear activation functions (denoted by ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"uqab3gwwp2"},{"type":"text","value":"σ","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"TExlnKIpJ7"},{"type":"text","value":"):","position":{"start":{"line":106,"column":1},"end":{"line":106,"column":1}},"key":"YFTOMq7PNP"}],"key":"JeNaCqoDad"},{"type":"math","value":"\\mathcal{F} = \\{ x \\mapsto \\sigma(W_L \\sigma(W_{L-1} \\dots \\sigma(W_1 x + b_1) \\dots + b_{L-1}) + b_L) \\}","position":{"start":{"line":108,"column":1},"end":{"line":110,"column":1}},"html":"F={xσ(WLσ(WL1σ(W1x+b1)+bL1)+bL)}\\mathcal{F} = \\{ x \\mapsto \\sigma(W_L \\sigma(W_{L-1} \\dots \\sigma(W_1 x + b_1) \\dots + b_{L-1}) + b_L) \\}F={xσ(WLσ(WL1σ(W1x+b1)+bL1)+bL)}","enumerator":"4.4","key":"DTpkQKwpYC"},{"type":"paragraph","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"children":[{"type":"text","value":"where ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"KkBg2Shyqz"},{"type":"inlineMath","value":"W_i \\in \\mathbb{R}^{D_{i+1} \\times D_i}","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"WiRDi+1×DiW_i \\in \\mathbb{R}^{D_{i+1} \\times 
D_i}WiRDi+1×Di","key":"WZJD38Ypy7"},{"type":"text","value":" and ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"oLGK65RjQJ"},{"type":"inlineMath","value":"b_i \\in \\mathbb{R}^{D_{i+1}}","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"biRDi+1b_i \\in \\mathbb{R}^{D_{i+1}}biRDi+1","key":"u6MwHH8Tfr"},{"type":"text","value":" are the parameters of the ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"KremLvhfe5"},{"type":"inlineMath","value":"i","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"html":"iii","key":"vBcy8yePAJ"},{"type":"text","value":"-th layer, and ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"c5O6yp4QJ3"},{"type":"text","value":"σ","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"piVcIC4AM4"},{"type":"text","value":" is the activation function.","position":{"start":{"line":112,"column":1},"end":{"line":112,"column":1}},"key":"itH64dSZBB"}],"key":"WsTNnFjF43"},{"type":"paragraph","position":{"start":{"line":114,"column":1},"end":{"line":118,"column":1}},"children":[{"type":"text","value":"This function class is much more expressive and contains many more parameters.\nThis makes it more susceptible to overfitting on smaller datasets,\nbut also allows it to represent more complex functions.\nIn practice, however, neural networks exhibit interesting phenomena during training,\nand are often able to generalize well even with many parameters.","position":{"start":{"line":114,"column":1},"end":{"line":114,"column":1}},"key":"krjX2E5ZGE"}],"key":"OFZOZOJJGk"},{"type":"paragraph","position":{"start":{"line":120,"column":1},"end":{"line":123,"column":1}},"children":[{"type":"text","value":"Another reason for their popularity is the efficient ","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"DdGgQ8lZuu"},{"type":"strong","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"children":[{"type":"text","value":"backpropagation","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"HnM9ZwOjIu"}],"key":"HimAJZs0bm"},{"type":"text","value":" algorithm for computing the gradient of the empirical risk with respect to the parameters.\nEssentially, the hierarchical structure of the neural network,\ni.e. computing the output of the network as a composition of functions,\nallows us to use the chain rule to compute the gradient of the output with respect to the parameters of each layer.","position":{"start":{"line":120,"column":1},"end":{"line":120,"column":1}},"key":"CoWLQjwFRu"}],"key":"hSigOx80yH"},{"type":"paragraph","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"children":[{"type":"cite","kind":"narrative","label":"nielsen_neural_2015","identifier":"nielsen_neural_2015","children":[{"type":"text","value":"Nielsen (2015)","key":"GGqYu5LWu2"}],"enumerator":"1","key":"IqCWY1y9pS"},{"type":"text","value":" provides a comprehensive introduction to neural networks and backpropagation.","position":{"start":{"line":125,"column":1},"end":{"line":125,"column":1}},"key":"z0YAZzcOhC"}],"key":"QEW6mgHLRG"}],"key":"mTYrykQ6rK"}],"key":"S59D9UTNRF"},"references":{"cite":{"order":["nielsen_neural_2015"],"data":{"nielsen_neural_2015":{"label":"nielsen_neural_2015","enumerator":"1","html":"Nielsen, M. A. (2015). Neural Networks and Deep Learning. 
Determination Press."}}}},"footer":{"navigation":{"prev":{"title":"3 Multi-Armed Bandits","url":"/bandits","group":"CS/STAT 184: Introduction to Reinforcement Learning"},"next":{"title":"5 Fitted Dynamic Programming Algorithms","url":"/fitted-dp","group":"CS/STAT 184: Introduction to Reinforcement Learning"}}},"domain":"http://localhost:3000"} \ No newline at end of file