diff --git a/hail/python/hail/docs/tutorials/10-ggplot2.ipynb b/hail/python/hail/docs/tutorials/10-ggplot2.ipynb new file mode 100644 index 00000000000..9bebabe170d --- /dev/null +++ b/hail/python/hail/docs/tutorials/10-ggplot2.ipynb @@ -0,0 +1,2188 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "247fd83d-9599-4379-9549-e7863594dc7a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + " \n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function now() {\n", + " return new Date();\n", + " }\n", + "\n", + " const force = true;\n", + "\n", + " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", + " root._bokeh_onload_callbacks = [];\n", + " root._bokeh_is_loading = undefined;\n", + " }\n", + "\n", + "const JS_MIME_TYPE = 'application/javascript';\n", + " const HTML_MIME_TYPE = 'text/html';\n", + " const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", + " const CLASS_NAME = 'output_bokeh rendered_html';\n", + "\n", + " /**\n", + " * Render data to the DOM node\n", + " */\n", + " function render(props, node) {\n", + " const script = document.createElement(\"script\");\n", + " node.appendChild(script);\n", + " }\n", + "\n", + " /**\n", + " * Handle when an output is cleared or removed\n", + " */\n", + " function handleClearOutput(event, handle) {\n", + " function drop(id) {\n", + " const view = Bokeh.index.get_by_id(id)\n", + " if (view != null) {\n", + " view.model.document.clear()\n", + " Bokeh.index.delete(view)\n", + " }\n", + " }\n", + "\n", + " const cell = handle.cell;\n", + "\n", + " const id = cell.output_area._bokeh_element_id;\n", + " const server_id = cell.output_area._bokeh_server_id;\n", + "\n", + " // Clean up Bokeh references\n", + " if (id != null) {\n", + " drop(id)\n", + " }\n", + "\n", + " if (server_id !== undefined) {\n", + " // Clean up Bokeh references\n", + " const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", + " cell.notebook.kernel.execute(cmd_clean, {\n", + " iopub: {\n", + " output: function(msg) {\n", + " const id = msg.content.text.trim()\n", + " drop(id)\n", + " }\n", + " }\n", + " });\n", + " // Destroy server and session\n", + " const cmd_destroy = \"import bokeh.io.notebook as 
ion; ion.destroy_server('\" + server_id + \"')\";\n", + " cell.notebook.kernel.execute(cmd_destroy);\n", + " }\n", + " }\n", + "\n", + " /**\n", + " * Handle when a new output is added\n", + " */\n", + " function handleAddOutput(event, handle) {\n", + " const output_area = handle.output_area;\n", + " const output = handle.output;\n", + "\n", + " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", + " if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n", + " return\n", + " }\n", + "\n", + " const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", + "\n", + " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", + " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", + " // store reference to embed id on output_area\n", + " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", + " }\n", + " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", + " const bk_div = document.createElement(\"div\");\n", + " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", + " const script_attrs = bk_div.children[0].attributes;\n", + " for (let i = 0; i < script_attrs.length; i++) {\n", + " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", + " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", + " }\n", + " // store reference to server id on output_area\n", + " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", + " }\n", + " }\n", + "\n", + " function register_renderer(events, OutputArea) {\n", + "\n", + " function append_mime(data, metadata, element) {\n", + " // create a DOM node to render to\n", + " const toinsert = this.create_output_subarea(\n", + " metadata,\n", + " CLASS_NAME,\n", + " EXEC_MIME_TYPE\n", + " );\n", + " 
this.keyboard_manager.register_events(toinsert);\n", + " // Render to node\n", + " const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", + " render(props, toinsert[toinsert.length - 1]);\n", + " element.append(toinsert);\n", + " return toinsert\n", + " }\n", + "\n", + " /* Handle when an output is cleared or removed */\n", + " events.on('clear_output.CodeCell', handleClearOutput);\n", + " events.on('delete.Cell', handleClearOutput);\n", + "\n", + " /* Handle when a new output is added */\n", + " events.on('output_added.OutputArea', handleAddOutput);\n", + "\n", + " /**\n", + " * Register the mime type and append_mime function with output_area\n", + " */\n", + " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", + " /* Is output safe? */\n", + " safe: true,\n", + " /* Index of renderer in `output_area.display_order` */\n", + " index: 0\n", + " });\n", + " }\n", + "\n", + " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", + " if (root.Jupyter !== undefined) {\n", + " const events = require('base/js/events');\n", + " const OutputArea = require('notebook/js/outputarea').OutputArea;\n", + "\n", + " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", + " register_renderer(events, OutputArea);\n", + " }\n", + " }\n", + " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", + " root._bokeh_timeout = Date.now() + 5000;\n", + " root._bokeh_failed_load = false;\n", + " }\n", + "\n", + " const NB_LOAD_WARNING = {'data': {'text/html':\n", + " \"
\\n\"+\n", + " \"

\\n\"+\n", + " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", + " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", + " \"

\\n\"+\n", + " \"\\n\"+\n", + " \"\\n\"+\n", + " \"from bokeh.resources import INLINE\\n\"+\n", + " \"output_notebook(resources=INLINE)\\n\"+\n", + " \"\\n\"+\n", + " \"
\"}};\n", + "\n", + " function display_loaded() {\n", + " const el = document.getElementById(\"bc0a10cc-873c-4048-948c-987933ab78ac\");\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS is loading...\";\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", + " }\n", + " } else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(display_loaded, 100)\n", + " }\n", + " }\n", + "\n", + " function run_callbacks() {\n", + " try {\n", + " root._bokeh_onload_callbacks.forEach(function(callback) {\n", + " if (callback != null)\n", + " callback();\n", + " });\n", + " } finally {\n", + " delete root._bokeh_onload_callbacks\n", + " }\n", + " console.debug(\"Bokeh: all callbacks have finished\");\n", + " }\n", + "\n", + " function load_libs(css_urls, js_urls, callback) {\n", + " if (css_urls == null) css_urls = [];\n", + " if (js_urls == null) js_urls = [];\n", + "\n", + " root._bokeh_onload_callbacks.push(callback);\n", + " if (root._bokeh_is_loading > 0) {\n", + " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", + " return null;\n", + " }\n", + " if (js_urls == null || js_urls.length === 0) {\n", + " run_callbacks();\n", + " return null;\n", + " }\n", + " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", + " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", + "\n", + " function on_load() {\n", + " root._bokeh_is_loading--;\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", + " run_callbacks()\n", + " }\n", + " }\n", + "\n", + " function on_error(url) {\n", + " console.error(\"failed to load \" + url);\n", + " }\n", + "\n", + " for (let i = 0; i < css_urls.length; i++) {\n", + " const url = css_urls[i];\n", + " const element = document.createElement(\"link\");\n", + " element.onload = 
on_load;\n", + " element.onerror = on_error.bind(null, url);\n", + " element.rel = \"stylesheet\";\n", + " element.type = \"text/css\";\n", + " element.href = url;\n", + " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " for (let i = 0; i < js_urls.length; i++) {\n", + " const url = js_urls[i];\n", + " const element = document.createElement('script');\n", + " element.onload = on_load;\n", + " element.onerror = on_error.bind(null, url);\n", + " element.async = false;\n", + " element.src = url;\n", + " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", + " document.head.appendChild(element);\n", + " }\n", + " };\n", + "\n", + " function inject_raw_css(css) {\n", + " const element = document.createElement(\"style\");\n", + " element.appendChild(document.createTextNode(css));\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.4.min.js\"];\n", + " const css_urls = [];\n", + "\n", + " const inline_js = [ function(Bokeh) {\n", + " Bokeh.set_log_level(\"info\");\n", + " },\n", + "function(Bokeh) {\n", + " }\n", + " ];\n", + "\n", + " function run_inline_js() {\n", + " if (root.Bokeh !== undefined || force === true) {\n", + " for (let i = 0; i < inline_js.length; i++) {\n", + " inline_js[i].call(root, root.Bokeh);\n", + " }\n", + "if (force === true) {\n", + " display_loaded();\n", + " }} else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(run_inline_js, 100);\n", + " } else if (!root._bokeh_failed_load) {\n", + " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", + " 
root._bokeh_failed_load = true;\n", + " } else if (force !== true) {\n", + " const cell = $(document.getElementById(\"bc0a10cc-873c-4048-948c-987933ab78ac\")).parents('.cell').data().cell;\n", + " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", + " }\n", + " }\n", + "\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", + " run_inline_js();\n", + " } else {\n", + " load_libs(css_urls, js_urls, function() {\n", + " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", + " run_inline_js();\n", + " });\n", + " }\n", + "}(window));" + ], + "application/vnd.bokehjs_load.v0+json": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(\"bc0a10cc-873c-4048-948c-987933ab78ac\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = 
document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.4.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\nif (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(\"bc0a10cc-873c-4048-948c-987933ab78ac\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "Initializing Hail with default parameters...\n", + "/Users/irademac/src/hail/hail/python/hail/backend/backend.py:55: UserWarning:\n", + "\n", + "!!! THIS IS A DEVELOPMENT VERSION OF HAIL !!!\n", + "\n", + "SLF4J: Failed to load class \"org.slf4j.impl.StaticLoggerBinder\".\n", + "SLF4J: Defaulting to no-operation (NOP) logger implementation\n", + "SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.\n", + "Running on Apache Spark version 3.5.0\n", + "SparkUI available at http://localhost:4040\n", + "Welcome to\n", + " __ __ <>__\n", + " / /_/ /__ __/ /\n", + " / __ / _ `/ / /\n", + " /_/ /_/\\_,_/_/_/ version 0.2.132-ee6d17ebbcb6\n", + "LOGGING: writing to /Users/irademac/src/hail/hail/python/hail/docs/tutorials/hail-20240801-1723-0.2.132-ee6d17ebbcb6.log\n", + "SLF4J: Failed to load class \"org.slf4j.impl.StaticMDCBinder\".\n", + "SLF4J: Defaulting to no-operation MDCAdapter implementation.\n", + "SLF4J: See http://www.slf4j.org/codes.html#no_static_mdc_binder for further details.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
idx
idx_2
idx_3
int32float64float64
04.68e+013.27e+01
18.24e+013.61e+01
27.33e+017.62e+01
38.99e+011.82e+01
44.03e+012.52e+01
55.45e+011.44e+01
66.47e+019.74e+01
76.72e+015.70e+01
85.44e+013.35e+00
94.80e+012.50e+01

showing top 10 rows

\n" + ], + "text/plain": [ + "+-------+----------+----------+\n", + "| idx | idx_2 | idx_3 |\n", + "+-------+----------+----------+\n", + "| int32 | float64 | float64 |\n", + "+-------+----------+----------+\n", + "| 0 | 4.68e+01 | 3.27e+01 |\n", + "| 1 | 8.24e+01 | 3.61e+01 |\n", + "| 2 | 7.33e+01 | 7.62e+01 |\n", + "| 3 | 8.99e+01 | 1.82e+01 |\n", + "| 4 | 4.03e+01 | 2.52e+01 |\n", + "| 5 | 5.45e+01 | 1.44e+01 |\n", + "| 6 | 6.47e+01 | 9.74e+01 |\n", + "| 7 | 6.72e+01 | 5.70e+01 |\n", + "| 8 | 5.44e+01 | 3.35e+00 |\n", + "| 9 | 4.80e+01 | 2.50e+01 |\n", + "+-------+----------+----------+\n", + "showing top 10 rows" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from hail import rand_unif\n", + "from hail.utils import range_table\n", + "\n", + "data = range_table(100)\n", + "data = data.annotate(idx_2=rand_unif(0, 100), idx_3=rand_unif(0, 100))\n", + "data.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "98c77839-1706-4f9c-abb2-daccba645304", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from hail.ggplot2 import ChartWrapper\n", + "\n", + "wrapper = ChartWrapper(data)\n", + "wrapper.chart.mark_point().encode(\n", + " x=\"idx\",\n", + " y=\"idx_2\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "614c6573-effd-4bb4-a1ad-11844ffde8d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xx2y
01.0000054.2633163
14.2633167.5266280
27.52662810.7899392
310.78993914.0532502
414.05325017.3165613
517.31656120.5798723
620.57987223.8431832
723.84318327.1064943
827.10649430.3698052
930.36980533.6331163
1033.63311636.8964272
1136.89642740.1597384
1240.15973843.4230495
1343.42304946.6863604
1446.68636049.9496716
1549.94967153.2129821
1653.21298256.4762935
1756.47629359.7396052
1859.73960563.0029164
1963.00291666.2662274
2066.26622769.5295383
2169.52953872.7928493
2272.79284976.0561604
2376.05616079.3194715
2479.31947182.5827826
2582.58278285.8460933
2685.84609389.1094044
2789.10940492.3727152
2892.37271595.6360267
2995.63602698.8993373
\n", + "
" + ], + "text/plain": [ + " x x2 y\n", + "0 1.000005 4.263316 3\n", + "1 4.263316 7.526628 0\n", + "2 7.526628 10.789939 2\n", + "3 10.789939 14.053250 2\n", + "4 14.053250 17.316561 3\n", + "5 17.316561 20.579872 3\n", + "6 20.579872 23.843183 2\n", + "7 23.843183 27.106494 3\n", + "8 27.106494 30.369805 2\n", + "9 30.369805 33.633116 3\n", + "10 33.633116 36.896427 2\n", + "11 36.896427 40.159738 4\n", + "12 40.159738 43.423049 5\n", + "13 43.423049 46.686360 4\n", + "14 46.686360 49.949671 6\n", + "15 49.949671 53.212982 1\n", + "16 53.212982 56.476293 5\n", + "17 56.476293 59.739605 2\n", + "18 59.739605 63.002916 4\n", + "19 63.002916 66.266227 4\n", + "20 66.266227 69.529538 3\n", + "21 69.529538 72.792849 3\n", + "22 72.792849 76.056160 4\n", + "23 76.056160 79.319471 5\n", + "24 79.319471 82.582782 6\n", + "25 82.582782 85.846093 3\n", + "26 85.846093 89.109404 4\n", + "27 89.109404 92.372715 2\n", + "28 92.372715 95.636026 7\n", + "29 95.636026 98.899337 3" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from altair import X\n", + "\n", + "wrapper.histogram(\"idx_2\")\n", + "wrapper.chart.data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7783876e-12f5-41ee-b3ba-f125e57f0b94", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wrapper.chart.mark_bar().encode(\n", + " x=X(\"x\", bin=\"binned\"),\n", + " x2=\"x2\",\n", + " y=\"y\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2bf2bf75", + "metadata": {}, + "source": [ + "# Plotting Data with Hail\n", + "\n", + "The `ggplot2` module provides a set of functions for aggregating and plotting data that is stored in [`Table`](https://hail.is/docs/0.2/hail.Table.html)s and [`MatrixTable`](https://hail.is/docs/0.2/hail.MatrixTable.html)s. It attempts to mimic the interface of [R's `ggplot2` library](https://ggplot2.tidyverse.org) as closely as possible. Plots are displayed using the [Vega-Altair](https://altair-viz.github.io/) library.\n", + "\n", + "On this page, you'll find an explanation of the basics of how this library works, as well as examples of how to use it to create some common types of plot.\n", + "\n", + "## Example Data\n", + "\n", + "In order to provide example plots, we'll need example data. The following code uses [`range_table`](https://hail.is/docs/0.2/utils/index.html#hail.utils.range_table) to generate a table with a column containing the index of each row, and then uses [`annotate`](https://hail.is/docs/0.2/hail.Table.html#hail.Table.annotate) and [`rand_unif`](https://hail.is/docs/0.2/functions/random.html#hail.expr.functions.rand_unif) to add a few other columns that each contain randomized values:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ca1ba550", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + " \n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "(function(root) {\n", + " function now() {\n", + " return new Date();\n", + " }\n", + "\n", + " const force = true;\n", + "\n", + " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", + " root._bokeh_onload_callbacks = [];\n", + " root._bokeh_is_loading = undefined;\n", + " }\n", + "\n", + "const JS_MIME_TYPE = 'application/javascript';\n", + " const HTML_MIME_TYPE = 'text/html';\n", + " const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", + " const CLASS_NAME = 'output_bokeh rendered_html';\n", + "\n", + " /**\n", + " * Render data to the DOM node\n", + " */\n", + " function render(props, node) {\n", + " const script = document.createElement(\"script\");\n", + " node.appendChild(script);\n", + " }\n", + "\n", + " /**\n", + " * Handle when an output is cleared or removed\n", + " */\n", + " function handleClearOutput(event, handle) {\n", + " function drop(id) {\n", + " const view = Bokeh.index.get_by_id(id)\n", + " if (view != null) {\n", + " view.model.document.clear()\n", + " Bokeh.index.delete(view)\n", + " }\n", + " }\n", + "\n", + " const cell = handle.cell;\n", + "\n", + " const id = cell.output_area._bokeh_element_id;\n", + " const server_id = cell.output_area._bokeh_server_id;\n", + "\n", + " // Clean up Bokeh references\n", + " if (id != null) {\n", + " drop(id)\n", + " }\n", + "\n", + " if (server_id !== undefined) {\n", + " // Clean up Bokeh references\n", + " const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", + " cell.notebook.kernel.execute(cmd_clean, {\n", + " iopub: {\n", + " output: function(msg) {\n", + " const id = msg.content.text.trim()\n", + " drop(id)\n", + " }\n", + " }\n", + " });\n", + " // Destroy server and session\n", + " const cmd_destroy = \"import bokeh.io.notebook as 
ion; ion.destroy_server('\" + server_id + \"')\";\n", + " cell.notebook.kernel.execute(cmd_destroy);\n", + " }\n", + " }\n", + "\n", + " /**\n", + " * Handle when a new output is added\n", + " */\n", + " function handleAddOutput(event, handle) {\n", + " const output_area = handle.output_area;\n", + " const output = handle.output;\n", + "\n", + " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", + " if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n", + " return\n", + " }\n", + "\n", + " const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", + "\n", + " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", + " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", + " // store reference to embed id on output_area\n", + " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", + " }\n", + " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", + " const bk_div = document.createElement(\"div\");\n", + " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", + " const script_attrs = bk_div.children[0].attributes;\n", + " for (let i = 0; i < script_attrs.length; i++) {\n", + " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", + " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", + " }\n", + " // store reference to server id on output_area\n", + " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", + " }\n", + " }\n", + "\n", + " function register_renderer(events, OutputArea) {\n", + "\n", + " function append_mime(data, metadata, element) {\n", + " // create a DOM node to render to\n", + " const toinsert = this.create_output_subarea(\n", + " metadata,\n", + " CLASS_NAME,\n", + " EXEC_MIME_TYPE\n", + " );\n", + " 
this.keyboard_manager.register_events(toinsert);\n", + " // Render to node\n", + " const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", + " render(props, toinsert[toinsert.length - 1]);\n", + " element.append(toinsert);\n", + " return toinsert\n", + " }\n", + "\n", + " /* Handle when an output is cleared or removed */\n", + " events.on('clear_output.CodeCell', handleClearOutput);\n", + " events.on('delete.Cell', handleClearOutput);\n", + "\n", + " /* Handle when a new output is added */\n", + " events.on('output_added.OutputArea', handleAddOutput);\n", + "\n", + " /**\n", + " * Register the mime type and append_mime function with output_area\n", + " */\n", + " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", + " /* Is output safe? */\n", + " safe: true,\n", + " /* Index of renderer in `output_area.display_order` */\n", + " index: 0\n", + " });\n", + " }\n", + "\n", + " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", + " if (root.Jupyter !== undefined) {\n", + " const events = require('base/js/events');\n", + " const OutputArea = require('notebook/js/outputarea').OutputArea;\n", + "\n", + " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", + " register_renderer(events, OutputArea);\n", + " }\n", + " }\n", + " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", + " root._bokeh_timeout = Date.now() + 5000;\n", + " root._bokeh_failed_load = false;\n", + " }\n", + "\n", + " const NB_LOAD_WARNING = {'data': {'text/html':\n", + " \"
\\n\"+\n", + " \"

\\n\"+\n", + " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", + " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", + " \"

\\n\"+\n", + " \"\\n\"+\n", + " \"\\n\"+\n", + " \"from bokeh.resources import INLINE\\n\"+\n", + " \"output_notebook(resources=INLINE)\\n\"+\n", + " \"\\n\"+\n", + " \"
\"}};\n", + "\n", + " function display_loaded() {\n", + " const el = document.getElementById(\"c0c86951-4266-4199-9794-950b65062b63\");\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS is loading...\";\n", + " }\n", + " if (root.Bokeh !== undefined) {\n", + " if (el != null) {\n", + " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", + " }\n", + " } else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(display_loaded, 100)\n", + " }\n", + " }\n", + "\n", + " function run_callbacks() {\n", + " try {\n", + " root._bokeh_onload_callbacks.forEach(function(callback) {\n", + " if (callback != null)\n", + " callback();\n", + " });\n", + " } finally {\n", + " delete root._bokeh_onload_callbacks\n", + " }\n", + " console.debug(\"Bokeh: all callbacks have finished\");\n", + " }\n", + "\n", + " function load_libs(css_urls, js_urls, callback) {\n", + " if (css_urls == null) css_urls = [];\n", + " if (js_urls == null) js_urls = [];\n", + "\n", + " root._bokeh_onload_callbacks.push(callback);\n", + " if (root._bokeh_is_loading > 0) {\n", + " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", + " return null;\n", + " }\n", + " if (js_urls == null || js_urls.length === 0) {\n", + " run_callbacks();\n", + " return null;\n", + " }\n", + " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", + " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", + "\n", + " function on_load() {\n", + " root._bokeh_is_loading--;\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", + " run_callbacks()\n", + " }\n", + " }\n", + "\n", + " function on_error(url) {\n", + " console.error(\"failed to load \" + url);\n", + " }\n", + "\n", + " for (let i = 0; i < css_urls.length; i++) {\n", + " const url = css_urls[i];\n", + " const element = document.createElement(\"link\");\n", + " element.onload = 
on_load;\n", + " element.onerror = on_error.bind(null, url);\n", + " element.rel = \"stylesheet\";\n", + " element.type = \"text/css\";\n", + " element.href = url;\n", + " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " for (let i = 0; i < js_urls.length; i++) {\n", + " const url = js_urls[i];\n", + " const element = document.createElement('script');\n", + " element.onload = on_load;\n", + " element.onerror = on_error.bind(null, url);\n", + " element.async = false;\n", + " element.src = url;\n", + " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", + " document.head.appendChild(element);\n", + " }\n", + " };\n", + "\n", + " function inject_raw_css(css) {\n", + " const element = document.createElement(\"style\");\n", + " element.appendChild(document.createTextNode(css));\n", + " document.body.appendChild(element);\n", + " }\n", + "\n", + " const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.4.min.js\"];\n", + " const css_urls = [];\n", + "\n", + " const inline_js = [ function(Bokeh) {\n", + " Bokeh.set_log_level(\"info\");\n", + " },\n", + "function(Bokeh) {\n", + " }\n", + " ];\n", + "\n", + " function run_inline_js() {\n", + " if (root.Bokeh !== undefined || force === true) {\n", + " for (let i = 0; i < inline_js.length; i++) {\n", + " inline_js[i].call(root, root.Bokeh);\n", + " }\n", + "if (force === true) {\n", + " display_loaded();\n", + " }} else if (Date.now() < root._bokeh_timeout) {\n", + " setTimeout(run_inline_js, 100);\n", + " } else if (!root._bokeh_failed_load) {\n", + " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", + " 
root._bokeh_failed_load = true;\n", + " } else if (force !== true) {\n", + " const cell = $(document.getElementById(\"c0c86951-4266-4199-9794-950b65062b63\")).parents('.cell').data().cell;\n", + " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", + " }\n", + " }\n", + "\n", + " if (root._bokeh_is_loading === 0) {\n", + " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", + " run_inline_js();\n", + " } else {\n", + " load_libs(css_urls, js_urls, function() {\n", + " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", + " run_inline_js();\n", + " });\n", + " }\n", + "}(window));" + ], + "application/vnd.bokehjs_load.v0+json": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(\"c0c86951-4266-4199-9794-950b65062b63\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = 
document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.4.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\nif (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(\"c0c86951-4266-4199-9794-950b65062b63\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + 
"output_type": "stream", + "text": [ + "Initializing Hail with default parameters...\n", + "/Users/irademac/src/hail/hail/python/hail/backend/backend.py:55: UserWarning:\n", + "\n", + "!!! THIS IS A DEVELOPMENT VERSION OF HAIL !!!\n", + "\n", + "SLF4J: Failed to load class \"org.slf4j.impl.StaticLoggerBinder\".\n", + "SLF4J: Defaulting to no-operation (NOP) logger implementation\n", + "SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.\n", + "Running on Apache Spark version 3.5.0\n", + "SparkUI available at http://localhost:4040\n", + "Welcome to\n", + " __ __ <>__\n", + " / /_/ /__ __/ /\n", + " / __ / _ `/ / /\n", + " /_/ /_/\\_,_/_/_/ version 0.2.132-6a8d0380da57\n", + "LOGGING: writing to /Users/irademac/src/hail/hail/python/hail/docs/tutorials/hail-20240723-1322-0.2.132-6a8d0380da57.log\n", + "SLF4J: Failed to load class \"org.slf4j.impl.StaticMDCBinder\".\n", + "SLF4J: Defaulting to no-operation MDCAdapter implementation.\n", + "SLF4J: See http://www.slf4j.org/codes.html#no_static_mdc_binder for further details.\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
idx
idx_2
idx_3
int32float64float64
04.68e+013.27e+01
18.24e+013.61e+01
27.33e+017.62e+01
38.99e+011.82e+01
44.03e+012.52e+01
55.45e+011.44e+01
66.47e+019.74e+01
76.72e+015.70e+01
85.44e+013.35e+00
94.80e+012.50e+01

showing top 10 rows

\n" + ], + "text/plain": [ + "+-------+----------+----------+\n", + "| idx | idx_2 | idx_3 |\n", + "+-------+----------+----------+\n", + "| int32 | float64 | float64 |\n", + "+-------+----------+----------+\n", + "| 0 | 4.68e+01 | 3.27e+01 |\n", + "| 1 | 8.24e+01 | 3.61e+01 |\n", + "| 2 | 7.33e+01 | 7.62e+01 |\n", + "| 3 | 8.99e+01 | 1.82e+01 |\n", + "| 4 | 4.03e+01 | 2.52e+01 |\n", + "| 5 | 5.45e+01 | 1.44e+01 |\n", + "| 6 | 6.47e+01 | 9.74e+01 |\n", + "| 7 | 6.72e+01 | 5.70e+01 |\n", + "| 8 | 5.44e+01 | 3.35e+00 |\n", + "| 9 | 4.80e+01 | 2.50e+01 |\n", + "+-------+----------+----------+\n", + "showing top 10 rows" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from hail import rand_unif\n", + "from hail.utils import range_table\n", + "\n", + "data = range_table(100)\n", + "data = data.annotate(idx_2=rand_unif(0, 100), idx_3=rand_unif(0, 100))\n", + "data.show()" + ] + }, + { + "cell_type": "markdown", + "id": "860d0a3a", + "metadata": {}, + "source": [ + "## Plot Objects\n", + "\n", + "When creating a plot, we start out with a basic **plot object**, which wraps our data. 
We can create such an object using the [`ggplot`](https://hail.is/docs/0.2/ggplot/index.html#hail.ggplot.ggplot) function, and take a closer look using Python's builtin [`print`](https://docs.python.org/3.9/library/functions.html#print) function:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "45ac0248", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Plot(\n", + " data = ,\n", + " mapping = Mapping(\n", + " x = None,\n", + " y = None,\n", + " ),\n", + " layers = [],\n", + ")\n" + ] + } + ], + "source": [ + "from hail.ggplot2 import ggplot\n", + "\n", + "plot = ggplot(data)\n", + "print(plot)" + ] + }, + { + "cell_type": "markdown", + "id": "b584643d", + "metadata": {}, + "source": [ + "## Aesthetic Mappings\n", + "\n", + "Next, we'll need to specify which values to plot along our x- and y-axes.\n", + "\n", + "We'll use the [`aes`](https://hail.is/docs/0.2/ggplot2/index.html#hail.ggplot2.aes) function to create an **aesthetic mapping**, which links data to visual components of our plot. `aes` assumes its first two arguments are `x` and `y`, and can be passed any other arguments via keyword.\n", + "\n", + "Let's map the `idx` column of our data to the x-axis, and the `idx_2` column to the y-axis:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4bd3ce19", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Plot(\n", + " data = ,\n", + " mapping = Mapping(\n", + " x = idx,\n", + " y = idx_2,\n", + " ),\n", + " layers = [],\n", + ")\n" + ] + } + ], + "source": [ + "from hail.ggplot2 import aes\n", + "\n", + "plot += aes(\"idx\", \"idx_2\")\n", + "print(plot)" + ] + }, + { + "cell_type": "markdown", + "id": "28d8b3c5-f656-4e16-8eb9-f04b38962a96", + "metadata": {}, + "source": [ + "Aesthetic mappings can also be used to specify visual properties of a plot, such as the color of a line, as we'll see in the next section." 
+ ] + }, + { + "cell_type": "markdown", + "id": "d3edd5a4", + "metadata": {}, + "source": [ + "## Layers\n", + "\n", + "Like R's `ggplot2`, Hail `ggplot2` bases its approach to plotting on a [layered grammar of graphics](https://ggplot2-book.org/introduction.html#what-is-the-grammar-of-graphics).\n", + "\n", + "A **layer** is a self-contained collection of visual characteristics, data, and/or aggregations over that data. We'll build up our plot from the data by adding different layers to it.\n", + "\n", + "### Geoms\n", + "\n", + "A **geom** is a type of layer that specifies which kind of plot we're making. For example, `geom_point` indicates that we're making a scatterplot:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "257821d6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Plot(\n", + " data = ,\n", + " mapping = Mapping(\n", + " x = idx,\n", + " y = idx_2,\n", + " ),\n", + " layers = [Layer(mapping=Mapping(x=None, y=None, color=None), data=None, geom='circle', stat='identity', params={})],\n", + ")\n" + ] + } + ], + "source": [ + "from hail.ggplot2 import geom_point\n", + "\n", + "plot += geom_point()\n", + "print(plot)" + ] + }, + { + "cell_type": "markdown", + "id": "6a6ac0ec", + "metadata": {}, + "source": [ + "Since we've now added information about what the plot will look like, we can display it using the `show` function:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "71d0e4a2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from hail.ggplot2 import show\n", + "\n", + "show(plot)" + ] + }, + { + "cell_type": "markdown", + "id": "4fa9d8b9", + "metadata": {}, + "source": [ + "We can also add multiple geoms to a single plot.\n", + "\n", + "For example, we can plot a line on top of our scatterplot using `geom_line`. This line will still plot the values of the `idx` column along the x-axis, but use the `idx_3` column for the y-axis. We can override the `y` mapping for this geom by passing in its own `aes`:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0c266d8f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from hail.ggplot2 import geom_line\n", + "\n", + "plot += geom_line(aes(y=\"idx_3\"))\n", + "show(plot)" + ] + }, + { + "cell_type": "markdown", + "id": "bd233399-39a9-46e0-a125-bfbc044e866e", + "metadata": {}, + "source": [ + "It's a little tough to visually distinguish this line from the scatterplot's points, so let's remove it and re-add it with a different color." + ] + }, + { + "cell_type": "markdown", + "id": "d138d5c4", + "metadata": {}, + "source": [ + "### Removing Layers\n", + "\n", + "With R's `ggplot2`, if you add a layer to a plot object, [it can be tough to remove it](https://stackoverflow.com/questions/50434608/remove-geoms-from-an-existing-ggplot-chart).\n", + "\n", + "However, Hail `ggplot2`'s plot objects keep track of each addition made to them, allowing us to use the `undo` function to roll back a single addition:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "5ffeb79f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from hail.ggplot2 import undo\n", + "\n", + "plot = undo(plot)\n", + "show(plot)" + ] + }, + { + "cell_type": "markdown", + "id": "045ebf6c-713b-4f2c-bd81-e2722daeb917", + "metadata": {}, + "source": [ + "Now we can specify the color of our line by passing its [hexadecimal color code](https://www.w3schools.com/colors/colors_picker.asp) to the `color` keyword of [`aes`](https://hail.is/docs/0.2/ggplot/index.html#hail.ggplot.aes):" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "349211fd-6d5c-4c2d-8f0d-4450a91ddd24", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "plot += geom_line(aes(y=\"idx_3\", color=\"#e3b5ce\"))\n", + "show(plot)" + ] + }, + { + "cell_type": "markdown", + "id": "c3d781bb", + "metadata": {}, + "source": [ + "### Stats\n", + "\n", + "Let's try making another type of plot. First, we can roll back everything we've added so far (the aesthetic mapping and both layers) in one step by passing the `depth` keyword argument to `undo`:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e99dc4c1-c9c1-4138-80c7-90fab37ac121", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Plot(\n", + " data = ,\n", + " mapping = Mapping(\n", + " x = None,\n", + " y = None,\n", + " ),\n", + " layers = [],\n", + ")\n" + ] + } + ], + "source": [ + "plot = undo(plot, depth=3)\n", + "print(plot)" + ] + }, + { + "cell_type": "markdown", + "id": "153f0fbb-1ce2-41c1-a196-e0088baa5b15", + "metadata": {}, + "source": [ + "Now, let's make a histogram using `geom_histogram`:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8a4c6ec3-62a3-463c-8500-3fc449746d32", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from hail.ggplot2 import geom_histogram\n", + "\n", + "plot += geom_histogram(aes(\"idx_2\"))\n", + "show(plot)" + ] + }, + { + "cell_type": "markdown", + "id": "46b31847-534b-4caa-bf4a-565cab1b9fb1", + "metadata": {}, + "source": [ + "`geom_histogram` places the data into 30 bins by default, but what if we want to bin the data differently? We can use the `bins` argument to `geom_histogram`:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "8c20df8d-092b-48f5-85cb-45e486bbbb5b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "plot = undo(plot)\n", + "plot += geom_histogram(aes(\"idx_2\"), bins=3)\n", + "show(plot)" + ] + }, + { + "cell_type": "markdown", + "id": "86193b45-31a4-428e-9e1c-222c632ad7a6", + "metadata": {}, + "source": [ + "TODO: you may have noticed that some geoms, like histogram, implicitly compute some statistics about the data before rendering it. but what if you need to transform your data independently of a geom?\n", + "\n", + "TODO: stats are cached, so when you recompute them, the plot object will attempt to reuse the cached values of previously applied aggregations" + ] + }, + { + "cell_type": "markdown", + "id": "825baee3", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Titles\n", + "\n", + "TODO" + ] + }, + { + "cell_type": "markdown", + "id": "8fdbe273", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Axis Labels\n", + "\n", + "TODO" + ] + }, + { + "cell_type": "markdown", + "id": "9f3b6784", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Scales\n", + "\n", + "TODO: what are scales" + ] + }, + { + "cell_type": "markdown", + "id": "74613547", + "metadata": { + "jp-MarkdownHeadingCollapsed": true + }, + "source": [ + "## Facets\n", + "\n", + "TODO" + ] + }, + { + "cell_type": "markdown", + "id": "4c37a13e", + "metadata": {}, + "source": [ + "## Examples" + ] + }, + { + "cell_type": "markdown", + "id": "bc860479", + "metadata": {}, + "source": [ + "### Cumulative Histogram\n", + "\n", + "TODO" + ] + }, + { + "cell_type": "markdown", + "id": "ac2baeee", + "metadata": {}, + "source": [ + "### 2D Histogram\n", + "\n", + "TODO" + ] + }, + { + "cell_type": "markdown", + "id": "b60781bf", + "metadata": {}, + "source": [ + "### Scatter Plot\n", + "\n", + "TODO" + ] + }, + { + "cell_type": "markdown", + "id": 
"5b802442", + "metadata": {}, + "source": [ + "### QQ Plot\n", + "\n", + "TODO: To create a quantile-quantile (QQ) plot, ..." + ] + }, + { + "cell_type": "markdown", + "id": "633d723b", + "metadata": {}, + "source": [ + "### Manhattan Plot\n", + "\n", + "TODO (use actual genetics data)" + ] + }, + { + "cell_type": "markdown", + "id": "73fdbf30", + "metadata": {}, + "source": [ + "## Common Mistakes\n", + "\n", + "TODO: don't use the class constructors, call the special functions" + ] + }, + { + "cell_type": "markdown", + "id": "c08df783", + "metadata": {}, + "source": [ + "TODO: example specifically demonstrating downsampling" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/hail/python/hail/ggplot2/.ipynb_checkpoints/ggplot2-checkpoint.py b/hail/python/hail/ggplot2/.ipynb_checkpoints/ggplot2-checkpoint.py new file mode 100644 index 00000000000..df390de4d89 --- /dev/null +++ b/hail/python/hail/ggplot2/.ipynb_checkpoints/ggplot2-checkpoint.py @@ -0,0 +1,190 @@ +from dataclasses import asdict, replace +from textwrap import dedent, indent +from typing import Any, Dict, List, Literal, Optional, Tuple, Union + +from altair import X2, Chart, LayerChart, X, Y +from pandas import DataFrame + +import hail as hl +from hail import MatrixTable, Table +from hail.ggplot2.utils import typeguard_dataclass + + +### types ### +Data = Union[Table, MatrixTable] + + +@typeguard_dataclass +class Mapping: + x: Optional[str] + y: Optional[str] + # TODO add the rest of the supported aesthetic names + color: Optional[str] + + +Geom = Literal["bar", "line", "circle"] +Stat = Literal["identity", 
"bin"] + + +@typeguard_dataclass +class Layer: + mapping: Mapping + data: Optional[Data] + geom: Optional[Geom] + stat: Stat + # FIXME if there's only one type per param name we can make this a typeddict + params: Dict[str, Any] + + +@typeguard_dataclass +class Plot: + data: Optional[Data] + mapping: Mapping + layers: list[Layer] + + +### module-level variables ### +_plot_cache: Dict[int, List[Plot]] = {} +_stat_cache: Dict[Tuple[int, ...], Data] = {} + + +### constructor functions ### +def aes(x: Optional[str] = None, y: Optional[str] = None, color: Optional[str] = None) -> Mapping: + return Mapping(x, y, color) + + +def geom_histogram(mapping: Mapping = aes(), data: Optional[Data] = None, bins: int = 30) -> Layer: + return Layer(mapping, data, "bar", "bin", {"bins": bins}) + + +def geom_line(mapping: Mapping = aes(), data: Optional[Data] = None) -> Layer: + return Layer(mapping, data, "line", "identity", {}) + + +def geom_point(mapping: Mapping = aes(), data: Optional[Data] = None) -> Layer: + return Layer(mapping, data, "circle", "identity", {}) + + +def ggplot(data: Optional[Data] = None, mapping: Mapping = aes()) -> Plot: + global _plot_cache + new_plot = Plot(data, mapping, []) + _plot_cache |= {id(new_plot): []} + return new_plot + + +### functionality ### +def extend(plot: Plot, other: Any) -> Plot: + global _plot_cache + kwargs: Optional[Dict[str, Any]] = None + if isinstance(other, Mapping): + kwargs = { + "mapping": replace( + plot.mapping, + **{k: v for k, v in {"x": other.x, "y": other.y, "color": other.color}.items() if v is not None}, + ) + } + elif isinstance(other, Layer): + kwargs = {"layers": [*plot.layers, other]} + + if kwargs is None: + raise ValueError("unsupported addition to plot") + + new_plot = replace(plot, **kwargs) + _plot_cache |= {id(new_plot): _plot_cache[id(plot)] + [plot]} + _plot_cache = {k: v for k, v in _plot_cache.items() if k != id(plot)} + return new_plot + + +setattr(Plot, "__add__", extend) + + +_altair_configure_mark_keys 
= {"color"} +_altair_encode_keys = {"x": X, "x2": X2, "y": Y} + + +def show(plot: Plot) -> Union[Chart, LayerChart]: + global _stat_cache + base_chart = None + for layer in plot.layers: + mapping_dict = {} + for mapping in [plot.mapping, layer.mapping]: + mapping_dict = {**mapping_dict, **{k: v for k, v in asdict(mapping).items() if v is not None}} + # TODO should we break the stat stuff out to its own function? + kwargs = {"x": {}, "x2": {}, "y": {}} + cached = _stat_cache.get((id(plot.data), layer.stat), None) + if cached is not None: + data, df = cached + elif layer.stat == "identity": + data = plot.data + df = data.to_pandas() + elif layer.stat == "bin": + # TODO add caching + x = mapping_dict.get("x", None) + if x is None: + raise ValueError("x must be supplied for stat bin") + data = plot.data.aggregate( + hl.agg.hist( + plot.data[x], + plot.data.aggregate(hl.agg.min(plot.data[x])), + plot.data.aggregate(hl.agg.max(plot.data[x])), + layer.params["bins"], + ) + ) + df = DataFrame([ + {x: data["bin_edges"][i], "x2": data["bin_edges"][i + 1], "y": data["bin_freq"][i]} + for i in range(len(data["bin_freq"])) + ]) + kwargs["x"] = {"bin": "binned"} + mapping_dict["x2"] = "x2" + mapping_dict["y"] = "y" + else: + raise ValueError("unknown stat") + _stat_cache |= {(id(plot.data), layer.stat): (data, df)} + chart = Chart(df) + if layer.geom is not None: + chart = getattr(chart, f"mark_{layer.geom}")(**{ + k: v for k, v in mapping_dict.items() if k in _altair_configure_mark_keys + }) + chart = chart.encode(**{ + k: _altair_encode_keys[k](v, **kwargs[k]) for k, v in mapping_dict.items() if k in _altair_encode_keys + }) + base_chart = chart if base_chart is None else base_chart + chart + return base_chart + + +def undo(plot: Plot, *, depth: int = 1) -> Plot: + global _plot_cache + old_plot = _plot_cache[id(plot)][0 - depth] + _plot_cache |= {id(old_plot): _plot_cache[id(plot)][: 0 - depth]} + _plot_cache = {k: v for k, v in _plot_cache.items() if k != id(plot)} + return 
old_plot
+
+
+## introspection ##
+def plot_to_string(plot: Plot) -> str:
+    """Render ``plot`` as a multi-line ``Plot(...)`` summary; installed below as ``Plot.__str__``."""
+    return dedent(f"""\ + Plot( + data = {plot.data}, + mapping = {indent_tail(str(plot.mapping), 3)}, + layers = {indent_tail(str(plot.layers), 3)}, + )""")
+
+
+def indent_tail(string: str, indent_level: int = 1) -> str:
+    """Indent every line of ``string`` after the first, leaving the first line as is.
+
+    The text is split at its first newline and only the remainder is passed to
+    ``textwrap.indent``, with the prefix repeated ``indent_level`` times.
+    """
+    return "".join([
+        indent(part, " " * indent_level) if index == 2 else part for index, part in enumerate(string.partition("\n"))
+    ])
+
+
+setattr(Plot, "__str__", plot_to_string)
+
+
+def mapping_to_string(mapping: Mapping) -> str:
+    """Render ``mapping`` as a multi-line ``Mapping(...)`` summary; installed below as ``Mapping.__str__``."""
+    # NOTE(review): only `x` and `y` are rendered; `color` is left out -- confirm that is intended.
+    return dedent(f"""\ + Mapping( + x = {mapping.x}, + y = {mapping.y}, + )""")
+
+
+setattr(Mapping, "__str__", mapping_to_string)
diff --git a/hail/python/hail/ggplot2/__init__.py b/hail/python/hail/ggplot2/__init__.py
new file mode 100644
index 00000000000..a41e7235a23
--- /dev/null
+++ b/hail/python/hail/ggplot2/__init__.py
@@ -0,0 +1,29 @@
+from typeguard import install_import_hook
+
+install_import_hook("hail.ggplot2")
+
+# These imports need to be placed after the import hook in order for typechecking to work.
+# https://typeguard.readthedocs.io/en/stable/userguide.html#using-the-import-hook +from .altair_wrapper import ChartWrapper # noqa: E402 +from .ggplot2 import ( # noqa: E402 + aes, + extend, + geom_histogram, + geom_line, + geom_point, + ggplot, + show, + undo, +) + +__all__ = [ + "ChartWrapper", + "aes", + "extend", + "geom_point", + "geom_line", + "geom_histogram", + "ggplot", + "undo", + "show", +] diff --git a/hail/python/hail/ggplot2/altair_wrapper.py b/hail/python/hail/ggplot2/altair_wrapper.py new file mode 100644 index 00000000000..dfe1a55f393 --- /dev/null +++ b/hail/python/hail/ggplot2/altair_wrapper.py @@ -0,0 +1,46 @@ +from typing import Any, Union + +from altair import Chart +from pandas import DataFrame + +import hail +from hail import MatrixTable, Table + +Data = Union[Table, MatrixTable] + + +class ChartWrapper: + def __init__(self, data: Data, *args, **kwargs) -> None: + self.chart_args = args + self.chart_kwargs = kwargs + self.data = data + + def __setattr__(self, name: str, value: Any) -> None: + super().__setattr__(name, value) + if name == "data": + self.update_data() + + def update_data(self) -> None: + self.cache = {} + self.chart = Chart(self.data.to_pandas(), *self.chart_args, **self.chart_kwargs) + + def histogram(self, x: str, bins: int = 30) -> None: + if (aggregated := self.cache.get("histogram", None)) is None: + self.cache["histogram"] = ( + aggregated := self.data.aggregate( + hail.agg.hist( + self.data[x], + self.data.aggregate(hail.agg.min(self.data[x])), + self.data.aggregate(hail.agg.max(self.data[x])), + bins, + ) + ) + ) + self.chart = Chart( + DataFrame([ + {"x": aggregated["bin_edges"][i], "x2": aggregated["bin_edges"][i + 1], "y": aggregated["bin_freq"][i]} + for i in range(len(aggregated["bin_freq"])) + ]), + *self.chart_args, + **self.chart_kwargs, + ) diff --git a/hail/python/hail/ggplot2/ggplot2.py b/hail/python/hail/ggplot2/ggplot2.py new file mode 100644 index 00000000000..0a435aa5d00 --- /dev/null +++ 
b/hail/python/hail/ggplot2/ggplot2.py
@@ -0,0 +1,197 @@
+from dataclasses import asdict, replace
+from textwrap import dedent, indent
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+
+from altair import X2, Chart, LayerChart, X, Y
+from pandas import DataFrame
+
+import hail as hl
+from hail import MatrixTable, Table
+from hail.ggplot2.utils import typeguard_dataclass
+
+# TODO before review
+# expose the underlying altair object
+# write unit tests asserting against that
+# check whether there's stuff you can put in the ggplot interface that doesn't work for histogram
+# apply stat caching to histogram
+# finish histogram section in notebook
+# TODO later
+# extract core wrapper that does hail table -> vega, expose ggplot interface and altair style interface over that
+
+### types ###
+Data = Union[Table, MatrixTable]
+
+
+@typeguard_dataclass
+class Mapping:
+    """Aesthetic mapping; a field left as ``None`` means "not set at this level"."""
+
+    # Values for the x and y aesthetics; `show` passes them to Altair's X / Y encodings.
+    x: Optional[str]
+    y: Optional[str]
+    # TODO add the rest of the supported aesthetic names
+    # Passed to the Altair mark as a keyword argument rather than used as an encoding.
+    color: Optional[str]
+
+
+Geom = Literal["bar", "line", "circle"]
+Stat = Literal["identity", "bin"]
+
+
+@typeguard_dataclass
+class Layer:
+    """One drawable layer: a geom, the stat applied to the data, and layer-level aesthetics."""
+
+    # Layer-level aesthetics; fields that are set take precedence over the plot-level mapping.
+    mapping: Mapping
+    # Layer-specific data.
+    data: Optional[Data]
+    # Suffix of the Altair `mark_<geom>` method used to draw the layer.
+    geom: Optional[Geom]
+    stat: Stat
+    # FIXME if there's only one type per param name we can make this a typeddict
+    # Stat parameters, e.g. {"bins": 30} for the "bin" stat.
+    params: Dict[str, Any]
+
+
+@typeguard_dataclass
+class Plot:
+    """A dataset, its plot-level aesthetic mapping, and the layers added so far."""
+
+    data: Optional[Data]
+    mapping: Mapping
+    layers: list[Layer]
+
+
+### module-level variables ###
+# TODO eviction policy for cache & hl init level config for it
+# id(plot) -> earlier versions of that plot, oldest first; read by `undo`.
+_plot_cache: Dict[int, List[Plot]] = {}
+# Memoised stat results written and read by `show`; each value is a (data, DataFrame) pair.
+_stat_cache: Dict[Tuple[Any, ...], Tuple[Any, DataFrame]] = {}
+
+
+### constructor functions ###
+def aes(x: Optional[str] = None, y: Optional[str] = None, color: Optional[str] = None) -> Mapping:
+    """Create an aesthetic mapping; ``x`` and ``y`` may be given positionally."""
+    return Mapping(x, y, color)
+
+
+def geom_histogram(mapping: Mapping = aes(), data: Optional[Data] = None, bins: int = 30) -> Layer:
+    """Create a histogram layer: ``bar`` marks over the ``bin`` stat with ``bins`` bins."""
+    return Layer(mapping, data, "bar", "bin", {"bins": bins})
+
+
+def geom_line(mapping: Mapping = aes(), data: Optional[Data] 
= None) -> Layer: + return Layer(mapping, data, "line", "identity", {}) + + +def geom_point(mapping: Mapping = aes(), data: Optional[Data] = None) -> Layer: + return Layer(mapping, data, "circle", "identity", {}) + + +def ggplot(data: Optional[Data] = None, mapping: Mapping = aes()) -> Plot: + global _plot_cache + new_plot = Plot(data, mapping, []) + _plot_cache |= {id(new_plot): []} + return new_plot + + +### functionality ### +def extend(plot: Plot, other: Any) -> Plot: + global _plot_cache + kwargs: Optional[Dict[str, Any]] = None + if isinstance(other, Mapping): + kwargs = { + "mapping": replace( + plot.mapping, + **{k: v for k, v in {"x": other.x, "y": other.y, "color": other.color}.items() if v is not None}, + ) + } + elif isinstance(other, Layer): + kwargs = {"layers": [*plot.layers, other]} + if kwargs is None: + raise ValueError("unsupported addition to plot") + new_plot = replace(plot, **kwargs) + _plot_cache |= {id(new_plot): _plot_cache[id(plot)] + [plot]} + _plot_cache = {k: v for k, v in _plot_cache.items() if k != id(plot)} + return new_plot + + +setattr(Plot, "__add__", extend) + + +_altair_configure_mark_keys = {"color"} +_altair_encode_keys = {"x": X, "x2": X2, "y": Y} + + +def show(plot: Plot) -> Union[Chart, LayerChart]: + global _stat_cache + base_chart = None + for layer in plot.layers: + mapping_dict = {} + for mapping in [plot.mapping, layer.mapping]: + mapping_dict = {**mapping_dict, **{k: v for k, v in asdict(mapping).items() if v is not None}} + # TODO should we break the stat stuff out to its own function? 
def undo(plot: Plot, *, depth: int = 1) -> Plot:
    """Return the plot that ``plot`` was derived from, ``depth`` steps back.

    Parameters
    ----------
    plot
        A plot whose history has been recorded in ``_plot_cache``.
    depth
        How many steps to go back; 1 returns the immediate predecessor.

    Raises
    ------
    ValueError
        If ``depth`` is smaller than 1.
    KeyError
        If no history is recorded for ``plot``.
    IndexError
        If ``depth`` exceeds the number of recorded ancestors.
    """
    if depth < 1:
        # 0 would index the *oldest* ancestor and negatives the wrong end.
        raise ValueError(f"depth must be at least 1, got {depth}")
    try:
        history = _plot_cache[id(plot)]
    except KeyError:
        raise KeyError("plot has no recorded history") from None
    if depth > len(history):
        raise IndexError(f"cannot undo {depth} step(s): only {len(history)} recorded")
    old_plot = history[-depth]
    # Re-register the restored plot with the part of the history that precedes
    # it.  The entry of `plot` is kept so that undoing from it again still works.
    _plot_cache[id(old_plot)] = history[:-depth]
    return old_plot


## introspection ##
def plot_to_string(plot: Plot) -> str:
    """Render ``plot`` as an indented, multi-line ``Plot(...)`` description."""
    # Assembled line by line rather than dedenting an interpolated template,
    # so the layout cannot be disturbed by the text of the interpolated values.
    return "\n".join([
        "Plot(",
        f"    data = {plot.data},",
        f"    mapping = {indent_tail(str(plot.mapping))},",
        f"    layers = {indent_tail(str(plot.layers))},",
        ")",
    ])


def indent_tail(string: str, indent_level: int = 1) -> str:
    """Indent every line of ``string`` except the first by ``indent_level`` levels.

    One level is four spaces.  The first line is left untouched so the result
    can be placed right after a ``name = `` prefix.
    """
    head, newline, tail = string.partition("\n")
    return head + newline + indent(tail, "    " * indent_level)
setattr(Plot, "__str__", plot_to_string)


def mapping_to_string(mapping: Mapping) -> str:
    """Render ``mapping`` as an indented, multi-line ``Mapping(...)`` description.

    Every field of the mapping is listed (one ``name = value,`` line each), so
    the output stays complete when aesthetics are added to :class:`Mapping`.
    """
    return "\n".join([
        "Mapping(",
        *(f"    {name} = {value}," for name, value in asdict(mapping).items()),
        ")",
    ])


setattr(Mapping, "__str__", mapping_to_string)

# diff --git a/hail/python/hail/ggplot2/utils.py b/hail/python/hail/ggplot2/utils.py
# new file mode 100644
# index 00000000000..cb84814e44b
# --- /dev/null
# +++ b/hail/python/hail/ggplot2/utils.py
# @@ -0,0 +1,38 @@
from dataclasses import dataclass, fields
from functools import wraps
from typing import Any, Callable, TypeVar, Union

from typeguard import check_type

ReturnType = TypeVar("ReturnType")
WrappedDecorator = Callable[[ReturnType], ReturnType]


def typeguard_dataclass(cls: ReturnType = None, /, **kwargs: Any) -> Union[ReturnType, WrappedDecorator]:
    """
    Creates a `dataclass` that is `frozen` by default and has runtime typechecking for its fields.

    Usable bare (``@typeguard_dataclass``) or with options
    (``@typeguard_dataclass(frozen=False)``); options other than ``frozen`` are
    forwarded to ``dataclasses.dataclass``.

    * frozen (default): a ``__post_init__`` is installed that checks every field
      against its annotation once the instance is built.  It replaces any
      ``__post_init__`` the class defines itself.
    * ``frozen=False``: a ``__setattr__`` is installed that checks each value
      against the field's annotation on assignment and rejects names that are
      not fields with ``TypeError``.
    """
    frozen = kwargs.get("frozen", True)
    dataclass_options = {k: v for k, v in kwargs.items() if k != "frozen"}

    @wraps(dataclass)
    def wrapper(cls: ReturnType) -> ReturnType:
        def __setattr__(obj: ReturnType, name: str, value: Any) -> None:
            field_types = {_field.name: _field.type for _field in fields(obj)}
            if name not in field_types:
                raise TypeError(f"'{getattr(cls, '__name__', str(cls))}' has no field '{name}'.")
            # Zero-argument super() only works in functions defined inside a
            # class body (it needs the implicit __class__ cell); this function
            # is attached afterwards, so the class is named explicitly.
            super(cls, obj).__setattr__(name, check_type(value, field_types[name]))

        def __post_init__(obj: ReturnType) -> None:
            for _field in fields(obj):
                check_type(getattr(obj, _field.name), _field.type)

        if frozen:
            setattr(cls, "__post_init__", __post_init__)
        else:
            setattr(cls, "__setattr__", __setattr__)
        dataclass(cls, frozen=frozen, **dataclass_options)
        return cls

    return wrapper if cls is None else wrapper(cls)


# diff --git a/hail/python/pinned-requirements.txt b/hail/python/pinned-requirements.txt
# index 394193216e0..de398cd6413 100644
# --- a/hail/python/pinned-requirements.txt
# +++ b/hail/python/pinned-requirements.txt
# @@ -1,5 +1,7 @@
# # This file was
autogenerated by uv via the following command: # uv pip compile --python-version 3.9 --python-platform linux hail/python/requirements.txt --output-file=hail/python/pinned-requirements.txt +altair==5 +typeguard==4 aiodns==2.0.0 # via # -c hail/python/hailtop/pinned-requirements.txt diff --git a/hail/python/requirements.txt b/hail/python/requirements.txt index ba04650d8d6..c15da77639f 100644 --- a/hail/python/requirements.txt +++ b/hail/python/requirements.txt @@ -2,6 +2,8 @@ -c dataproc-pre-installed-requirements.txt -r hailtop/requirements.txt +altair==5 +typeguard==4 avro>=1.10,<1.12 bokeh>=3,<3.4 decorator<5