From ac1e0e91c4b19d469d1563e0f885c8f263dd42dd Mon Sep 17 00:00:00 2001 From: Pranav Aurora Date: Wed, 4 Oct 2023 08:35:51 -0700 Subject: [PATCH 1/3] added new notebook --- notebooks/pipelines-query-tuning/meta.toml | 8 ++++++++ notebooks/pipelines-query-tuning/notebook.ipynb | 1 + 2 files changed, 9 insertions(+) create mode 100644 notebooks/pipelines-query-tuning/meta.toml create mode 100644 notebooks/pipelines-query-tuning/notebook.ipynb diff --git a/notebooks/pipelines-query-tuning/meta.toml b/notebooks/pipelines-query-tuning/meta.toml new file mode 100644 index 00000000..a8674aff --- /dev/null +++ b/notebooks/pipelines-query-tuning/meta.toml @@ -0,0 +1,8 @@ +[meta] +title="Kafka Pipelines and Query Tuning" +description="""\ + Create a SingleStore pipeline to track the International Space Station and adjust queries & schema to optimize performance. + """ +tags=["beginner", "kafka", "pipeline","querytuning"] +icon="database" +destinations=["spaces"] diff --git a/notebooks/pipelines-query-tuning/notebook.ipynb b/notebooks/pipelines-query-tuning/notebook.ipynb new file mode 100644 index 00000000..ee5ceaf0 --- /dev/null +++ b/notebooks/pipelines-query-tuning/notebook.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","id":"d4c162a7-e101-475a-9e22-86d045c138ae","metadata":{"language":"sql"},"source":"# Ingesting real time data from the International Space Station (ISS)"},{"cell_type":"markdown","id":"1e182e1d-9395-4fb8-9e1b-62d6d4f91f58","metadata":{"language":"sql"},"source":"### 1. Drop the database if it exists, create a new database, switch to it, and then create a table.\n\n### Database Name\nIn the following cell you will enter your email address as the database name. However, you will need to replace all characters that are not underscores or alpha numberics with underscores.\n\n### Example:\nIf your email address is lorrin.smith-bates@singlestore.com you would use **lorrin_smith_bates_singlestore_com**"},{"cell_type":"code","execution_count":null,"id":"b3daf7fc-c60e-4c3b-a811-3338b94015ca","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nDROP DATABASE IF EXISTS <>;\nCREATE DATABASE <>;\nUSE <>;\nCREATE TABLE iss_location(\n name varchar(10),\n id int,\n latitude float,\n longitude float,\n velocity float,\n visibility varchar(20),\n footprint float,\n timestamp bigint,\n daynum float,\n solar_lat float,\n solar_lon float,\n units varchar(20),\n url varchar(255)\n);"},{"cell_type":"markdown","id":"c4a85e23-6309-4e55-a2d7-39ab529f28a2","metadata":{"language":"sql"},"source":"### 2. Create a SingleStore pipeline to ingest ISS data from a Kafka topic."},{"cell_type":"code","execution_count":null,"id":"6af2f20a-8d80-4ff4-b56a-07db593157a5","metadata":{"language":"sql","scrolled":true,"trusted":true},"outputs":[],"source":"%%sql\n\nCREATE OR REPLACE PIPELINE iss_pipeline AS\n LOAD DATA kafka '100.25.125.23/iss'\n INTO TABLE iss_location\n FORMAT JSON;"},{"cell_type":"markdown","id":"5d7745a0-6649-445d-b1cf-7068cad7d74c","metadata":{"language":"sql"},"source":"### 3. Test the pipeline."},{"cell_type":"code","execution_count":null,"id":"49316e8f-03b1-4dcc-a747-170135ccf7a0","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nTEST PIPELINE iss_pipeline;"},{"cell_type":"markdown","id":"ca166af0-6440-4869-9ce0-6e11caba41e5","metadata":{"language":"sql"},"source":"### 4. Start the Pipeline"},{"cell_type":"code","execution_count":null,"id":"6f36c4ff-a153-4acf-ba20-0b28a0ba75e4","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nSTART PIPELINE iss_pipeline;"},{"cell_type":"markdown","id":"908c7e4c-fa9c-42b1-a164-6f410efb4f05","metadata":{"language":"sql"},"source":"### 5. Get the count of records. Run this a few time to see the number of records ingested."},{"cell_type":"code","execution_count":null,"id":"e97f6065-9f42-4a32-9a15-fa06c4a1dda7","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nSELECT COUNT(*) FROM iss_location;"},{"cell_type":"markdown","id":"06769299-1807-42e0-9a5e-f71ea5bf8dfe","metadata":{"language":"sql"},"source":"### 6. Get the latest location record. Click the link to see the location of the ISS in Google Maps."},{"cell_type":"code","execution_count":null,"id":"46a63ec2-bb8b-4887-b41a-42612e64c756","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nSELECT timestamp, url \n FROM iss_location\n ORDER BY timestamp desc\n LIMIT 1;"},{"cell_type":"markdown","id":"f547c895-5a4a-40c6-9284-14763ac2bea2","metadata":{"language":"sql"},"source":"### 7. Stop the pipeline and delete the data from the iss_location table."},{"cell_type":"code","execution_count":null,"id":"bc4a122a-e86e-405d-a0ed-fb82dcc21816","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nSTOP PIPELINE iss_pipeline;\nDELETE FROM iss_location;"},{"cell_type":"markdown","id":"ccaef975-6a97-42dc-8112-64ae05bfbe70","metadata":{"language":"sql"},"source":"### 8. Change the pipeline offsets and interval."},{"cell_type":"code","execution_count":null,"id":"7efb3adb-bc12-4ca4-8eae-28703020af18","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nALTER PIPELINE iss_pipeline\n SET BATCH_INTERVAL 30000\n SET OFFSETS latest ;"},{"cell_type":"markdown","id":"814bff09-f60e-4c04-9408-9dd65e893c96","metadata":{"language":"sql"},"source":"### 9. Start the Pipeline again."},{"cell_type":"code","execution_count":null,"id":"94e46f4b-597e-4c71-b329-c4a3c3b8fff9","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nSTART PIPELINE iss_pipeline;"},{"cell_type":"markdown","id":"a4a6f369-8ee8-479d-8014-5d528d449e13","metadata":{"language":"sql"},"source":"### 10. Count the records, notice how the records are populated now after alterning the pipeline."},{"cell_type":"code","execution_count":null,"id":"196f0285-0c8f-4e71-8a7d-8237fcdc891e","metadata":{"language":"sql","scrolled":true,"trusted":true},"outputs":[],"source":"%%sql\n\nSELECT COUNT(*) from iss_location;"},{"cell_type":"markdown","id":"a186addf-29fe-4311-b67d-03696ef4bc62","metadata":{"language":"python"},"source":"### 11. Stop the pipeline"},{"cell_type":"code","execution_count":null,"id":"ddf2a7d5-050a-4146-b965-68a2aa6ce92c","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nSTOP PIPELINE iss_pipeline;"},{"cell_type":"markdown","id":"b2e91faf-99e8-4753-9e6d-08e3d703fce3","metadata":{"language":"sql"},"source":"# Query Optimization"},{"cell_type":"markdown","id":"df7cd5f1-20a5-4f6e-8d44-c47def53dfb2","metadata":{"language":"sql"},"source":"### 1. Restore the 'employees' database that has been backed up into a public S3 bucket\n\nFor the database name we'll prepend employee_ to the modified email address again."},{"cell_type":"code","execution_count":37,"id":"cb27d520-2141-450d-b0cf-925c12c32f3c","metadata":{"execution":{"iopub.execute_input":"2023-10-04T07:19:19.436678Z","iopub.status.busy":"2023-10-04T07:19:19.436430Z","iopub.status.idle":"2023-10-04T07:19:29.433057Z","shell.execute_reply":"2023-10-04T07:19:29.432497Z","shell.execute_reply.started":"2023-10-04T07:19:19.436661Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":37,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nRESTORE DATABASE employees AS employees_<>\n FROM S3 'train.memsql.com/employee'\n CONFIG'{\"region\":\"us-east-1\"}'\n CREDENTIALS'{}';"},{"cell_type":"markdown","id":"ba2c8fc3-44c4-4d73-bf36-b9b1aab1b74d","metadata":{"language":"sql"},"source":"### 2. Switch to the Employees database"},{"cell_type":"code","execution_count":38,"id":"bafa5dbc-f149-4465-ab16-83339e515abb","metadata":{"execution":{"iopub.execute_input":"2023-10-04T07:19:33.364708Z","iopub.status.busy":"2023-10-04T07:19:33.364459Z","iopub.status.idle":"2023-10-04T07:19:33.376207Z","shell.execute_reply":"2023-10-04T07:19:33.375717Z","shell.execute_reply.started":"2023-10-04T07:19:33.364692Z"},"language":"sql","scrolled":true,"trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":38,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nUSE employees_<>;"},{"cell_type":"markdown","id":"fbbba65d-5473-4b86-8f8e-ca6835ac1b77","metadata":{"language":"sql"},"source":"### 3. Run a query that joins 4 tables and orders by 4 columns in 3 tables"},{"cell_type":"code","execution_count":null,"id":"41fb7c5b-e007-4d2c-a271-f91bc40a8405","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n%%sql\n\nSELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments d ON de.dept_no=d.dept_no\n INNER JOIN titles t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;"},{"cell_type":"markdown","id":"77c58db6-8dab-4de8-8834-a4ca4e508c1e","metadata":{"language":"sql"},"source":"### 4. Examine the query execution profile using EXPLAIN"},{"cell_type":"code","execution_count":null,"id":"d58506b8-f97f-462c-a0e5-a15a93c8a7a5","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n%%sql\n\nEXPLAIN SELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments d ON de.dept_no=d.dept_no\n INNER JOIN titles t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;"},{"cell_type":"markdown","id":"00b95e97-87f7-456f-b3dc-c6e70c014a40","metadata":{"execution":{"iopub.execute_input":"2023-10-03T20:48:44.146598Z","iopub.status.busy":"2023-10-03T20:48:44.146231Z","iopub.status.idle":"2023-10-03T20:48:44.158430Z","shell.execute_reply":"2023-10-03T20:48:44.157907Z","shell.execute_reply.started":"2023-10-03T20:48:44.146578Z"},"language":"sql"},"source":"### 5. Profile the query by using PROFILE."},{"cell_type":"code","execution_count":null,"id":"0c35c2fd-67b2-4ddd-be8f-4e25f0e851b0","metadata":{"language":"sql","scrolled":true,"trusted":true},"outputs":[],"source":"%%sql\n%%sql\nPROFILE SELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments d ON de.dept_no=d.dept_no\n INNER JOIN titles t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;"},{"cell_type":"markdown","id":"abbbd5b9-d9c2-4a54-b391-fb3a2e9ba0c1","metadata":{"execution":{"iopub.execute_input":"2023-10-03T20:50:23.154097Z","iopub.status.busy":"2023-10-03T20:50:23.153676Z","iopub.status.idle":"2023-10-03T20:50:23.166357Z","shell.execute_reply":"2023-10-03T20:50:23.165799Z","shell.execute_reply.started":"2023-10-03T20:50:23.154071Z"},"language":"sql"},"source":"### 6. Run SHOW PROFILE to view the statistics on an actual run of the query"},{"cell_type":"code","execution_count":null,"id":"a45226d3-4a3e-442e-9800-90654ef8d045","metadata":{"language":"sql","scrolled":true,"trusted":true},"outputs":[],"source":"%%sql\n%%sql\nSHOW PROFILE;"},{"cell_type":"markdown","id":"7c2a606c-15b3-4c67-97b6-e92ed00108fb","metadata":{"language":"sql"},"source":"### 7. Run Visual Profile to see this the profile in a GUI format"},{"cell_type":"markdown","id":"3e9ab748-6b05-40e8-9563-93775d089625","metadata":{"execution":{"iopub.execute_input":"2023-10-03T20:53:09.687210Z","iopub.status.busy":"2023-10-03T20:53:09.686942Z","iopub.status.idle":"2023-10-03T20:53:09.698661Z","shell.execute_reply":"2023-10-03T20:53:09.697929Z","shell.execute_reply.started":"2023-10-03T20:53:09.687193Z"},"language":"sql"},"source":"## Query/Schema Tuning Exercise\n\nNow that we've visualized our query execution plan, let's address some of the issues we've uncovered. \n\n### 1. Create a Reference table for departments"},{"cell_type":"code","execution_count":52,"id":"60599ab6-5c8e-4a49-a2cd-a4b201ff2f84","metadata":{"execution":{"iopub.execute_input":"2023-10-04T07:23:53.176524Z","iopub.status.busy":"2023-10-04T07:23:53.176155Z","iopub.status.idle":"2023-10-04T07:23:53.580877Z","shell.execute_reply":"2023-10-04T07:23:53.580309Z","shell.execute_reply.started":"2023-10-04T07:23:53.176498Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":52,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nCREATE REFERENCE TABLE departments_ref(\n dept_no CHAR(4) not null,\n dept_name varchar(40) not null,\n primary key (dept_no),\n key(dept_name)\n);\n\nINSERT INTO departments_ref (SELECT * FROM departments);"},{"cell_type":"markdown","id":"bf12d27f-a303-4c57-9b33-1d1a3280bfef","metadata":{"language":"sql"},"source":"### 2. Profile the old and the new"},{"cell_type":"code","execution_count":null,"id":"fdba3687-6430-4c30-8b50-177c528e54be","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n%%sql\n%%sql\n-- CONTROL. Here is the original query. We can use this as our control in our experiment.\nSELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments d ON de.dept_no=d.dept_no\n INNER JOIN titles t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;\n\n\n\n-- IMPROVED. Here is the slightly more improved query with the departments_ref table\nSELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments_ref d ON de.dept_no=d.dept_no\n INNER JOIN titles t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;\n\n-- PROFILE them both and observe the differences."},{"cell_type":"markdown","id":"e569d4cc-80b7-4da0-9022-c9bdddac6e01","metadata":{"execution":{"iopub.execute_input":"2023-10-03T20:57:40.110119Z","iopub.status.busy":"2023-10-03T20:57:40.109741Z","iopub.status.idle":"2023-10-03T20:57:40.122251Z","shell.execute_reply":"2023-10-03T20:57:40.121691Z","shell.execute_reply.started":"2023-10-03T20:57:40.110099Z"},"language":"sql"},"source":"### 3. Create a titles table with sort and shard keys defined."},{"cell_type":"code","execution_count":57,"id":"8494132a-38cd-40e7-bcff-3d6a32eb25fb","metadata":{"collapsed":true,"execution":{"iopub.execute_input":"2023-10-04T07:25:33.089168Z","iopub.status.busy":"2023-10-04T07:25:33.088870Z","iopub.status.idle":"2023-10-04T07:25:34.453978Z","shell.execute_reply":"2023-10-04T07:25:34.453445Z","shell.execute_reply.started":"2023-10-04T07:25:33.089149Z"},"jupyter":{"outputs_hidden":true},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":57,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nCREATE TABLE titles_sharded (\n emp_no INT NOT NULL,\n title VARCHAR(50) NOT NULL,\n from_date DATE NOT NULL,\n to_date DATE,\n SORT KEY (emp_no),\n SHARD KEY (emp_no)\n);\n\nINSERT INTO titles_sharded (SELECT * FROM titles);"},{"cell_type":"markdown","id":"aca3590d-4144-4d0d-b2a6-e24ee21cb573","metadata":{"language":"sql"},"source":"### 4. Add shard and sort keys to the dept_emp table"},{"cell_type":"code","execution_count":null,"id":"40327756-cb80-444c-8745-f0646385d8da","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nCREATE TABLE dept_emp_sharded(\n emp_no int not null,\n dept_no char(4) not null,\n from_date date not null,\n to_date date not null,\n SORT KEY (dept_no),\n SHARD KEY(emp_no),\n KEY (dept_no)\n);\n\nINSERT INTO dept_emp_sharded (SELECT * FROM dept_emp);"},{"cell_type":"code","execution_count":null,"id":"7066ff30-e4f1-4105-aaa4-0369af08658d","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nSELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments_ref d ON de.dept_no=d.dept_no\n INNER JOIN titles_sharded t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;"},{"cell_type":"markdown","id":"884c0aeb-ba6a-4e30-bd87-1a23f23ec451","metadata":{"language":"sql"},"source":"### 5. Add shard and sort keys to the employees table"},{"cell_type":"code","execution_count":65,"id":"9b05ca85-78f7-4a4b-b4d9-779af58d8501","metadata":{"execution":{"iopub.execute_input":"2023-10-04T07:28:17.524467Z","iopub.status.busy":"2023-10-04T07:28:17.524187Z","iopub.status.idle":"2023-10-04T07:28:18.601673Z","shell.execute_reply":"2023-10-04T07:28:18.601115Z","shell.execute_reply.started":"2023-10-04T07:28:17.524448Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":65,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nCREATE TABLE employees_sharded (\n emp_no INT NOT NULL,\n birth_date DATE NOT NULL,\n first_name VARCHAR(14) NOT NULL,\n last_name VARCHAR(16) NOT NULL,\n hire_date DATE NOT NULL,\n SORT KEY (emp_no),\n SHARD KEY (emp_no)\n);\n\nINSERT INTO employees_sharded (SELECT * FROM employees);"},{"cell_type":"code","execution_count":null,"id":"431ee5a9-9953-4891-9db5-effd0ec04320","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nSELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees_sharded e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments_ref d ON de.dept_no=d.dept_no\n INNER JOIN titles_sharded t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;"}],"metadata":{"jupyterlab":{"notebooks":{"version_major":6,"version_minor":4}},"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.4"},"singlestore_cell_default_language":"sql","singlestore_connection":{"connectionID":"cffe07c6-154d-4d38-837b-876c941e8706","defaultDatabase":"employees_lorrin_smith_bates_singlestore_com"}},"nbformat":4,"nbformat_minor":5} \ No newline at end of file From 5797cef058b56b49702fe693b13f173266671ab7 Mon Sep 17 00:00:00 2001 From: Pranav Aurora Date: Wed, 4 Oct 2023 09:30:50 -0700 Subject: [PATCH 2/3] ran pre-commit --- .../pipelines-query-tuning/notebook.ipynb | 667 +++++++++++++++++- 1 file changed, 666 insertions(+), 1 deletion(-) diff --git a/notebooks/pipelines-query-tuning/notebook.ipynb b/notebooks/pipelines-query-tuning/notebook.ipynb index ee5ceaf0..d2ee5275 100644 --- a/notebooks/pipelines-query-tuning/notebook.ipynb +++ b/notebooks/pipelines-query-tuning/notebook.ipynb @@ -1 +1,666 @@ -{"cells":[{"cell_type":"markdown","id":"d4c162a7-e101-475a-9e22-86d045c138ae","metadata":{"language":"sql"},"source":"# Ingesting real time data from the International Space Station (ISS)"},{"cell_type":"markdown","id":"1e182e1d-9395-4fb8-9e1b-62d6d4f91f58","metadata":{"language":"sql"},"source":"### 1. Drop the database if it exists, create a new database, switch to it, and then create a table.\n\n### Database Name\nIn the following cell you will enter your email address as the database name. However, you will need to replace all characters that are not underscores or alpha numberics with underscores.\n\n### Example:\nIf your email address is lorrin.smith-bates@singlestore.com you would use **lorrin_smith_bates_singlestore_com**"},{"cell_type":"code","execution_count":null,"id":"b3daf7fc-c60e-4c3b-a811-3338b94015ca","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nDROP DATABASE IF EXISTS <>;\nCREATE DATABASE <>;\nUSE <>;\nCREATE TABLE iss_location(\n name varchar(10),\n id int,\n latitude float,\n longitude float,\n velocity float,\n visibility varchar(20),\n footprint float,\n timestamp bigint,\n daynum float,\n solar_lat float,\n solar_lon float,\n units varchar(20),\n url varchar(255)\n);"},{"cell_type":"markdown","id":"c4a85e23-6309-4e55-a2d7-39ab529f28a2","metadata":{"language":"sql"},"source":"### 2. Create a SingleStore pipeline to ingest ISS data from a Kafka topic."},{"cell_type":"code","execution_count":null,"id":"6af2f20a-8d80-4ff4-b56a-07db593157a5","metadata":{"language":"sql","scrolled":true,"trusted":true},"outputs":[],"source":"%%sql\n\nCREATE OR REPLACE PIPELINE iss_pipeline AS\n LOAD DATA kafka '100.25.125.23/iss'\n INTO TABLE iss_location\n FORMAT JSON;"},{"cell_type":"markdown","id":"5d7745a0-6649-445d-b1cf-7068cad7d74c","metadata":{"language":"sql"},"source":"### 3. Test the pipeline."},{"cell_type":"code","execution_count":null,"id":"49316e8f-03b1-4dcc-a747-170135ccf7a0","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nTEST PIPELINE iss_pipeline;"},{"cell_type":"markdown","id":"ca166af0-6440-4869-9ce0-6e11caba41e5","metadata":{"language":"sql"},"source":"### 4. Start the Pipeline"},{"cell_type":"code","execution_count":null,"id":"6f36c4ff-a153-4acf-ba20-0b28a0ba75e4","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nSTART PIPELINE iss_pipeline;"},{"cell_type":"markdown","id":"908c7e4c-fa9c-42b1-a164-6f410efb4f05","metadata":{"language":"sql"},"source":"### 5. Get the count of records. Run this a few time to see the number of records ingested."},{"cell_type":"code","execution_count":null,"id":"e97f6065-9f42-4a32-9a15-fa06c4a1dda7","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nSELECT COUNT(*) FROM iss_location;"},{"cell_type":"markdown","id":"06769299-1807-42e0-9a5e-f71ea5bf8dfe","metadata":{"language":"sql"},"source":"### 6. Get the latest location record. Click the link to see the location of the ISS in Google Maps."},{"cell_type":"code","execution_count":null,"id":"46a63ec2-bb8b-4887-b41a-42612e64c756","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nSELECT timestamp, url \n FROM iss_location\n ORDER BY timestamp desc\n LIMIT 1;"},{"cell_type":"markdown","id":"f547c895-5a4a-40c6-9284-14763ac2bea2","metadata":{"language":"sql"},"source":"### 7. Stop the pipeline and delete the data from the iss_location table."},{"cell_type":"code","execution_count":null,"id":"bc4a122a-e86e-405d-a0ed-fb82dcc21816","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nSTOP PIPELINE iss_pipeline;\nDELETE FROM iss_location;"},{"cell_type":"markdown","id":"ccaef975-6a97-42dc-8112-64ae05bfbe70","metadata":{"language":"sql"},"source":"### 8. Change the pipeline offsets and interval."},{"cell_type":"code","execution_count":null,"id":"7efb3adb-bc12-4ca4-8eae-28703020af18","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nALTER PIPELINE iss_pipeline\n SET BATCH_INTERVAL 30000\n SET OFFSETS latest ;"},{"cell_type":"markdown","id":"814bff09-f60e-4c04-9408-9dd65e893c96","metadata":{"language":"sql"},"source":"### 9. Start the Pipeline again."},{"cell_type":"code","execution_count":null,"id":"94e46f4b-597e-4c71-b329-c4a3c3b8fff9","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nSTART PIPELINE iss_pipeline;"},{"cell_type":"markdown","id":"a4a6f369-8ee8-479d-8014-5d528d449e13","metadata":{"language":"sql"},"source":"### 10. Count the records, notice how the records are populated now after alterning the pipeline."},{"cell_type":"code","execution_count":null,"id":"196f0285-0c8f-4e71-8a7d-8237fcdc891e","metadata":{"language":"sql","scrolled":true,"trusted":true},"outputs":[],"source":"%%sql\n\nSELECT COUNT(*) from iss_location;"},{"cell_type":"markdown","id":"a186addf-29fe-4311-b67d-03696ef4bc62","metadata":{"language":"python"},"source":"### 11. Stop the pipeline"},{"cell_type":"code","execution_count":null,"id":"ddf2a7d5-050a-4146-b965-68a2aa6ce92c","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n\nSTOP PIPELINE iss_pipeline;"},{"cell_type":"markdown","id":"b2e91faf-99e8-4753-9e6d-08e3d703fce3","metadata":{"language":"sql"},"source":"# Query Optimization"},{"cell_type":"markdown","id":"df7cd5f1-20a5-4f6e-8d44-c47def53dfb2","metadata":{"language":"sql"},"source":"### 1. Restore the 'employees' database that has been backed up into a public S3 bucket\n\nFor the database name we'll prepend employee_ to the modified email address again."},{"cell_type":"code","execution_count":37,"id":"cb27d520-2141-450d-b0cf-925c12c32f3c","metadata":{"execution":{"iopub.execute_input":"2023-10-04T07:19:19.436678Z","iopub.status.busy":"2023-10-04T07:19:19.436430Z","iopub.status.idle":"2023-10-04T07:19:29.433057Z","shell.execute_reply":"2023-10-04T07:19:29.432497Z","shell.execute_reply.started":"2023-10-04T07:19:19.436661Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":37,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nRESTORE DATABASE employees AS employees_<>\n FROM S3 'train.memsql.com/employee'\n CONFIG'{\"region\":\"us-east-1\"}'\n CREDENTIALS'{}';"},{"cell_type":"markdown","id":"ba2c8fc3-44c4-4d73-bf36-b9b1aab1b74d","metadata":{"language":"sql"},"source":"### 2. Switch to the Employees database"},{"cell_type":"code","execution_count":38,"id":"bafa5dbc-f149-4465-ab16-83339e515abb","metadata":{"execution":{"iopub.execute_input":"2023-10-04T07:19:33.364708Z","iopub.status.busy":"2023-10-04T07:19:33.364459Z","iopub.status.idle":"2023-10-04T07:19:33.376207Z","shell.execute_reply":"2023-10-04T07:19:33.375717Z","shell.execute_reply.started":"2023-10-04T07:19:33.364692Z"},"language":"sql","scrolled":true,"trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":38,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nUSE employees_<>;"},{"cell_type":"markdown","id":"fbbba65d-5473-4b86-8f8e-ca6835ac1b77","metadata":{"language":"sql"},"source":"### 3. Run a query that joins 4 tables and orders by 4 columns in 3 tables"},{"cell_type":"code","execution_count":null,"id":"41fb7c5b-e007-4d2c-a271-f91bc40a8405","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n%%sql\n\nSELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments d ON de.dept_no=d.dept_no\n INNER JOIN titles t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;"},{"cell_type":"markdown","id":"77c58db6-8dab-4de8-8834-a4ca4e508c1e","metadata":{"language":"sql"},"source":"### 4. Examine the query execution profile using EXPLAIN"},{"cell_type":"code","execution_count":null,"id":"d58506b8-f97f-462c-a0e5-a15a93c8a7a5","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n%%sql\n\nEXPLAIN SELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments d ON de.dept_no=d.dept_no\n INNER JOIN titles t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;"},{"cell_type":"markdown","id":"00b95e97-87f7-456f-b3dc-c6e70c014a40","metadata":{"execution":{"iopub.execute_input":"2023-10-03T20:48:44.146598Z","iopub.status.busy":"2023-10-03T20:48:44.146231Z","iopub.status.idle":"2023-10-03T20:48:44.158430Z","shell.execute_reply":"2023-10-03T20:48:44.157907Z","shell.execute_reply.started":"2023-10-03T20:48:44.146578Z"},"language":"sql"},"source":"### 5. Profile the query by using PROFILE."},{"cell_type":"code","execution_count":null,"id":"0c35c2fd-67b2-4ddd-be8f-4e25f0e851b0","metadata":{"language":"sql","scrolled":true,"trusted":true},"outputs":[],"source":"%%sql\n%%sql\nPROFILE SELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments d ON de.dept_no=d.dept_no\n INNER JOIN titles t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;"},{"cell_type":"markdown","id":"abbbd5b9-d9c2-4a54-b391-fb3a2e9ba0c1","metadata":{"execution":{"iopub.execute_input":"2023-10-03T20:50:23.154097Z","iopub.status.busy":"2023-10-03T20:50:23.153676Z","iopub.status.idle":"2023-10-03T20:50:23.166357Z","shell.execute_reply":"2023-10-03T20:50:23.165799Z","shell.execute_reply.started":"2023-10-03T20:50:23.154071Z"},"language":"sql"},"source":"### 6. Run SHOW PROFILE to view the statistics on an actual run of the query"},{"cell_type":"code","execution_count":null,"id":"a45226d3-4a3e-442e-9800-90654ef8d045","metadata":{"language":"sql","scrolled":true,"trusted":true},"outputs":[],"source":"%%sql\n%%sql\nSHOW PROFILE;"},{"cell_type":"markdown","id":"7c2a606c-15b3-4c67-97b6-e92ed00108fb","metadata":{"language":"sql"},"source":"### 7. Run Visual Profile to see this the profile in a GUI format"},{"cell_type":"markdown","id":"3e9ab748-6b05-40e8-9563-93775d089625","metadata":{"execution":{"iopub.execute_input":"2023-10-03T20:53:09.687210Z","iopub.status.busy":"2023-10-03T20:53:09.686942Z","iopub.status.idle":"2023-10-03T20:53:09.698661Z","shell.execute_reply":"2023-10-03T20:53:09.697929Z","shell.execute_reply.started":"2023-10-03T20:53:09.687193Z"},"language":"sql"},"source":"## Query/Schema Tuning Exercise\n\nNow that we've visualized our query execution plan, let's address some of the issues we've uncovered. \n\n### 1. Create a Reference table for departments"},{"cell_type":"code","execution_count":52,"id":"60599ab6-5c8e-4a49-a2cd-a4b201ff2f84","metadata":{"execution":{"iopub.execute_input":"2023-10-04T07:23:53.176524Z","iopub.status.busy":"2023-10-04T07:23:53.176155Z","iopub.status.idle":"2023-10-04T07:23:53.580877Z","shell.execute_reply":"2023-10-04T07:23:53.580309Z","shell.execute_reply.started":"2023-10-04T07:23:53.176498Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":52,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nCREATE REFERENCE TABLE departments_ref(\n dept_no CHAR(4) not null,\n dept_name varchar(40) not null,\n primary key (dept_no),\n key(dept_name)\n);\n\nINSERT INTO departments_ref (SELECT * FROM departments);"},{"cell_type":"markdown","id":"bf12d27f-a303-4c57-9b33-1d1a3280bfef","metadata":{"language":"sql"},"source":"### 2. Profile the old and the new"},{"cell_type":"code","execution_count":null,"id":"fdba3687-6430-4c30-8b50-177c528e54be","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\n%%sql\n%%sql\n-- CONTROL. Here is the original query. We can use this as our control in our experiment.\nSELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments d ON de.dept_no=d.dept_no\n INNER JOIN titles t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;\n\n\n\n-- IMPROVED. Here is the slightly more improved query with the departments_ref table\nSELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments_ref d ON de.dept_no=d.dept_no\n INNER JOIN titles t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;\n\n-- PROFILE them both and observe the differences."},{"cell_type":"markdown","id":"e569d4cc-80b7-4da0-9022-c9bdddac6e01","metadata":{"execution":{"iopub.execute_input":"2023-10-03T20:57:40.110119Z","iopub.status.busy":"2023-10-03T20:57:40.109741Z","iopub.status.idle":"2023-10-03T20:57:40.122251Z","shell.execute_reply":"2023-10-03T20:57:40.121691Z","shell.execute_reply.started":"2023-10-03T20:57:40.110099Z"},"language":"sql"},"source":"### 3. Create a titles table with sort and shard keys defined."},{"cell_type":"code","execution_count":57,"id":"8494132a-38cd-40e7-bcff-3d6a32eb25fb","metadata":{"collapsed":true,"execution":{"iopub.execute_input":"2023-10-04T07:25:33.089168Z","iopub.status.busy":"2023-10-04T07:25:33.088870Z","iopub.status.idle":"2023-10-04T07:25:34.453978Z","shell.execute_reply":"2023-10-04T07:25:34.453445Z","shell.execute_reply.started":"2023-10-04T07:25:33.089149Z"},"jupyter":{"outputs_hidden":true},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":57,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nCREATE TABLE titles_sharded (\n emp_no INT NOT NULL,\n title VARCHAR(50) NOT NULL,\n from_date DATE NOT NULL,\n to_date DATE,\n SORT KEY (emp_no),\n SHARD KEY (emp_no)\n);\n\nINSERT INTO titles_sharded (SELECT * FROM titles);"},{"cell_type":"markdown","id":"aca3590d-4144-4d0d-b2a6-e24ee21cb573","metadata":{"language":"sql"},"source":"### 4. Add shard and sort keys to the dept_emp table"},{"cell_type":"code","execution_count":null,"id":"40327756-cb80-444c-8745-f0646385d8da","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nCREATE TABLE dept_emp_sharded(\n emp_no int not null,\n dept_no char(4) not null,\n from_date date not null,\n to_date date not null,\n SORT KEY (dept_no),\n SHARD KEY(emp_no),\n KEY (dept_no)\n);\n\nINSERT INTO dept_emp_sharded (SELECT * FROM dept_emp);"},{"cell_type":"code","execution_count":null,"id":"7066ff30-e4f1-4105-aaa4-0369af08658d","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nSELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments_ref d ON de.dept_no=d.dept_no\n INNER JOIN titles_sharded t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;"},{"cell_type":"markdown","id":"884c0aeb-ba6a-4e30-bd87-1a23f23ec451","metadata":{"language":"sql"},"source":"### 5. Add shard and sort keys to the employees table"},{"cell_type":"code","execution_count":65,"id":"9b05ca85-78f7-4a4b-b4d9-779af58d8501","metadata":{"execution":{"iopub.execute_input":"2023-10-04T07:28:17.524467Z","iopub.status.busy":"2023-10-04T07:28:17.524187Z","iopub.status.idle":"2023-10-04T07:28:18.601673Z","shell.execute_reply":"2023-10-04T07:28:18.601115Z","shell.execute_reply.started":"2023-10-04T07:28:17.524448Z"},"language":"sql","trusted":true},"outputs":[{"data":{"text/html":"\n \n \n \n \n \n \n
","text/plain":"++\n||\n++\n++"},"execution_count":65,"metadata":{},"output_type":"execute_result"}],"source":"%%sql\nCREATE TABLE employees_sharded (\n emp_no INT NOT NULL,\n birth_date DATE NOT NULL,\n first_name VARCHAR(14) NOT NULL,\n last_name VARCHAR(16) NOT NULL,\n hire_date DATE NOT NULL,\n SORT KEY (emp_no),\n SHARD KEY (emp_no)\n);\n\nINSERT INTO employees_sharded (SELECT * FROM employees);"},{"cell_type":"code","execution_count":null,"id":"431ee5a9-9953-4891-9db5-effd0ec04320","metadata":{"language":"sql","trusted":true},"outputs":[],"source":"%%sql\nSELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n FROM employees_sharded e\n INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n INNER JOIN departments_ref d ON de.dept_no=d.dept_no\n INNER JOIN titles_sharded t ON e.emp_no=t.emp_no\n ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n LIMIT 10;"}],"metadata":{"jupyterlab":{"notebooks":{"version_major":6,"version_minor":4}},"kernelspec":{"display_name":"Python 3 (ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.4"},"singlestore_cell_default_language":"sql","singlestore_connection":{"connectionID":"cffe07c6-154d-4d38-837b-876c941e8706","defaultDatabase":"employees_lorrin_smith_bates_singlestore_com"}},"nbformat":4,"nbformat_minor":5} \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5278edf1-1abe-4d62-9623-445379f91ba3", + "metadata": {}, + "source": [ + "
\n", + "
\n", + " \n", + "
\n", + "
\n", + "
SingleStore Notebooks
\n", + "

Kafka Pipelines and Query Tuning

\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "d4c162a7-e101-475a-9e22-86d045c138ae", + "metadata": {}, + "source": [ + "# Ingesting real time data from the International Space Station (ISS)" + ] + }, + { + "cell_type": "markdown", + "id": "1e182e1d-9395-4fb8-9e1b-62d6d4f91f58", + "metadata": {}, + "source": [ + "### 1. Drop the database if it exists, create a new database, switch to it, and then create a table.\n", + "\n", + "### Database Name\n", + "In the following cell you will enter your email address as the database name. However, you will need to replace all characters that are not underscores or alpha numberics with underscores.\n", + "\n", + "### Example:\n", + "If your email address is lorrin.smith-bates@singlestore.com you would use **lorrin_smith_bates_singlestore_com**" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b3daf7fc-c60e-4c3b-a811-3338b94015ca", + "metadata": {}, + "outputs": [], + "source": [ + "modified_email_address = \"insert your email\"\n", + "%%sql\n", + "DROP DATABASE IF EXISTS {{ modified_email_address }};\n", + "CREATE DATABASE {{ modified_email_address }};\n", + "USE {{ modified_email_address }};\n", + "CREATE TABLE iss_location(\n", + " name varchar(10),\n", + " id int,\n", + " latitude float,\n", + " longitude float,\n", + " velocity float,\n", + " visibility varchar(20),\n", + " footprint float,\n", + " timestamp bigint,\n", + " daynum float,\n", + " solar_lat float,\n", + " solar_lon float,\n", + " units varchar(20),\n", + " url varchar(255)\n", + ");" + ] + }, + { + "cell_type": "markdown", + "id": "c4a85e23-6309-4e55-a2d7-39ab529f28a2", + "metadata": {}, + "source": [ + "### 2. Create a SingleStore pipeline to ingest ISS data from a Kafka topic." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6af2f20a-8d80-4ff4-b56a-07db593157a5", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "CREATE OR REPLACE PIPELINE iss_pipeline AS\n", + " LOAD DATA kafka '100.25.125.23/iss'\n", + " INTO TABLE iss_location\n", + " FORMAT JSON;" + ] + }, + { + "cell_type": "markdown", + "id": "5d7745a0-6649-445d-b1cf-7068cad7d74c", + "metadata": {}, + "source": [ + "### 3. Test the pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "49316e8f-03b1-4dcc-a747-170135ccf7a0", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "TEST PIPELINE iss_pipeline;" + ] + }, + { + "cell_type": "markdown", + "id": "ca166af0-6440-4869-9ce0-6e11caba41e5", + "metadata": {}, + "source": [ + "### 4. Start the Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6f36c4ff-a153-4acf-ba20-0b28a0ba75e4", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "START PIPELINE iss_pipeline;" + ] + }, + { + "cell_type": "markdown", + "id": "908c7e4c-fa9c-42b1-a164-6f410efb4f05", + "metadata": {}, + "source": [ + "### 5. Get the count of records. Run this a few time to see the number of records ingested." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e97f6065-9f42-4a32-9a15-fa06c4a1dda7", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT COUNT(*) FROM iss_location;" + ] + }, + { + "cell_type": "markdown", + "id": "06769299-1807-42e0-9a5e-f71ea5bf8dfe", + "metadata": {}, + "source": [ + "### 6. Get the latest location record. Click the link to see the location of the ISS in Google Maps." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "46a63ec2-bb8b-4887-b41a-42612e64c756", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT timestamp, url\n", + " FROM iss_location\n", + " ORDER BY timestamp desc\n", + " LIMIT 1;" + ] + }, + { + "cell_type": "markdown", + "id": "f547c895-5a4a-40c6-9284-14763ac2bea2", + "metadata": {}, + "source": [ + "### 7. Stop the pipeline and delete the data from the iss_location table." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bc4a122a-e86e-405d-a0ed-fb82dcc21816", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "STOP PIPELINE iss_pipeline;\n", + "DELETE FROM iss_location;" + ] + }, + { + "cell_type": "markdown", + "id": "ccaef975-6a97-42dc-8112-64ae05bfbe70", + "metadata": {}, + "source": [ + "### 8. Change the pipeline offsets and interval." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7efb3adb-bc12-4ca4-8eae-28703020af18", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "ALTER PIPELINE iss_pipeline\n", + " SET BATCH_INTERVAL 30000\n", + " SET OFFSETS latest ;" + ] + }, + { + "cell_type": "markdown", + "id": "814bff09-f60e-4c04-9408-9dd65e893c96", + "metadata": {}, + "source": [ + "### 9. Start the Pipeline again." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "94e46f4b-597e-4c71-b329-c4a3c3b8fff9", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "START PIPELINE iss_pipeline;" + ] + }, + { + "cell_type": "markdown", + "id": "a4a6f369-8ee8-479d-8014-5d528d449e13", + "metadata": {}, + "source": [ + "### 10. Count the records, notice how the records are populated now after alterning the pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "196f0285-0c8f-4e71-8a7d-8237fcdc891e", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "SELECT COUNT(*) from iss_location;" + ] + }, + { + "cell_type": "markdown", + "id": "a186addf-29fe-4311-b67d-03696ef4bc62", + "metadata": {}, + "source": [ + "### 11. Stop the pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "ddf2a7d5-050a-4146-b965-68a2aa6ce92c", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "STOP PIPELINE iss_pipeline;" + ] + }, + { + "cell_type": "markdown", + "id": "b2e91faf-99e8-4753-9e6d-08e3d703fce3", + "metadata": {}, + "source": [ + "# Query Optimization" + ] + }, + { + "cell_type": "markdown", + "id": "df7cd5f1-20a5-4f6e-8d44-c47def53dfb2", + "metadata": {}, + "source": [ + "### 1. Restore the 'employees' database that has been backed up into a public S3 bucket\n", + "\n", + "For the database name we'll prepend employee_ to the modified email address again." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "cb27d520-2141-450d-b0cf-925c12c32f3c", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "RESTORE DATABASE employees AS employees_<>\n", + " FROM S3 'train.memsql.com/employee'\n", + " CONFIG'{\"region\":\"us-east-1\"}'\n", + " CREDENTIALS'{}';" + ] + }, + { + "cell_type": "markdown", + "id": "ba2c8fc3-44c4-4d73-bf36-b9b1aab1b74d", + "metadata": {}, + "source": [ + "### 2. Switch to the Employees database" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "bafa5dbc-f149-4465-ab16-83339e515abb", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "USE employees_<>;" + ] + }, + { + "cell_type": "markdown", + "id": "fbbba65d-5473-4b86-8f8e-ca6835ac1b77", + "metadata": {}, + "source": [ + "### 3. Run a query that joins 4 tables and orders by 4 columns in 3 tables" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "41fb7c5b-e007-4d2c-a271-f91bc40a8405", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "%%sql\n", + "\n", + "SELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n", + " FROM employees e\n", + " INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n", + " INNER JOIN departments d ON de.dept_no=d.dept_no\n", + " INNER JOIN titles t ON e.emp_no=t.emp_no\n", + " ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n", + " LIMIT 10;" + ] + }, + { + "cell_type": "markdown", + "id": "77c58db6-8dab-4de8-8834-a4ca4e508c1e", + "metadata": {}, + "source": [ + "### 4. Examine the query execution profile using EXPLAIN" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d58506b8-f97f-462c-a0e5-a15a93c8a7a5", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "%%sql\n", + "\n", + "EXPLAIN SELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n", + " FROM employees e\n", + " INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n", + " INNER JOIN departments d ON de.dept_no=d.dept_no\n", + " INNER JOIN titles t ON e.emp_no=t.emp_no\n", + " ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n", + " LIMIT 10;" + ] + }, + { + "cell_type": "markdown", + "id": "00b95e97-87f7-456f-b3dc-c6e70c014a40", + "metadata": {}, + "source": [ + "### 5. Profile the query by using PROFILE." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0c35c2fd-67b2-4ddd-be8f-4e25f0e851b0", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "%%sql\n", + "PROFILE SELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n", + " FROM employees e\n", + " INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n", + " INNER JOIN departments d ON de.dept_no=d.dept_no\n", + " INNER JOIN titles t ON e.emp_no=t.emp_no\n", + " ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n", + " LIMIT 10;" + ] + }, + { + "cell_type": "markdown", + "id": "abbbd5b9-d9c2-4a54-b391-fb3a2e9ba0c1", + "metadata": {}, + "source": [ + "### 6. Run SHOW PROFILE to view the statistics on an actual run of the query" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a45226d3-4a3e-442e-9800-90654ef8d045", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "%%sql\n", + "SHOW PROFILE;" + ] + }, + { + "cell_type": "markdown", + "id": "7c2a606c-15b3-4c67-97b6-e92ed00108fb", + "metadata": {}, + "source": [ + "### 7. Run Visual Profile to see this the profile in a GUI format" + ] + }, + { + "cell_type": "markdown", + "id": "3e9ab748-6b05-40e8-9563-93775d089625", + "metadata": {}, + "source": [ + "## Query/Schema Tuning Exercise\n", + "\n", + "Now that we've visualized our query execution plan, let's address some of the issues we've uncovered.\n", + "\n", + "### 1. Create a Reference table for departments" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "60599ab6-5c8e-4a49-a2cd-a4b201ff2f84", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "CREATE REFERENCE TABLE departments_ref(\n", + " dept_no CHAR(4) not null,\n", + " dept_name varchar(40) not null,\n", + " primary key (dept_no),\n", + " key(dept_name)\n", + ");\n", + "\n", + "INSERT INTO departments_ref (SELECT * FROM departments);" + ] + }, + { + "cell_type": "markdown", + "id": "bf12d27f-a303-4c57-9b33-1d1a3280bfef", + "metadata": {}, + "source": [ + "### 2. Profile the old and the new" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "fdba3687-6430-4c30-8b50-177c528e54be", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "%%sql\n", + "%%sql\n", + "-- CONTROL. Here is the original query. We can use this as our control in our experiment.\n", + "SELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n", + " FROM employees e\n", + " INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n", + " INNER JOIN departments d ON de.dept_no=d.dept_no\n", + " INNER JOIN titles t ON e.emp_no=t.emp_no\n", + " ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n", + " LIMIT 10;\n", + "\n", + "\n", + "\n", + "-- IMPROVED. Here is the slightly more improved query with the departments_ref table\n", + "SELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n", + " FROM employees e\n", + " INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n", + " INNER JOIN departments_ref d ON de.dept_no=d.dept_no\n", + " INNER JOIN titles t ON e.emp_no=t.emp_no\n", + " ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n", + " LIMIT 10;\n", + "\n", + "-- PROFILE them both and observe the differences." + ] + }, + { + "cell_type": "markdown", + "id": "e569d4cc-80b7-4da0-9022-c9bdddac6e01", + "metadata": {}, + "source": [ + "### 3. Create a titles table with sort and shard keys defined." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "8494132a-38cd-40e7-bcff-3d6a32eb25fb", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "CREATE TABLE titles_sharded (\n", + " emp_no INT NOT NULL,\n", + " title VARCHAR(50) NOT NULL,\n", + " from_date DATE NOT NULL,\n", + " to_date DATE,\n", + " SORT KEY (emp_no),\n", + " SHARD KEY (emp_no)\n", + ");\n", + "\n", + "INSERT INTO titles_sharded (SELECT * FROM titles);" + ] + }, + { + "cell_type": "markdown", + "id": "aca3590d-4144-4d0d-b2a6-e24ee21cb573", + "metadata": {}, + "source": [ + "### 4. Add shard and sort keys to the dept_emp table" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "40327756-cb80-444c-8745-f0646385d8da", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "CREATE TABLE dept_emp_sharded(\n", + " emp_no int not null,\n", + " dept_no char(4) not null,\n", + " from_date date not null,\n", + " to_date date not null,\n", + " SORT KEY (dept_no),\n", + " SHARD KEY(emp_no),\n", + " KEY (dept_no)\n", + ");\n", + "\n", + "INSERT INTO dept_emp_sharded (SELECT * FROM dept_emp);" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "7066ff30-e4f1-4105-aaa4-0369af08658d", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n", + " FROM employees e\n", + " INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n", + " INNER JOIN departments_ref d ON de.dept_no=d.dept_no\n", + " INNER JOIN titles_sharded t ON e.emp_no=t.emp_no\n", + " ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n", + " LIMIT 10;" + ] + }, + { + "cell_type": "markdown", + "id": "884c0aeb-ba6a-4e30-bd87-1a23f23ec451", + "metadata": {}, + "source": [ + "### 5. Add shard and sort keys to the employees table" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "9b05ca85-78f7-4a4b-b4d9-779af58d8501", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "CREATE TABLE employees_sharded (\n", + " emp_no INT NOT NULL,\n", + " birth_date DATE NOT NULL,\n", + " first_name VARCHAR(14) NOT NULL,\n", + " last_name VARCHAR(16) NOT NULL,\n", + " hire_date DATE NOT NULL,\n", + " SORT KEY (emp_no),\n", + " SHARD KEY (emp_no)\n", + ");\n", + "\n", + "INSERT INTO employees_sharded (SELECT * FROM employees);" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "431ee5a9-9953-4891-9db5-effd0ec04320", + "metadata": {}, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT e.first_name, e.last_name, d.dept_name, t.title, t.from_date, t.to_date\n", + " FROM employees_sharded e\n", + " INNER JOIN dept_emp de ON e.emp_no=de.emp_no\n", + " INNER JOIN departments_ref d ON de.dept_no=d.dept_no\n", + " INNER JOIN titles_sharded t ON e.emp_no=t.emp_no\n", + " ORDER BY e.first_name, e.last_name, d.dept_name, t.from_date\n", + " LIMIT 10;" + ] + }, + { + "cell_type": "markdown", + "id": "0485bc25-71c5-45b0-93b3-3d47e9501be3", + "metadata": {}, + "source": [ + "
\n", + "
" + ] + } + ], + "metadata": { + "jupyterlab": { + "notebooks": { + "version_major": 6, + "version_minor": 4 + } + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From aef82849604c5eb41f8662a4e28eedd46b141f7f Mon Sep 17 00:00:00 2001 From: Pranav Aurora Date: Wed, 4 Oct 2023 09:39:12 -0700 Subject: [PATCH 3/3] fixed <> --- notebooks/pipelines-query-tuning/notebook.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/pipelines-query-tuning/notebook.ipynb b/notebooks/pipelines-query-tuning/notebook.ipynb index d2ee5275..f4da0d13 100644 --- a/notebooks/pipelines-query-tuning/notebook.ipynb +++ b/notebooks/pipelines-query-tuning/notebook.ipynb @@ -302,7 +302,7 @@ "outputs": [], "source": [ "%%sql\n", - "RESTORE DATABASE employees AS employees_<>\n", + "RESTORE DATABASE employees AS employees_{{ modified_email_address }}\n", " FROM S3 'train.memsql.com/employee'\n", " CONFIG'{\"region\":\"us-east-1\"}'\n", " CREDENTIALS'{}';"