From a0b54375a62e09a8bc97f20277e1a05efc0e65ed Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Thu, 28 Dec 2023 14:58:59 +0800 Subject: [PATCH] [Minor] Improve English, use better names and data, simplify SQL (#1272) ### What changes were proposed in this pull request? In general, it is better to use more realistic data and to avoid generic names ending in numbers. The SQL also needs to match the amount of data we have in the playground. ### Why are the changes needed? To provide a slightly more realistic example and make SQL easier to understand. Fix: # N/A ### Does this PR introduce _any_ user-facing change? N/A ### How was this patch tested? Ran commands locally to test. Co-authored-by: Justin Mclean --- docs/how-to-use-the-playground.md | 160 ++++++++---------------------- 1 file changed, 43 insertions(+), 117 deletions(-) diff --git a/docs/how-to-use-the-playground.md b/docs/how-to-use-the-playground.md index a254719affc..6209bd708f3 100644 --- a/docs/how-to-use-the-playground.md +++ b/docs/how-to-use-the-playground.md @@ -8,15 +8,13 @@ This software is licensed under the Apache License version 2." ## Playground introduction -Playground is a complete Gravitino Docker runtime environment with `Hive`, `Hdfs`, `Trino`, `MySQL`, `PostgreSQL`, and `Gravitino` server. +The playground is a complete Gravitino Docker runtime environment with `Hive`, `HDFS`, `Trino`, `MySQL`, `PostgreSQL`, and a `Gravitino` server. -Depending on your network, the startup may take 3-5 minutes. +Depending on your network and computer, startup may take 3-5 minutes. Once the playground environment has started, you can open http://localhost:8090 in a browser to access the Gravitino Web UI. -Once the playground environment has started, you can open http://localhost:8090 to access the Gravitino Web UI. +## Prerequisites -## Prerequisite - -You should install git and docker-compose. 
+You first need to install git and docker-compose. ## Start playground @@ -26,15 +24,15 @@ cd gravitino-playground ./launch-playground.sh ``` -## Experience Gravitino with Trino SQL +## Experiencing Gravitino with Trino SQL -1. Login to Gravitino playground Trino Docker container using the following command. +1. Log in to the Gravitino playground Trino Docker container using the following command. ```shell docker exec -it playground-trino bash ```` -2. Open Trino CLI in the container. +2. Open the Trino CLI in the container. ```shell trino@d2bbfccc7432:/$ trino @@ -44,145 +42,73 @@ trino@d2bbfccc7432:/$ trino ### Simple queries -Use simple queries to test in the Trino CLI. +You can use simple queries to test in the Trino CLI. ```SQL SHOW CATALOGS; -CREATE SCHEMA "metalake_demo.catalog_hive".db1 - WITH (location = 'hdfs://hive:9000/user/hive/warehouse/db1.db'); +CREATE SCHEMA "metalake_demo.catalog_hive".company + WITH (location = 'hdfs://hive:9000/user/hive/warehouse/company.db'); -SHOW CREATE SCHEMA "metalake_demo.catalog_hive".db1; +SHOW CREATE SCHEMA "metalake_demo.catalog_hive".company; -CREATE TABLE "metalake_demo.catalog_hive".db1.table_001 +CREATE TABLE "metalake_demo.catalog_hive".company.employees ( name varchar, - salary varchar + salary decimal(10,2) ) WITH ( format = 'TEXTFILE' ); -INSERT INTO "metalake_demo.catalog_hive".db1.table_001 (name, salary) VALUES ('sam', '11'); +INSERT INTO "metalake_demo.catalog_hive".company.employees (name, salary) VALUES ('Sam Evans', 55000); -SELECT * FROM "metalake_demo.catalog_hive".db1.table_001; +SELECT * FROM "metalake_demo.catalog_hive".company.employees; SHOW SCHEMAS from "metalake_demo.catalog_hive"; -DESCRIBE "metalake_demo.catalog_hive".db1.table_001; +DESCRIBE "metalake_demo.catalog_hive".company.employees; -SHOW TABLES from "metalake_demo.catalog_hive".db1; +SHOW TABLES from "metalake_demo.catalog_hive".company; ``` ### Cross-catalog queries -In companies, there may be different departments using different 
data stacks. -In this example, HR department uses Apache Hive to store its data. -Sales department uses PostgreSQL to store its data. -This example has generated some data for two departments. -You can query some interesting results with Gravitino. +In a company, there may be different departments using different data stacks. In this example, the HR department uses PostgreSQL to store its data and the sales department uses Apache Hive to store its data. You can run some interesting queries by joining the two departments' data together with Gravitino. -If you want to know which employee has the largest sales amount. -You can run the SQL. +If you want to know which employee has the largest sales amount, you can run this SQL. ```SQL -WITH totalsales AS ( - SELECT - employee_id, - SUM(total_amount) AS sales_amount - FROM "metalake_demo.catalog_hive".sales.sales - GROUP BY - employee_id -), rankedemployees AS ( - SELECT - employee_id, - sales_amount, - RANK() OVER (ORDER BY sales_amount DESC) AS sales_rank - FROM totalsales -) -SELECT - e.employee_id, - given_name, - family_name, - job_title, - sales_amount -FROM rankedemployees AS r -JOIN "metalake_demo.catalog_postgres".hr.employees AS e - ON r.employee_id = e.employee_id -WHERE - sales_rank = 1; +SET SESSION allow_pushdown_into_connectors=false; +SELECT given_name, family_name, job_title, sum(total_amount) AS total_sales +FROM "metalake_demo.catalog_hive".sales.sales as s, + "metalake_demo.catalog_postgres".hr.employees AS e +where s.employee_id = e.employee_id +GROUP BY given_name, family_name, job_title +ORDER BY total_sales DESC +LIMIT 1; ``` -If you want to know top 10 customers who bought the most by state. -You run the SQL. +If you want to know the top customers who bought the most by state, you can run this SQL. 
```SQL -WITH customersales AS ( - SELECT - "metalake_demo.catalog_hive".sales.customers.customer_id, - customer_name, - customer_email, - location AS state, - SUM(total_amount) AS total_spent - FROM "metalake_demo.catalog_hive".sales.sales - JOIN "metalake_demo.catalog_hive".sales.customers - ON "metalake_demo.catalog_hive".sales.sales.customer_id = "metalake_demo.catalog_hive".sales.customers.customer_id - JOIN "metalake_demo.catalog_hive".sales.stores - ON "metalake_demo.catalog_hive".sales.sales.store_id = "metalake_demo.catalog_hive".sales.stores.store_id - GROUP BY - "metalake_demo.catalog_hive".sales.customers.customer_id, - customer_name, - customer_email, - location -), rankedcustomersales AS ( - SELECT - customer_id, - customer_name, - customer_email, - state, - total_spent, - RANK() OVER (PARTITION BY state ORDER BY total_spent DESC) AS customer_rank - FROM customersales -) -SELECT - customer_id, - customer_name, - customer_email, - state, - total_spent -FROM rankedcustomersales -WHERE - customer_rank <= 10 -ORDER BY - state, - customer_rank; +SELECT customer_name, location, SUM(total_amount) AS total_spent +FROM "metalake_demo.catalog_hive".sales.sales AS s, + "metalake_demo.catalog_hive".sales.stores AS l, + "metalake_demo.catalog_hive".sales.customers AS c +WHERE s.store_id = l.store_id AND s.customer_id = c.customer_id +GROUP BY location, customer_name +ORDER BY location, SUM(total_amount) DESC; ``` -If you want to know that employees average performance rating and total sales. -You run the SQL. +If you want to know each employee's average performance rating and total sales, you can run this SQL. 
```SQL
-set session allow_pushdown_into_connectors=false;
-WITH employeeperformance AS (
- SELECT
- employee_id,
- AVG(rating) AS average_rating
- FROM "metalake_demo.catalog_postgres".hr.employee_performance
- GROUP BY
- employee_id
-), employeesales AS (
- SELECT
- employee_id,
- SUM(total_amount) AS total_sales
- FROM "metalake_demo.catalog_hive".sales.sales
- GROUP BY
- employee_id
-)
-SELECT
- e.employee_id,
- average_rating,
- total_sales
-FROM employeeperformance AS e
-JOIN employeesales AS s
- ON e.employee_id = s.employee_id;
+SET SESSION allow_pushdown_into_connectors=false;
+-- Aggregate ratings and sales per employee BEFORE joining: joining the raw tables would
+-- repeat every sale once per rating row and inflate total_sales.
+SELECT e.employee_id, given_name, family_name, average_rating, total_sales
+FROM "metalake_demo.catalog_postgres".hr.employees AS e
+JOIN (SELECT employee_id, AVG(rating) AS average_rating FROM "metalake_demo.catalog_postgres".hr.employee_performance GROUP BY employee_id) AS p ON p.employee_id = e.employee_id
+JOIN (SELECT employee_id, SUM(total_amount) AS total_sales FROM "metalake_demo.catalog_hive".sales.sales GROUP BY employee_id) AS s ON s.employee_id = e.employee_id;
```