From fcc4c4559b2e34ca2d5d3df378341ddb1e337da6 Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 17 Sep 2024 11:39:51 +0200 Subject: [PATCH] re-enable throw on broken links clear cache in the right places in preprocess docs Bring back links, redirects, reorder Fix caching issue on historic docs Fix some broken links (will need cleanup) --- CONTRIBUTING.md | 2 +- .../website/docs/build-a-pipeline-tutorial.md | 11 +- .../dlt-ecosystem/destinations/bigquery.md | 2 +- .../verified-sources/filesystem/basic.md | 2 +- .../dlt-ecosystem/verified-sources/index.md | 4 +- .../verified-sources/rest_api/basic.md | 4 +- docs/website/docs/general-usage/schema.md | 2 +- docs/website/docs/getting-started.md | 108 ------------------ .../docs/tutorial/load-data-from-an-api.md | 4 +- docs/website/netlify.toml | 16 ++- docs/website/sidebars.js | 90 +++++++-------- docs/website/tools/preprocess_docs.js | 2 + docs/website/tools/update_versions.js | 8 +- 13 files changed, 74 insertions(+), 181 deletions(-) delete mode 100644 docs/website/docs/getting-started.md diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 85dbf37c97..8520736f60 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,7 +4,7 @@ Thank you for considering contributing to **dlt**! We appreciate your help in ma ## Table of Contents -1. [Getting Started](#getting-started) +1. [Getting Started](#intro) 2. [Submitting Changes](#submitting-changes) 3. [Adding or updating core dependencies](#adding-or-updating-core-dependencies) 4. [Linting](#linting) diff --git a/docs/website/docs/build-a-pipeline-tutorial.md b/docs/website/docs/build-a-pipeline-tutorial.md index a7cd4e4050..de1a4d647f 100644 --- a/docs/website/docs/build-a-pipeline-tutorial.md +++ b/docs/website/docs/build-a-pipeline-tutorial.md @@ -7,8 +7,7 @@ keywords: [getting started, quick start, basics] # Building data pipelines with `dlt`, from basic to advanced This in-depth overview will take you through the main areas of pipelining with `dlt`. 
Go to the -related pages you are instead looking for the [quickstart](getting-started.md), or the -[walkthroughs](walkthroughs). +related pages if you are instead looking for the [quickstart](./intro.md). ## Why build pipelines with `dlt`? @@ -49,8 +48,8 @@ normalize, and evolve your data schemas, enabling seamless data integration and For example, let's consider a scenario where you want to load a list of objects into a DuckDB table named "three". With `dlt`, you can create a pipeline and run it with just a few lines of code: -1. [Create a pipeline](walkthroughs/create-a-pipeline.md) to the [destination](dlt-ecosystem/destinations). -1. Give this pipeline data and [run it](walkthroughs/run-a-pipeline.md). +1. [Create a pipeline](./walkthroughs/create-a-pipeline.md) to the [destination](dlt-ecosystem/destinations). +1. Give this pipeline data and [run it](./walkthroughs/run-a-pipeline.md). ```py import dlt @@ -378,7 +377,7 @@ processing and loading the data. Exporting schema files enables you to modify th adjustments to the schema as needed. You can then import the modified schema files back into `dlt` to use them in your pipeline. -Read more: [Adjust a schema docs.](walkthroughs/adjust-a-schema.md) +Read more: [Adjust a schema docs.](./walkthroughs/adjust-a-schema.md) ## Governance Support in `dlt` Pipelines @@ -400,7 +399,7 @@ define the structure of normalized data and guide the processing and loading of predefined schemas, pipelines maintain data integrity and facilitate standardized data handling practices. 
-Read more: [Adjust a schema docs.](walkthroughs/adjust-a-schema.md) +Read more: [Adjust a schema docs.](./walkthroughs/adjust-a-schema.md) ### Schema evolution diff --git a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md index 334e08c4a7..324c712dfc 100644 --- a/docs/website/docs/dlt-ecosystem/destinations/bigquery.md +++ b/docs/website/docs/dlt-ecosystem/destinations/bigquery.md @@ -362,7 +362,7 @@ bigquery_adapter(my_resource, partition="partition_column_name") my_resource = bigquery_adapter(my_resource, partition="partition_column_name") ``` -Refer to the [full API specification](../../api_reference/destinations/impl/bigquery/bigquery_adapter.md) for more details. +Refer to the [full API specification](../../api_reference/destinations/impl/bigquery/bigquery_adapter) for more details. diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md index 359ebb5088..847ff64bf1 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/filesystem/basic.md @@ -42,7 +42,7 @@ print(pipeline.last_trace.last_normalize_info) ### Prerequisites -Please make sure the `dlt` library is installed. Refer to the [installation guide](../../../getting-started). +Please make sure the `dlt` library is installed. Refer to the [installation guide](../../../intro). ### Initialize the filesystem source diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/index.md b/docs/website/docs/dlt-ecosystem/verified-sources/index.md index c846a73eb2..a3d2ba00a7 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/index.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/index.md @@ -12,7 +12,7 @@ Planning to use `dlt` in production and need a source that isn't listed? 
We're h ### Core sources item.label === '30+ SQL Databases' || item.label === 'REST APIs' || item.label === 'Filesystem & buckets' +item => item.label === '30+ SQL Databases' || item.label === 'REST APIs' || item.label === 'Filesystem & cloud storage' )} /> ### Verified sources @@ -24,7 +24,7 @@ If you couldn't find a source implementation, you can easily create your own, ch ::: item.label !== '30+ SQL Databases' && item.label !== 'REST API generic source'&& item.label !== 'Filesystem & buckets' +item => item.label !== '30+ SQL Databases' && item.label !== 'REST APIs' && item.label !== 'Filesystem & cloud storage' )} /> ### What's the difference between core and verified sources? diff --git a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md index 1a28fe4602..e301128dc1 100644 --- a/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md +++ b/docs/website/docs/dlt-ecosystem/verified-sources/rest_api/basic.md @@ -68,7 +68,7 @@ Running this pipeline will create two tables in the DuckDB: `posts` and `comment ### Prerequisites -Please make sure the `dlt` library is installed. Refer to the [installation guide](../../../getting-started). +Please make sure the `dlt` library is installed. Refer to the [installation guide](../../../intro). ### Initialize the REST API source @@ -309,7 +309,7 @@ A resource configuration is used to define a [dlt resource](../../../general-usa - `include_from_parent`: A list of fields from the parent resource to be included in the resource output. See the [resource relationships](#include-fields-from-the-parent-resource) section for more details. - `selected`: A flag to indicate if the resource is selected for loading. This could be useful when you want to load data only from child resources and not from the parent resource. -You can also pass additional resource parameters that will be used to configure the dlt resource. 
See [dlt resource API reference](../../../api_reference/extract/decorators.md#resource) for more details. +You can also pass additional resource parameters that will be used to configure the dlt resource. See [dlt resource API reference](../../../api_reference/extract/decorators#resource) for more details. ### Endpoint configuration diff --git a/docs/website/docs/general-usage/schema.md b/docs/website/docs/general-usage/schema.md index 84693b6078..534d3ca3bd 100644 --- a/docs/website/docs/general-usage/schema.md +++ b/docs/website/docs/general-usage/schema.md @@ -148,7 +148,7 @@ Postgres ignore it when creating tables. Variant columns are generated by a normalizer when it encounters data item with type that cannot be coerced in existing column. Please see our [`coerce_row`](https://github.com/dlt-hub/dlt/blob/7d9baf1b8fdf2813bcf7f1afe5bb3558993305ca/dlt/common/schema/schema.py#L205) if you are interested to see how internally it works. -Let's consider our [getting started](../getting-started#quick-start) example with slightly different approach, +Let's consider our [getting started](../intro) example with slightly different approach, where `id` is an integer type at the beginning ```py diff --git a/docs/website/docs/getting-started.md b/docs/website/docs/getting-started.md deleted file mode 100644 index c24881f94d..0000000000 --- a/docs/website/docs/getting-started.md +++ /dev/null @@ -1,108 +0,0 @@ ---- -title: Getting started -description: quick start with dlt -keywords: [getting started, quick start, basic examples] ---- -import snippets from '!!raw-loader!./getting-started-snippets.py'; - -# Getting started - -## Overview - -`dlt` is an open-source library that you can add to your Python scripts to load data -from various and often messy data sources into well-structured, live datasets. -This guide will show you how to start using `dlt` with a simple example: loading data -from a list of Python dictionaries into DuckDB. - -Let's get started! 
- -## Installation - -Install dlt using `pip`: - -```sh -pip install -U dlt -``` - -The command above installs (or upgrades) the core library, in the example below we -use DuckDB as a destination so let's add a `duckdb` dependency: - -```sh -pip install "dlt[duckdb]" -``` - -:::tip -Use a clean virtual environment for your experiments! Here are [detailed instructions](reference/installation). - -Make sure that your `dlt` version is **0.3.15** or above. Check it in the terminal with `dlt --version`. -::: - -## Quick start - -For starters, let's load a list of Python dictionaries into DuckDB and inspect the created dataset. Here is the code: - - - - -When you look at the code above, you can see that we: -1. Import the `dlt` library. -2. Define our data to load. -3. Create a pipeline that loads data into DuckDB. Here we also specify the `pipeline_name` and `dataset_name`. We'll use both in a moment. -4. Run the pipeline. - -Save this Python script with the name `quick_start_pipeline.py` and run the following command: - -```sh -python quick_start_pipeline.py -``` - -The output should look like: - -```sh -Pipeline quick_start completed in 0.59 seconds -1 load package(s) were loaded to destination duckdb and into dataset mydata -The duckdb destination used duckdb:////home/user-name/quick_start/quick_start.duckdb location to store data -Load package 1692364844.460054 is LOADED and contains no failed jobs -``` - -`dlt` just created a database schema called **mydata** (the `dataset_name`) with a table **users** in it. - -### Explore the data - -To allow sneak peek and basic discovery you can take advantage of [built-in integration with Strealmit](reference/command-line-interface#show-tables-and-data-in-the-destination): - -```sh -dlt pipeline quick_start show -``` - -**quick_start** is the name of the pipeline from the script above. 
If you do not have Streamlit installed yet do: - -```sh -pip install streamlit -``` - -Now you should see the **users** table: - -![Streamlit Explore data](/img/streamlit-new.png) -Streamlit Explore data. Schema and data for a test pipeline “quick_start”. - -:::tip -`dlt` works in Jupyter Notebook and Google Colab! See our [Quickstart Colab Demo.](https://colab.research.google.com/drive/1NfSB1DpwbbHX9_t5vlalBTf13utwpMGx?usp=sharing) - -Looking for source code of all the snippets? You can find and run them [from this repository](https://github.com/dlt-hub/dlt/blob/devel/docs/website/docs/getting-started-snippets.py). -::: - -## What's next? - -Now that you have a basic understanding of how to get started with dlt, you might be eager to dive deeper. A great next step is to walk through our detailed tutorial, where we provide a step-by-step guide to building a pipeline that loads data from the GitHub API into DuckDB and teaches you how to use some of the most important features of dlt. - -[Follow the tutorial →](tutorial/intro) - -More resources: -- [What is a data pipeline in dlt?](general-usage/pipeline) -- [How to create a pipeline](walkthroughs/create-a-pipeline) -- [How to run a pipeline.](walkthroughs/run-a-pipeline) -- [How to configure DuckDB](dlt-ecosystem/destinations/duckdb) -- [The full list of available destinations](dlt-ecosystem/destinations/) -- [Exploring the data](dlt-ecosystem/visualizations/exploring-the-data). 
-- [Destination tables: what happens after loading?](general-usage/destination-tables) \ No newline at end of file diff --git a/docs/website/docs/tutorial/load-data-from-an-api.md b/docs/website/docs/tutorial/load-data-from-an-api.md index 1e40531691..5b1d63373c 100644 --- a/docs/website/docs/tutorial/load-data-from-an-api.md +++ b/docs/website/docs/tutorial/load-data-from-an-api.md @@ -74,7 +74,7 @@ Load package 1692364844.460054 is LOADED and contains no failed jobs ### Explore the data -To allow sneak peek and basic discovery you can take advantage of [built-in integration with Strealmit](reference/command-line-interface#show-tables-and-data-in-the-destination): +To allow sneak peek and basic discovery you can take advantage of [built-in integration with Streamlit](../reference/command-line-interface#show-tables-and-data-in-the-destination): ```sh dlt pipeline quick_start show @@ -558,7 +558,7 @@ That's it! Now you have a reusable source that can load data from any GitHub rep ## What’s next -Congratulations on completing the tutorial! You've come a long way since the [getting started](../getting-started) guide. By now, you've mastered loading data from various GitHub API endpoints, organizing resources into sources, managing secrets securely, and creating reusable sources. You can use these skills to build your own pipelines and load data from any source. +Congratulations on completing the tutorial! You've come a long way since the [getting started](../intro) guide. By now, you've mastered loading data from various GitHub API endpoints, organizing resources into sources, managing secrets securely, and creating reusable sources. You can use these skills to build your own pipelines and load data from any source. Interested in learning more? Here are some suggestions: 1. You've been running your pipelines locally. Learn how to [deploy and run them in the cloud](../walkthroughs/deploy-a-pipeline/). 
diff --git a/docs/website/netlify.toml b/docs/website/netlify.toml index 919ad09aea..76c0a15c03 100644 --- a/docs/website/netlify.toml +++ b/docs/website/netlify.toml @@ -6,18 +6,27 @@ to = "/docs/intro" from = "/docs" to = "/docs/intro" +[[redirects]] +from = "/docs/getting-started" +to = "/docs/intro" + [[redirects]] from = "/docs/dlt-ecosystem" to = "/docs/dlt-ecosystem/verified-sources" [[redirects]] from = "/docs/general-usage/credentials/config_providers" -to = "/docs/general-usage/credentials" +to = "/docs/general-usage/credentials/setup" [[redirects]] from = "/docs/general-usage/credentials/configuration" to = "/docs/general-usage/credentials/setup" +[[redirects]] +from = "/docs/general-usage/credentials/config_specs" +to = "/docs/general-usage/credentials/complex_types" +status = 301 + [[redirects]] from = "/docs/tutorial/intro" to = "docs/tutorial/load-data-from-an-api" @@ -25,8 +34,3 @@ to = "docs/tutorial/load-data-from-an-api" [[redirects]] from = "/docs/tutorial/grouping-resources" to = "docs/tutorial/load-data-from-an-api" - -[[redirects]] -from = "/docs/general-usage/credentials/config_specs" -to = "/docs/general-usage/credentials" -status = 301 diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 68a05fa1f9..710c3ac57c 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -94,67 +94,59 @@ const sidebars = { type: 'category', label: '30+ SQL Databases', description: 'PostgreSQL, MySQL, MS SQL, BigQuery, Redshift, and more', - link: { + link: { type: 'doc', id: 'dlt-ecosystem/verified-sources/sql_database/index', - }, + }, items: [ 'dlt-ecosystem/verified-sources/sql_database/setup', 'dlt-ecosystem/verified-sources/sql_database/configuration', 'dlt-ecosystem/verified-sources/sql_database/usage', 'dlt-ecosystem/verified-sources/sql_database/troubleshooting', - 'dlt-ecosystem/verified-sources/sql_database/advanced' + 'dlt-ecosystem/verified-sources/sql_database/advanced', ] }, - + 
'dlt-ecosystem/verified-sources/airtable', + 'dlt-ecosystem/verified-sources/amazon_kinesis', + 'dlt-ecosystem/verified-sources/arrow-pandas', + 'dlt-ecosystem/verified-sources/asana', + 'dlt-ecosystem/verified-sources/chess', + 'dlt-ecosystem/verified-sources/facebook_ads', + 'dlt-ecosystem/verified-sources/freshdesk', + 'dlt-ecosystem/verified-sources/github', + 'dlt-ecosystem/verified-sources/google_ads', + 'dlt-ecosystem/verified-sources/google_analytics', + 'dlt-ecosystem/verified-sources/google_sheets', + 'dlt-ecosystem/verified-sources/hubspot', + 'dlt-ecosystem/verified-sources/inbox', + 'dlt-ecosystem/verified-sources/jira', + 'dlt-ecosystem/verified-sources/kafka', + 'dlt-ecosystem/verified-sources/matomo', + 'dlt-ecosystem/verified-sources/mongodb', + 'dlt-ecosystem/verified-sources/mux', + 'dlt-ecosystem/verified-sources/notion', + 'dlt-ecosystem/verified-sources/personio', + 'dlt-ecosystem/verified-sources/pg_replication', + 'dlt-ecosystem/verified-sources/pipedrive', + 'dlt-ecosystem/verified-sources/openapi-generator', + 'dlt-ecosystem/verified-sources/salesforce', + 'dlt-ecosystem/verified-sources/scrapy', + 'dlt-ecosystem/verified-sources/shopify', + 'dlt-ecosystem/verified-sources/slack', + 'dlt-ecosystem/verified-sources/strapi', + 'dlt-ecosystem/verified-sources/stripe', + 'dlt-ecosystem/verified-sources/workable', + 'dlt-ecosystem/verified-sources/zendesk', { type: 'category', - label: 'All verified sources', - description: 'All our verified sources', + label: 'REST API helpers', + link: { + type: 'doc', + id: 'general-usage/http/overview', + }, items: [ - 'dlt-ecosystem/verified-sources/airtable', - 'dlt-ecosystem/verified-sources/amazon_kinesis', - 'dlt-ecosystem/verified-sources/arrow-pandas', - 'dlt-ecosystem/verified-sources/asana', - 'dlt-ecosystem/verified-sources/chess', - 'dlt-ecosystem/verified-sources/facebook_ads', - 'dlt-ecosystem/verified-sources/freshdesk', - 'dlt-ecosystem/verified-sources/github', - 
'dlt-ecosystem/verified-sources/google_ads', - 'dlt-ecosystem/verified-sources/google_analytics', - 'dlt-ecosystem/verified-sources/google_sheets', - 'dlt-ecosystem/verified-sources/hubspot', - 'dlt-ecosystem/verified-sources/inbox', - 'dlt-ecosystem/verified-sources/jira', - 'dlt-ecosystem/verified-sources/kafka', - 'dlt-ecosystem/verified-sources/matomo', - 'dlt-ecosystem/verified-sources/mongodb', - 'dlt-ecosystem/verified-sources/mux', - 'dlt-ecosystem/verified-sources/notion', - 'dlt-ecosystem/verified-sources/personio', - 'dlt-ecosystem/verified-sources/pg_replication', - 'dlt-ecosystem/verified-sources/pipedrive', - 'dlt-ecosystem/verified-sources/openapi-generator', - 'dlt-ecosystem/verified-sources/salesforce', - 'dlt-ecosystem/verified-sources/scrapy', - 'dlt-ecosystem/verified-sources/shopify', - 'dlt-ecosystem/verified-sources/slack', - 'dlt-ecosystem/verified-sources/strapi', - 'dlt-ecosystem/verified-sources/stripe', - 'dlt-ecosystem/verified-sources/workable', - 'dlt-ecosystem/verified-sources/zendesk', - { - type: 'category', - label: 'REST API helpers', - link: { - type: 'doc', - id: 'general-usage/http/overview', - }, - items: [ - 'general-usage/http/rest-client', - 'general-usage/http/requests', - ] - }, + 'general-usage/http/rest-client', + 'general-usage/http/requests', ] }, 'walkthroughs/add-a-verified-source', diff --git a/docs/website/tools/preprocess_docs.js b/docs/website/tools/preprocess_docs.js index 28b1d11474..c5c0c33246 100644 --- a/docs/website/tools/preprocess_docs.js +++ b/docs/website/tools/preprocess_docs.js @@ -358,6 +358,7 @@ function syncExamples() { console.log(`Synced ${count} examples`) } +fs.rmSync(MD_TARGET_DIR, {force: true, recursive: true}) syncExamples(); preprocess_docs(); @@ -372,6 +373,7 @@ if (process.argv.includes("--watch")) { if (Date.now() - lastUpdate < 500) { return; } + fs.rmSync(MD_TARGET_DIR, {force: true, recursive: true}) console.log('%s changed...', name); syncExamples(); preprocess_docs(); diff --git 
a/docs/website/tools/update_versions.js b/docs/website/tools/update_versions.js index 855766c5dd..ff188adb80 100644 --- a/docs/website/tools/update_versions.js +++ b/docs/website/tools/update_versions.js @@ -6,6 +6,7 @@ const semver = require('semver') // const const REPO_DIR = ".dlt-repo" const REPO_DOCS_DIR = REPO_DIR + "/docs/website" +const REPO_PREPROCESSED_FILES_DIR = REPO_DOCS_DIR + "/docs_processed" const REPO_URL = "https://github.com/dlt-hub/dlt.git" const VERSIONED_DOCS_FOLDER = "versioned_docs" const VERSIONED_SIDEBARS_FOLDER = "versioned_sidebars" @@ -19,6 +20,7 @@ fs.rmSync(REPO_DIR, { recursive: true, force: true }) // checkout fresh console.log("Checking out dlt repo") +fs.rmSync(REPO_DIR, {force: true, recursive: true}) proc.execSync(`git clone ${REPO_URL} ${REPO_DIR}`) // find tags @@ -91,6 +93,9 @@ for (const version of selectedVersions) { // process.exit(1) // } + // clear preprocessed docs in subrepo + fs.rmSync(REPO_PREPROCESSED_FILES_DIR, { force: true, recursive: true}) + // build doc version, we also run preprocessing and markdown gen for each doc version console.log(`Building docs...`) proc.execSync(`cd ${REPO_DOCS_DIR} && npm run preprocess-docs && PYTHONPATH=. pydoc-markdown`) @@ -100,8 +105,7 @@ for (const version of selectedVersions) { console.log(`Moving snapshot`) fs.cpSync(REPO_DOCS_DIR+"/"+VERSIONED_DOCS_FOLDER, VERSIONED_DOCS_FOLDER, {recursive: true}) - fs.cpSync(REPO_DOCS_DIR+"/"+VERSIONED_SIDEBARS_FOLDER, VERSIONED_SIDEBARS_FOLDER, {recursive: true}) - + fs.cpSync(REPO_DOCS_DIR+"/"+VERSIONED_SIDEBARS_FOLDER, VERSIONED_SIDEBARS_FOLDER, {recursive: true}) } fs.cpSync(REPO_DOCS_DIR+"/versions.json", "versions.json")