diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index d71b0de..6ff3292 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,7 +1,4 @@
 {
-    "name": "Datasets",
-    "build": {
-        "dockerfile": "../Dockerfile",
-        "context": ".."
-    }
+    "name": "Datonic Hub",
+    "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye"
 }
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..a60807f
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "devcontainers"
+    directory: "/"
+    schedule:
+      interval: weekly
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index a7fd503..0000000
--- a/Dockerfile
+++ /dev/null
@@ -1,6 +0,0 @@
-FROM mcr.microsoft.com/devcontainers/python:3.11
-
-# Install requirements
-COPY requirements.txt /tmp/pip-tmp/
-RUN pip3 --no-cache-dir install -r /tmp/pip-tmp/requirements.txt \
-    && rm -rf /tmp/pip-tmp
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..48d8dee
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,5 @@
+venv:
+	@command -v uv >/dev/null 2>&1 || pip install -U uv
+	uv venv
+	uv pip install -U -r requirements.txt
+	@echo "Run 'source .venv/bin/activate' to activate the environment"
diff --git a/README.md b/README.md
index a5a8df8..fbb1b14 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,8 @@
 # 📦 Datonic Hub
 
-The center of the Datonic community.
+The center of the Datonic community. A place to improve the way the world produces, shares, consumes, and collaborates on open datasets.
 
-We aim to improve the way the world produces, share, consume and collaborate on open datasets.
-We aim for a world that produces **open data** with **open source software** using **open protocols** running on **open infrastructure**.
+Aiming to share **open data** generated with **open source software** using **open protocols** running on **open infrastructure**.
 
 ## 📖 Documentation
diff --git a/docs/FAQ.md b/docs/FAQ.md
index 3f73f14..409b33e 100644
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -1,6 +1,6 @@
 # FAQ
 
-Please open an issue if you have any other question!
+[Open an issue](https://github.com/datonic/hub/issues/new) if you have questions!
 
 ## Why Frictionless?
 
@@ -16,7 +16,7 @@ We need to solve the problem of "packaging data" as a community. Frictionless is
 
 I've [tried quite a bunch of Data Package Managers](https://publish.obsidian.md/davidgasquez/Open+Data#Data+Package+Managers). Frictionless is the simplest and most flexible one. It also has a reasonable adoption and active community.
 
-That said, I'm open to other options. If you have a better idea, please open an issue and let's chat!
+That said, I'm open to other options. If you have a better idea, [let's chat](https://davidgasquez.com/)!
 
 ### How would you make datasets immutable?
 
@@ -31,7 +31,7 @@ resources:
     scheme: ipfs
 ```
 
-In the end, the Frictionless abstraction is just a URL. We can use anything we want in the backend as long as we provide a way to read the data. In this case:
+In the end, the Frictionless abstraction is just a URL. We can use anything we want in the backend as long as we provide a way to read the data. In this case:
 
 ```python
 ipfs_package = Package("my-dataset-datapackage.yaml") # Could even be Package("bafyreca4sf...")
 ipfs_resource.sql("SELECT * FROM my-data")
 ```
 
 ### How would you backup datasets?
 
-An easy and cheap way to backup datasets is to preiodically backup the data resources on IPFS/Filecoin. This can be done using GitHub Actions and [Estuary](https://estuary.tech/)/[web3.storage](https://web3.storage/). Once the data in there, we can rely on the [`_cache` property of the Frictionless Specs](https://specs.frictionlessdata.io/patterns/#caching-of-resources) (or a `_backup` one) to point to the IPFS CID.
+Depending on the dataset, this feature could be pushed to the hosting layer. If you publish on HuggingFace, you get versioning and backup for free! Once the data is in there, we can rely on the [`_cache` property of the Frictionless Specs](https://specs.frictionlessdata.io/patterns/#caching-of-resources) (or a `_backup` one) to point to the previous backup.
 
 ### How would you make datasets discoverable?
 
@@ -89,7 +89,7 @@ Some interesting plugins ideas might be to integrate with Socrata ([Simon Wilson
 
 ### How would you make datasets reproducible?
 
-Need more thought but probably using something like Bacalhau to run the pipelines.
+By versioning the code and the data together, it should be possible to reproduce the dataset. The easiest way to do this is by publishing datasets via GitHub Actions; this way, the code and the data are always in sync. Furthermore, attaching a Docker image and Dev Container environment makes it easy to reproduce the dataset in any environment.
 
 ### How would you make datasets versioned?
 
@@ -108,13 +108,9 @@ Yes, the new LLM models could help with this vision. A few things that could be
 
 - Extract data and generate resources from anything. Define the schema and let GPT-N do the rest. [Some projects are already working on this](https://jamesturk.github.io/scrapeghost/).
 - Can datapackages be written in natural language? Can we use GPT-N to generate them? The same way [plugins are starting to be written for ChatGPT](https://raw.githubusercontent.com/openai/chatgpt-retrieval-plugin/336ff64b96ef23bda164ab94ca6f349607bbc5b6/.well-known/ai-plugin.json) that only requires a `description_for_model` text. Could something like this work on data packages. Embeddings become the flexible metadata we all want.
 
-### How does Frictionless Data compare to other data management or data packaging tools?
-
-TODO: Explain how the project fits into the larger open data ecosystem and how it relates to other similar projects.
-
 ### Can Frictionless be used for non-tabular data formats?
 
-TODO: Explain how the project can be used for non-tabular data formats and add examples.
+Yes! It is probably not the best fit, but the basic idea would be to have a table pointing to the URI of the non-tabular data. For example, you could have a dataset of sounds, images, or videos by having a column with the URI of each file.
 
 ### Why should people use Frictionless Data?
 
diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md
index b00052b..843b5f6 100644
--- a/docs/ROADMAP.md
+++ b/docs/ROADMAP.md
@@ -1,25 +1,22 @@
 # ROADMAP
 
-## Overview
+## Goal
 
-Align the way we package data as an ecosystem.
+Create better ways to produce **open data** with **open source software** using **open protocols** running on **open infrastructure**.
 ## Milestones
 
 ### 0.1
 
-- [ ] Document how to backup datasets to IPFS
-- [ ] Create a Catalog of existing datasets
-- [ ] Make datasets retrievable via gateways
-- [ ] Make datasets retrievable via IPFS with `fsspec`
+- [ ] Create a sample repository for creating and sharing datasets
+- [ ] Make datasets easily retrievable
+- [ ] Make datasets discoverable
 - [ ] Early community reach out to look for potential datasets to package and collaborate on
 
 ### 0.2
 
-- [ ] Write HuggingFace plugin
-- [ ] Write Socrata plugin
-- [ ] Backup HuggingFace and Socrata datasets to IPFS/Filecoin
-- [ ] Integrate with other community projects like [OpSci Commons](https://commons.opsci.io/), [OpenNeuro](https://openneuro.org/), [OpenPanda](https://openpanda.io/).
+- [ ] Back up datasets to multiple locations
+- [ ] Automate dataset format conversion
 
 ### 0.3
diff --git a/docs/working-group.md b/docs/working-group.md
index 50107a5..15f95f0 100644
--- a/docs/working-group.md
+++ b/docs/working-group.md
@@ -1,6 +1,6 @@
 # 📦 Open Data Working Group
 
-Exploring a better way to produces **open data** with **open source software** using **open protocols** running on **open infrastructure**.
+Exploring better ways to produce **open data** with **open source software** using **open protocols** running on **open infrastructure**.
 
 ## 🧑‍🦱 Interesting Folks
diff --git a/notebooks/quickstart.ipynb b/notebooks/quickstart.ipynb
index ebb0731..1bfbd1d 100644
--- a/notebooks/quickstart.ipynb
+++ b/notebooks/quickstart.ipynb
@@ -13,11 +13,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from frictionless import Package\n"
+    "from frictionless import Package"
    ]
   },
   {
@@ -32,14 +32,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Overwriting /tmp/external_data_datapackage.yaml\n"
+      "Writing /tmp/external_data_datapackage.yaml\n"
      ]
     }
    ],
@@ -82,16 +82,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
-    "p = Package(\"/tmp/external_data_datapackage.yaml\")\n"
+    "p = Package(\"/tmp/external_data_datapackage.yaml\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -125,43 +125,43 @@
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
-      "      <td>2013</td>\n",
+      "      <td>2014</td>\n",
       "      <td>1</td>\n",
-      "      <td>1</td>\n",
-      "      <td>395.18</td>\n",
-      "      <td>394.28</td>\n",
+      "      <td>18</td>\n",
+      "      <td>397.81</td>\n",
+      "      <td>396.46</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
-      "      <td>2013</td>\n",
+      "      <td>2014</td>\n",
       "      <td>1</td>\n",
-      "      <td>2</td>\n",
-      "      <td>395.20</td>\n",
-      "      <td>394.29</td>\n",
+      "      <td>19</td>\n",
+      "      <td>397.83</td>\n",
+      "      <td>396.47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
-      "      <td>2013</td>\n",
+      "      <td>2014</td>\n",
       "      <td>1</td>\n",
-      "      <td>3</td>\n",
-      "      <td>395.22</td>\n",
-      "      <td>394.29</td>\n",
+      "      <td>20</td>\n",
+      "      <td>397.85</td>\n",
+      "      <td>396.48</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
-      "      <td>2013</td>\n",
+      "      <td>2014</td>\n",
       "      <td>1</td>\n",
-      "      <td>4</td>\n",
-      "      <td>395.24</td>\n",
-      "      <td>394.30</td>\n",
+      "      <td>21</td>\n",
+      "      <td>397.87</td>\n",
+      "      <td>396.48</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
-      "      <td>2013</td>\n",
+      "      <td>2014</td>\n",
       "      <td>1</td>\n",
-      "      <td>5</td>\n",
-      "      <td>395.27</td>\n",
-      "      <td>394.31</td>\n",
+      "      <td>22</td>\n",
+      "      <td>397.89</td>\n",
+      "      <td>396.49</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   year  month  day  smoothed   trend\n",
-      "0  2013      1    1    395.18  394.28\n",
-      "1  2013      1    2    395.20  394.29\n",
-      "2  2013      1    3    395.22  394.29\n",
-      "3  2013      1    4    395.24  394.30\n",
-      "4  2013      1    5    395.27  394.31"
+      "0  2014      1   18    397.81  396.46\n",
+      "1  2014      1   19    397.83  396.47\n",
+      "2  2014      1   20    397.85  396.48\n",
+      "3  2014      1   21    397.87  396.48\n",
+      "4  2014      1   22    397.89  396.49"
      ]
     },
-    "execution_count": 6,
+    "execution_count": 4,
"metadata": {}, "output_type": "execute_result" } ], "source": [ - "p.resources[0].to_pandas().head()\n" + "p.resources[0].to_pandas().head()" ] }, { @@ -222,16 +222,16 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "from frictionless import Catalog\n" + "from frictionless import Catalog" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -261,16 +261,16 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "c = Catalog(\"/tmp/datapackage_catalog.yaml\")\n" + "c = Catalog(\"/tmp/datapackage_catalog.yaml\")" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -300,29 +300,29 @@ " \n", " \n", " \n", - " 9101\n", - " 2023-03-28\n", - " 78.07\n", + " 9215\n", + " 2023-09-12\n", + " 93.58\n", " \n", " \n", - " 9102\n", - " 2023-03-29\n", - " 77.51\n", + " 9216\n", + " 2023-09-13\n", + " 93.04\n", " \n", " \n", - " 9103\n", - " 2023-03-30\n", - " 78.45\n", + " 9217\n", + " 2023-09-14\n", + " 95.20\n", " \n", " \n", - " 9104\n", - " 2023-03-31\n", - " 79.19\n", + " 9218\n", + " 2023-09-15\n", + " 95.55\n", " \n", " \n", - " 9105\n", - " 2023-04-03\n", - " 85.81\n", + " 9219\n", + " 2023-09-18\n", + " 95.95\n", " \n", " \n", "\n", @@ -330,20 +330,20 @@ ], "text/plain": [ " Date Price\n", - "9101 2023-03-28 78.07\n", - "9102 2023-03-29 77.51\n", - "9103 2023-03-30 78.45\n", - "9104 2023-03-31 79.19\n", - "9105 2023-04-03 85.81" + "9215 2023-09-12 93.58\n", + "9216 2023-09-13 93.04\n", + "9217 2023-09-14 95.20\n", + "9218 2023-09-15 95.55\n", + "9219 2023-09-18 95.95" ] }, - "execution_count": 15, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "c.get_dataset(\"oil\").package.get_resource(\"brent-daily\").to_pandas().tail(5)\n" + "c.get_dataset(\"oil\").package.get_resource(\"brent-daily\").to_pandas().tail(5)" ] } ], @@ -363,7 +363,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.2" + "version": "3.12.3" }, "orig_nbformat": 4 }, diff --git a/requirements.txt b/requirements.txt index c6297c1..1a332bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ frictionless[parquet,pandas] ipykernel +huggingface_hub