From d97634c40be4b33a67f1b7b38d978a8f914eb650 Mon Sep 17 00:00:00 2001 From: baniasbaabe Date: Sun, 4 Feb 2024 12:22:14 +0100 Subject: [PATCH] Add parquet vs csv --- book/pandas/Chapter.ipynb | 48 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/book/pandas/Chapter.ipynb b/book/pandas/Chapter.ipynb index 4fead7e..aabb881 100644 --- a/book/pandas/Chapter.ipynb +++ b/book/pandas/Chapter.ipynb @@ -189,6 +189,54 @@ "data = {'Value': [1.2343129, 5.8956701, 6.224289]}\n", "df = pd.DataFrame(data)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Faster I/O with Parquet" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Whenever you work with bigger datasets, please avoid using the CSV format (or similar text formats).\n", + "\n", + "CSV files are human-readable text files, which makes them a popular option for storing data.\n", + "\n", + "For small datasets, using CSV is not a big issue.\n", + "\n", + "But what if your data has millions of rows?\n", + "\n", + "Read/write operations on such files can get really slow.\n", + "\n", + "On the other hand, binary files exist too.\n", + "\n", + "They consist of 0s and 1s and are not meant to be human-readable but to be used by programs that know how to interpret them.\n", + "\n", + "Because of that, binary files are more compact and consume less space.\n", + "\n", + "Parquet is a popular binary file format that is more storage-efficient than CSV and faster to read and write." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Shape: (100000000, 5)\n", + "df = pd.DataFrame(...)\n", + "\n", + "# Time: 1m 58s\n", + "df.to_csv(\"data.csv\")\n", + "\n", + "# Time: 8s\n", + "df.to_parquet(\"data.parquet\")" + ] } ], "metadata": {