Merge pull request #5 from alexandrainst/download-images

Image caption dataset downloaded
alexandrainst · Sep 22, 2023 · 65fb46b · 65fb46b
2 parents 9dc8474 + 7253d0c
commit 65fb46b
Show file tree

Hide file tree

Showing 17 changed files with 902 additions and 30 deletions.
diff --git a/README.md b/README.md
@@ -6,7 +6,7 @@ ______________________________________________________________________
 [![Documentation](https://img.shields.io/badge/docs-passing-green)](https://alexandrainst.github.io/NordjyllandNews/nordjylland_news.html)
 [![License](https://img.shields.io/github/license/alexandrainst/NordjyllandNews)](https://github.com/alexandrainst/NordjyllandNews/blob/main/LICENSE)
 [![LastCommit](https://img.shields.io/github/last-commit/alexandrainst/NordjyllandNews)](https://github.com/alexandrainst/NordjyllandNews/commits/main)
-[![Code Coverage](https://img.shields.io/badge/Coverage-66%25-yellow.svg)](https://github.com/alexandrainst/NordjyllandNews/tree/main/tests)
+[![Code Coverage](https://img.shields.io/badge/Coverage-61%25-yellow.svg)](https://github.com/alexandrainst/NordjyllandNews/tree/main/tests)
 [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/alexandrainst/NordjyllandNews/blob/main/CODE_OF_CONDUCT.md)
 
 

diff --git a/config/config.yaml b/config/config.yaml
@@ -26,3 +26,6 @@ sleeps:
   long: 900
 
 testing: False
+timeout: 60
+n_requests_image: 2
+
diff --git a/figures/caption_length_distribution.png b/figures/caption_length_distribution.png
diff --git a/figures/image_size_distribution.png b/figures/image_size_distribution.png
diff --git a/figures/summary_length_distribution.png b/figures/summary_length_distribution.png
diff --git a/figures/text_length_distribution.png b/figures/text_length_distribution.png
diff --git a/notebooks/process_image_data.ipynb b/notebooks/process_image_data.ipynb
@@ -0,0 +1,294 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Prepare image caption dataset for HuggingFace"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Rename `image_caption.jsonl` to `metadata.json` and place it in the raw images/train folder together with the images."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Only keep features `file_name` and `caption`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_jsonl(path):\n",
+    "    with open(path, \"r\") as f:\n",
+    "        return [json.loads(line) for line in f]\n",
+    "\n",
+    "\n",
+    "def write_jsonl(path, data):\n",
+    "    with open(path, \"w\") as f:\n",
+    "        for line in data:\n",
+    "            json.dump(line, f)\n",
+    "            f.write(\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "RAW_PATH = \"../../../../mnt/data_6tb/oliver/NordjyllandNews/data/raw/images/train\"\n",
+    "PROCESSED_PATH = \"../../../../mnt/data_6tb/oliver/NordjyllandNews/data/processed/images/train\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "raw_meta_data_path = f\"{RAW_PATH}/metadata.jsonl\"\n",
+    "\n",
+    "data = read_jsonl(raw_meta_data_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "processed_data = []\n",
+    "keys_to_keep = [\"file_name\", \"caption\"]\n",
+    "for d in data:\n",
+    "    processed_data.append({k: d[k] for k in keys_to_keep})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "processed_meta_data_path = f\"{PROCESSED_PATH}/metadata.jsonl\"\n",
+    "\n",
+    "write_jsonl(processed_meta_data_path, processed_data)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Dataset statistics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "\n",
+    "sns.set_style(\"whitegrid\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = read_jsonl(processed_meta_data_path)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Caption Length Distribution"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "caption_lenghts = [len(d[\"caption\"]) for d in data]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.figure(figsize=(10, 5))\n",
+    "sns.histplot(caption_lenghts, bins=100)\n",
+    "plt.title(\"Caption Length Distribution\")\n",
+    "plt.ylabel(\"Frequency\")\n",
+    "plt.xlabel(\"Number of characters in caption\")\n",
+    "plt.savefig(\"../figures/caption_length_distribution.png\")\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Number of samples"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(caption_lenghts)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Image resolutions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import cv2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "smallest_height = float(\"inf\")\n",
+    "smallest_width = float(\"inf\")\n",
+    "smallest_channels = float(\"inf\")\n",
+    "\n",
+    "largest_height = 0\n",
+    "largest_width = 0\n",
+    "largest_channels = 0\n",
+    "pixel_counts = []\n",
+    "n_samples = len(data)\n",
+    "for i, d in enumerate(data):\n",
+    "    img = cv2.imread(f\"{PROCESSED_PATH}/{d['file_name']}\")\n",
+    "    height, width, channels = img.shape\n",
+    "    pixel_count = height * width * channels\n",
+    "    pixel_counts.append(pixel_count)\n",
+    "    if height < smallest_height:\n",
+    "        smallest_height = height\n",
+    "    if width < smallest_width:\n",
+    "        smallest_width = width\n",
+    "    if channels < smallest_channels:\n",
+    "        smallest_channels = channels\n",
+    "    \n",
+    "    if height > largest_height:\n",
+    "        largest_height = height\n",
+    "    if width > largest_width:\n",
+    "        largest_width = width\n",
+    "    if channels > largest_channels:\n",
+    "        largest_channels = channels\n",
+    "\n",
+    "    if not i % 200:\n",
+    "        print(f\"{i}/{n_samples}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"Height: [{smallest_height}, {largest_height}]\")\n",
+    "print(f\"Width: [{smallest_width}, {largest_width}]\")\n",
+    "print(f\"Channels: [{smallest_channels}, {largest_channels}]\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pixel_counts_sqrt = [int(np.sqrt(p / 3)) for p in pixel_counts] # Divide by 3 because of 3 channels\n",
+    "\n",
+    "plt.figure(figsize=(10, 5))\n",
+    "sns.histplot(pixel_counts_sqrt, bins=50)\n",
+    "plt.title(\"Image size distribution\")\n",
+    "plt.ylabel(\"Frequency\")\n",
+    "plt.xlabel(\"x\")\n",
+    "\n",
+    "plt.ticklabel_format(style='plain', axis='x')\n",
+    "plt.xticks(range(0, 10000 + 100, 1000), rotation=45)\n",
+    "\n",
+    "\n",
+    "\n",
+    "plt.savefig(\"../figures/image_size_distribution.png\", bbox_inches='tight')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}