Skip to content

Commit

Permalink
Merge pull request #5 from alexandrainst/download-images
Browse files Browse the repository at this point in the history
Image caption dataset downloaded
  • Loading branch information
oliverkinch authored Sep 22, 2023
2 parents 9dc8474 + 7253d0c commit 65fb46b
Show file tree
Hide file tree
Showing 17 changed files with 902 additions and 30 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ ______________________________________________________________________
[![Documentation](https://img.shields.io/badge/docs-passing-green)](https://alexandrainst.github.io/NordjyllandNews/nordjylland_news.html)
[![License](https://img.shields.io/github/license/alexandrainst/NordjyllandNews)](https://github.com/alexandrainst/NordjyllandNews/blob/main/LICENSE)
[![LastCommit](https://img.shields.io/github/last-commit/alexandrainst/NordjyllandNews)](https://github.com/alexandrainst/NordjyllandNews/commits/main)
[![Code Coverage](https://img.shields.io/badge/Coverage-66%25-yellow.svg)](https://github.com/alexandrainst/NordjyllandNews/tree/main/tests)
[![Code Coverage](https://img.shields.io/badge/Coverage-61%25-yellow.svg)](https://github.com/alexandrainst/NordjyllandNews/tree/main/tests)
[![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/alexandrainst/NordjyllandNews/blob/main/CODE_OF_CONDUCT.md)


Expand Down
3 changes: 3 additions & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ sleeps:
long: 900

testing: False
timeout: 60
n_requests_image: 2

Binary file added figures/caption_length_distribution.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added figures/image_size_distribution.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added figures/summary_length_distribution.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added figures/text_length_distribution.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
294 changes: 294 additions & 0 deletions notebooks/process_image_data.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,294 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Prepare image caption dataset for HuggingFace"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Rename `image_caption.jsonl` to `metadata.json` and place it in the raw images/train folder together with the images."
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Only keep features `file_name` and `caption`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def read_jsonl(path):\n",
" with open(path, \"r\") as f:\n",
" return [json.loads(line) for line in f]\n",
"\n",
"\n",
"def write_jsonl(path, data):\n",
" with open(path, \"w\") as f:\n",
" for line in data:\n",
" json.dump(line, f)\n",
" f.write(\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"RAW_PATH = \"../../../../mnt/data_6tb/oliver/NordjyllandNews/data/raw/images/train\"\n",
"PROCESSED_PATH = \"../../../../mnt/data_6tb/oliver/NordjyllandNews/data/processed/images/train\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"raw_meta_data_path = f\"{RAW_PATH}/metadata.jsonl\"\n",
"\n",
"data = read_jsonl(raw_meta_data_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"processed_data = []\n",
"keys_to_keep = [\"file_name\", \"caption\"]\n",
"for d in data:\n",
" processed_data.append({k: d[k] for k in keys_to_keep})"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"processed_meta_data_path = f\"{PROCESSED_PATH}/metadata.jsonl\"\n",
"\n",
"write_jsonl(processed_meta_data_path, processed_data)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Dataset statistics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"sns.set_style(\"whitegrid\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data = read_jsonl(processed_meta_data_path)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Caption Length Distribution"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"caption_lenghts = [len(d[\"caption\"]) for d in data]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(10, 5))\n",
"sns.histplot(caption_lenghts, bins=100)\n",
"plt.title(\"Caption Length Distribution\")\n",
"plt.ylabel(\"Frequency\")\n",
"plt.xlabel(\"Number of characters in caption\")\n",
"plt.savefig(\"../figures/caption_length_distribution.png\")\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Number of samples"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(caption_lenghts)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Image resolutions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import cv2"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"smallest_height = float(\"inf\")\n",
"smallest_width = float(\"inf\")\n",
"smallest_channels = float(\"inf\")\n",
"\n",
"largest_height = 0\n",
"largest_width = 0\n",
"largest_channels = 0\n",
"pixel_counts = []\n",
"n_samples = len(data)\n",
"for i, d in enumerate(data):\n",
" img = cv2.imread(f\"{PROCESSED_PATH}/{d['file_name']}\")\n",
" height, width, channels = img.shape\n",
" pixel_count = height * width * channels\n",
" pixel_counts.append(pixel_count)\n",
" if height < smallest_height:\n",
" smallest_height = height\n",
" if width < smallest_width:\n",
" smallest_width = width\n",
" if channels < smallest_channels:\n",
" smallest_channels = channels\n",
" \n",
" if height > largest_height:\n",
" largest_height = height\n",
" if width > largest_width:\n",
" largest_width = width\n",
" if channels > largest_channels:\n",
" largest_channels = channels\n",
"\n",
" if not i % 200:\n",
" print(f\"{i}/{n_samples}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"Height: [{smallest_height}, {largest_height}]\")\n",
"print(f\"Width: [{smallest_width}, {largest_width}]\")\n",
"print(f\"Channels: [{smallest_channels}, {largest_channels}]\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pixel_counts_sqrt = [int(np.sqrt(p / 3)) for p in pixel_counts] # Divide by 3 because of 3 channels\n",
"\n",
"plt.figure(figsize=(10, 5))\n",
"sns.histplot(pixel_counts_sqrt, bins=50)\n",
"plt.title(\"Image size distribution\")\n",
"plt.ylabel(\"Frequency\")\n",
"plt.xlabel(\"x\")\n",
"\n",
"plt.ticklabel_format(style='plain', axis='x')\n",
"plt.xticks(range(0, 10000 + 100, 1000), rotation=45)\n",
"\n",
"\n",
"\n",
"plt.savefig(\"../figures/image_size_distribution.png\", bbox_inches='tight')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 65fb46b

Please sign in to comment.