Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add daily dumps of DB main tables as JSONL files #282

Merged
merged 1 commit into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,12 @@ class Settings(BaseSettings):
sentry_dns: str | None = None
log_level: LoggingLevel = LoggingLevel.INFO
images_dir: Path = STATIC_DIR / "img"
data_dir: Path = STATIC_DIR / "data"
environment: Environment = Environment.org

model_config = SettingsConfigDict(env_file=".env", extra="ignore")


# Load the settings once at import time and make sure every directory the
# application writes to exists before anything tries to use it.
settings = Settings()
for _required_dir in (settings.images_dir, settings.data_dir):
    _required_dir.mkdir(parents=True, exist_ok=True)
25 changes: 24 additions & 1 deletion app/scheduler.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import datetime
import shutil
from pathlib import Path

from apscheduler.executors.pool import ThreadPoolExecutor
from apscheduler.jobstores.memory import MemoryJobStore
from apscheduler.schedulers.blocking import BlockingScheduler
from openfoodfacts import Flavor
from openfoodfacts.utils import get_logger

from app.config import settings
from app.db import session
from app.tasks import import_product_db
from app.tasks import dump_db, import_product_db

logger = get_logger(__name__)

Expand All @@ -19,11 +24,29 @@ def import_product_db_job() -> None:
import_product_db(db=db, flavor=flavor)


def dump_db_job() -> None:
    """Dump the database as JSONL files to the data directory.

    The dump is first written to a private temporary directory, then each
    resulting file is published to ``settings.data_dir``, replacing any
    previous dump with the same name. The temporary directory is removed
    when the job ends, whether it succeeded or failed.
    """
    # Local import: only this job needs it.
    import tempfile

    # TemporaryDirectory gives an unpredictable, owner-only path and cleans
    # it up (with any partial dump it contains) even if an exception is raised.
    with tempfile.TemporaryDirectory(prefix="dump-") as tmp_name:
        tmp_dir = Path(tmp_name).resolve()

        with session() as db:
            dump_db(db, tmp_dir)

        for file_path in tmp_dir.iterdir():
            final_path = settings.data_dir / file_path.name
            # Stage the file next to its destination first: the move from the
            # temporary directory may be a slow cross-filesystem copy, while
            # the final rename within one directory is atomic, so readers
            # never observe a partially written dump.
            staging_path = final_path.with_name(final_path.name + ".tmp")
            logger.info(f"Moving {file_path} to {settings.data_dir}")
            shutil.move(file_path, staging_path)
            staging_path.replace(final_path)


def run() -> None:
    """Set up the blocking scheduler with its daily cron jobs and start it.

    Jobs run on a 20-thread pool and are kept in an in-memory job store.
    ``scheduler.start()`` is called last, after all jobs are registered.
    """
    scheduler = BlockingScheduler()
    scheduler.add_executor(ThreadPoolExecutor(20))
    scheduler.add_jobstore(MemoryJobStore())

    # (job function, hour of the day at which the cron trigger fires)
    daily_jobs = (
        (import_product_db_job, 10),
        (dump_db_job, 23),
    )
    for job_func, job_hour in daily_jobs:
        scheduler.add_job(
            job_func, "cron", max_instances=1, hour=job_hour, minute=0, jitter=60
        )

    scheduler.start()
24 changes: 22 additions & 2 deletions app/tasks.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
import datetime
import gzip
from pathlib import Path

import tqdm
from openfoodfacts import DatasetType, Flavor, ProductDataset
Expand All @@ -7,8 +9,8 @@
from sqlalchemy import or_, select, update
from sqlalchemy.orm import Session

from app import crud
from app.models import Price, Product, Proof
from app import crud, schemas
from app.models import Location, Price, Product, Proof
from app.schemas import LocationCreate, ProductCreate, UserCreate
from app.utils import (
OFF_FIELDS,
Expand Down Expand Up @@ -200,3 +202,21 @@ def create_price_location(db: Session, price: Price) -> None:
else:
# Increment the price count of the location
crud.increment_location_price_count(db, location=db_location)


def dump_db(db: Session, output_dir: Path) -> None:
    """Dump the database to gzipped JSONL files.

    One ``<table_name>.jsonl.gz`` file is written to ``output_dir`` for each
    of the prices, proofs and locations tables. Each line of a file is one
    row, serialized to JSON through the matching ``*Full`` schema.

    :param db: the database session used to read the rows
    :param output_dir: directory receiving the dump files, created (with
        its parents) if it does not exist yet
    """
    logger.info("Creating dumps of the database")
    output_dir.mkdir(parents=True, exist_ok=True)

    for table_name, model_cls, schema_cls in (
        ("prices", Price, schemas.PriceFull),
        ("proofs", Proof, schemas.ProofFull),
        ("locations", Location, schemas.LocationFull),
    ):
        logger.info(f"Dumping {table_name}")
        output_path = output_dir / f"{table_name}.jsonl.gz"
        # Force UTF-8: without an explicit encoding, text mode falls back to
        # the locale's preferred encoding, which may not be able to represent
        # non-ASCII values and would make the dump depend on the host setup.
        with gzip.open(output_path, "wt", encoding="utf-8") as f:
            for (item,) in tqdm.tqdm(db.execute(select(model_cls)), desc=table_name):
                f.write(schema_cls(**item.__dict__).model_dump_json())
                f.write("\n")
7 changes: 6 additions & 1 deletion static/es/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,12 @@ <h4 class="emphasized-title" id="how-can-i-use-the-data">¿Cómo puedo utilizar
<p>Los datos están disponibles bajo la
<a href="https://opendatacommons.org/licenses/odbl/1.0/">Licencia Open Database</a>, lo que significa que se pueden utilizar para cualquier propósito, siempre y cuando acredites Open Prices y compartas cualquier modificación que realices en el conjunto de datos.
<br/><br/>
La API REST proporciona una forma de acceder fácilmente a los datos. Próximamente también estará disponible una Data dump.</p>
La API REST proporciona una forma de acceder fácilmente a los datos.
Los datos también están disponibles en forma de un volcado JSONL (comprimido con gzip):
<a href="https://prices.openfoodfacts.org/data/prices.jsonl.gz">precios</a>,
<a href="https://prices.openfoodfacts.org/data/proofs.jsonl.gz">pruebas</a>,
y <a href="https://prices.openfoodfacts.org/data/locations.jsonl.gz">ubicaciones</a>.
</p>
</div>
</div>
<div class="row faq-row">
Expand Down
7 changes: 6 additions & 1 deletion static/fr/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,12 @@ <h4 class="emphasized-title" id="how-can-i-use-the-data">Comment
href="https://opendatacommons.org/licenses/odbl/1.0/">Licence
Open Database</a>, ce qui signifie qu'elles peuvent être utilisées à
des fins quelconques, à condition de créditer Open Prices et de partager toutes les modifications apportées au jeu de données.</p>
<p>L'API REST permet d'accéder facilement aux données. Un export (dump) des données sera également bientôt disponible.</p>
<p>L'API REST permet d'accéder facilement aux données.
Les données sont également disponibles sous forme d'un dump JSONL (gzippé) :
<a href="https://prices.openfoodfacts.org/data/prices.jsonl.gz">prix</a>,
<a href="https://prices.openfoodfacts.org/data/proofs.jsonl.gz">preuves</a>,
et <a href="https://prices.openfoodfacts.org/data/locations.jsonl.gz">lieux</a>.
</p>
</div>
</div>
<div class="row faq-row">
Expand Down
9 changes: 6 additions & 3 deletions static/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,12 @@ <h4 class="emphasized-title" id="how-can-i-use-the-data">How
for any purpose, as long as you credit Open Prices and
share
any modifications you make to the dataset.</p>
<p>The REST API provides a way to easily access the data. A
data
dump will also be made available soon.</p>
<p>The REST API provides a way to easily access the data. The
data is also available as 3 gzipped JSONL dumps: <a
href="https://prices.openfoodfacts.org/data/prices.jsonl.gz">prices</a>,
<a href="https://prices.openfoodfacts.org/data/proofs.jsonl.gz">proofs</a>,
and <a href="https://prices.openfoodfacts.org/data/locations.jsonl.gz">locations</a>.
</p>
</div>
</div>
<div class="row faq-row">
Expand Down