Skip to content

Commit

Permalink
Merge pull request #3 from ekoepplin/addmakefile
Browse files Browse the repository at this point in the history
add makefile
  • Loading branch information
ekoepplin authored Nov 13, 2024
2 parents 9ead18d + 17674ec commit da186b4
Show file tree
Hide file tree
Showing 5 changed files with 190 additions and 17 deletions.
62 changes: 62 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Every target in this Makefile names a command rather than a file it builds,
# so all of them are declared phony. Without this, a file or directory called
# e.g. `debug` or `help` in the project root would make the target look
# "up to date" and its recipe would be skipped.
.PHONY: install run test full-refresh debug install-lint lint fix clean help

# Python executable (override on the command line: `make run PYTHON=python3.11`)
PYTHON := python3

# Main script executed by the run/test/full-refresh/debug targets
SCRIPT := newsapi_pipeline.py

# Install dependencies
# Runs `pipenv install` with no arguments in the current directory.
install:
	pipenv install

# Run the pipeline normally
# Invokes $(SCRIPT) with $(PYTHON) and no extra command-line flags.
run:
	$(PYTHON) $(SCRIPT)

# Run in test mode with DuckDB
# Passes the script's `--test` flag; this starts the pipeline itself,
# it does not run a unit-test suite.
test:
	$(PYTHON) $(SCRIPT) --test

# Run with full refresh
# Passes the script's `--full-refresh` flag.
full-refresh:
	$(PYTHON) $(SCRIPT) --full-refresh

# Run with debug logging
# Passes `--log-level DEBUG` to the script.
debug:
	$(PYTHON) $(SCRIPT) --log-level DEBUG

# Install linting tools
# Installs black, flake8, isort and autopep8 through pipenv; this is a
# prerequisite of both `lint` and `fix`, so it runs before each of them.
install-lint:
	pipenv install black flake8 isort autopep8

# Check code style without fixing
# Runs black and isort in check mode and flake8 over the current directory.
# Each tool is a separate recipe line, so the first one that exits non-zero
# stops the target and the remaining tools are not run.
lint: install-lint
	black --check .
	flake8 .
	isort --check .

# Automatically fix code style issues
# Rewrites files in place: black and isort over the whole tree, then autopep8
# over the top-level *.py files only (the glob is expanded by the shell).
# NOTE(review): autopep8 runs last with a 79-column limit, so it may re-wrap
# lines that black has just formatted and cause `black --check` in `lint` to
# fail afterwards -- confirm the intended tool order / line length.
fix: install-lint
	black .
	isort .
	autopep8 --in-place --aggressive --aggressive --max-line-length 79 *.py

# Clean generated files
# Removes the .dlt and newsapi_data directories, every __pycache__ directory
# below the current directory, and every *.pyc file.
clean:
	rm -rf .dlt newsapi_data
	find . -type d -name "__pycache__" -exec rm -r {} +
	find . -type f -name "*.pyc" -delete

# Help command
# Prints a one-line summary of each target; `@` keeps make from echoing the
# echo commands themselves. Keep this list in sync with the targets above.
help:
	@echo "Available commands:"
	@echo " make install - Install dependencies"
	@echo " make run - Run the pipeline"
	@echo " make test - Run in test mode with DuckDB"
	@echo " make full-refresh - Run with full refresh"
	@echo " make debug - Run with debug logging"
	@echo " make install-lint - Install linting tools"
	@echo " make lint - Run code formatters and linters"
	@echo " make clean - Clean generated files"
	@echo " make fix - Automatically fix code style issues"
4 changes: 4 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ urllib3 = "==2.2.3"
wrapt = "==1.16.0"
yarl = "==1.15.5"
dlt = {extras = ["parquet"], version = "*"}
black = "*"
flake8 = "*"
isort = "*"
autopep8 = "*"

[requires]
python_version = "3.11"
Expand Down
109 changes: 108 additions & 1 deletion Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 7 additions & 7 deletions newsapi_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
import dlt
from newsapi.newsapi_client import NewsApiClient
from datetime import datetime, timedelta
import json
from loguru import logger # Import Loguru
import argparse
import json
from datetime import datetime, timedelta
from pathlib import Path

import dlt
from loguru import logger # Import Loguru
from newsapi.newsapi_client import NewsApiClient

# Get today's date and calculate the date range for a 24-hour period
today = datetime.utcnow().date()
Expand Down Expand Up @@ -145,8 +145,8 @@ def run_pipeline(destination="filesystem", full_refresh=False):
)

load_info = pipeline.run(
run_all_articles(), write_disposition="replace" if full_refresh else "append"
)
run_all_articles(),
write_disposition="replace" if full_refresh else "append")

logger.info(f"Load info: {load_info}")
logger.success(f"All data processed and uploaded to {destination}")
Expand Down
18 changes: 9 additions & 9 deletions notebooks/eda-newsapi.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -12,27 +12,27 @@
"from datetime import datetime, timedelta\n",
"import os\n",
"\n",
"news_api_key = os.environ['NEWS_API_KEY']\n",
"news_api_key = os.environ[\"NEWS_API_KEY\"]\n",
"\n",
"\n",
"@dlt.resource(table_name=\"articles_us_en\")\n",
"def get_articles_us_en(api_key=news_api_key):\n",
" newsapi = NewsApiClient(api_key=api_key)\n",
" articles = newsapi.get_everything(language=\"en\", q=\"United States\", sort_by=\"publishedAt\")\n",
" articles = newsapi.get_everything(\n",
" language=\"en\", q=\"United States\", sort_by=\"publishedAt\"\n",
" )\n",
" for article in articles[\"articles\"]:\n",
" yield article\n",
"\n",
"\n",
"# Create and run the pipeline\n",
"pipeline = dlt.pipeline(\n",
" pipeline_name=\"newsapi_articles\",\n",
" destination=\"athena\",\n",
" dataset_name=\"newsapi_data\"\n",
" pipeline_name=\"newsapi_articles\", destination=\"athena\", dataset_name=\"newsapi_data\"\n",
")\n",
"\n",
"# Run all resources\n",
"load_info = pipeline.run([\n",
" get_articles_us_en # Fetching articles for the UK\n",
"])\n",
"print(load_info)\n"
"load_info = pipeline.run([get_articles_us_en]) # Fetching articles for the US\n",
"print(load_info)"
]
},
{
Expand Down

0 comments on commit da186b4

Please sign in to comment.