Skip to content

Commit

Permalink
Merge pull request #7 from okfde/develop
Browse files Browse the repository at this point in the history
2023 Updates
  • Loading branch information
simonwoerpel authored Nov 10, 2023
2 parents 4299e97 + 556c707 commit 0dc5814
Show file tree
Hide file tree
Showing 13 changed files with 167 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.5.0
current_version = 2023.11.10
commit = True
tag = True

Expand Down
1 change: 1 addition & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ jobs:
type=ref,event=branch
type=semver,pattern={{version}}
type=sha
type=raw,value=latest
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
with:
Expand Down
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
fs_logins.csv
.htpasswd
cache.db*
farmsubsidy.duckdb
data

Expand Down Expand Up @@ -130,3 +133,4 @@ dmypy.json

# Pyre type checker
.pyre/
farmsubsidy.code-workspace
21 changes: 19 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,38 @@ reports.

Below is a list of data updates since the relaunch of the platform on Dec. 1st, 2022.

## 2023-11-10

**2022 data added**

For most of the countries we added the most recent data for the last year, 2022.

This update marks a new iteration of the publicly available data: from now on, the years 2021 and 2022 are publicly available; all other years require a research account.

For some countries, we are still trying to get the data and will update them as soon as possible. Countries that are **not updated yet**:

- Poland
- Sweden
- Finland
- The Netherlands
- Italy
- Malta
- Romania

## 2023-01-19

- Added [EU NUTS levels](https://ec.europa.eu/eurostat/web/nuts/national-structures) data & aggregations.
- Fixed some data cleaning issues; this slightly changes some of the aggregated numbers on the platform.


## 2023-01-12

Improved deduping and changed handling of empty names:

- try to take a recipient id from the source, if any
- generate random identifiers for empty names

This slightly changes some of the aggregated numbers on the platform.


## 2022-12-14

Poland data for 2021 was scraped and added.
Expand Down
5 changes: 2 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ COPY VERSION /farmsubsidy/VERSION
COPY Makefile /farmsubsidy/Makefile

WORKDIR /farmsubsidy
RUN wget -O cache.db.gz https://cdn.investigativedata.org/farmsubsidy/cache.db.gz
RUN wget -O cache.db.gz https://s3.investigativedata.org/farmsubsidy/cache.db.gz
RUN gunzip cache.db.gz

RUN pip install -U pip setuptools
Expand All @@ -19,5 +19,4 @@ RUN pip install -e ".[geo]"
ENV DEBUG=0
ENV PARALLEL=-j`nproc`

# Run the green unicorn with 1 worker (scale via docker then)
CMD ["uvicorn", "farmsubsidy_store.api:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "8000"]
ENTRYPOINT ["gunicorn", "farmsubsidy_store.api:app", "--bind", "0.0.0.0:8000", "--worker-class", "uvicorn.workers.UvicornWorker"]
10 changes: 10 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@ download:
mkdir -p $(DATA_ROOT)/src/flat
wget -4 -P $(DATA_ROOT)/src/flat/ -r -l2 -H -nd -N -np -A "gz" -e robots=off $(DATA_DOMAIN)/Flat/

s3.up:
aws --endpoint-url https://s3.investigativedata.org s3 sync $(DATA_ROOT)/src/latest s3://farmsubsidy/latest
aws --endpoint-url https://s3.investigativedata.org s3 sync $(DATA_ROOT)/src/flat s3://farmsubsidy/Flat
aws --endpoint-url https://s3.investigativedata.org s3 sync $(DATA_ROOT)/cleaned s3://farmsubsidy/cleaned

s3.down:
aws --endpoint-url https://s3.investigativedata.org s3 sync s3://farmsubsidy/latest $(DATA_ROOT)/src/latest
aws --endpoint-url https://s3.investigativedata.org s3 sync s3://farmsubsidy/Flat $(DATA_ROOT)/src/flat
aws --endpoint-url https://s3.investigativedata.org s3 sync s3://farmsubsidy/cleaned $(DATA_ROOT)/cleaned

clean:
mkdir -p $(DATA_ROOT)/cleaned
parallel "fscli clean -i {} --ignore-errors | gzip > $(DATA_ROOT)/cleaned/{/.}.cleaned.csv.gz" ::: $(DATA_ROOT)/src/latest/*.csv.gz
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.5.0
2023.11.10
2 changes: 1 addition & 1 deletion farmsubsidy_store/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.5.0"
__version__ = "2023.11.10"
6 changes: 5 additions & 1 deletion farmsubsidy_store/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,11 @@
settings.ALLOWED_ORIGIN,
]
app = FastAPI(
title="Farmsubsidy.org API",
debug=settings.DEBUG,
version=settings.VERSION,
title=settings.API_TITLE,
contact=settings.API_CONTACT,
description=settings.API_DESCRIPTION,
redoc_url="/",
)
app.add_middleware(
Expand Down
3 changes: 3 additions & 0 deletions farmsubsidy_store/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def handle_error(log, e, do_raise, **kwargs):
"recipient_street2",
"recipient_location",
"recipient_postcode",
"recipient_county",
"recipient_country",
)

Expand Down Expand Up @@ -213,8 +214,10 @@ def make_entity_id(*parts) -> str:

def clean_recipient_id(row: pd.Series) -> str:
"""deduplicate recipients via generated id from country, name, address"""
# FIXME respect source ids if present?
fp = row["recipient_fingerprint"]
assert fp is not None and fp != "", dict(row)
# FIXME get rid of uuid!!
return make_entity_id(
row["recipient_country"],
row["recipient_fingerprint"],
Expand Down
11 changes: 10 additions & 1 deletion farmsubsidy_store/currency_rates.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"BGN|2019": 0.5134566041289047,
"BGN|2020": 0.5111195340593252,
"BGN|2021": 0.5137864317779282,
"BGN|2022": 0.515,
"CZK|2010": 0.03791335575336573,
"CZK|2011": 0.03989465160499548,
"CZK|2012": 0.038943858983306774,
Expand All @@ -23,6 +24,7 @@
"CZK|2019": 0.03885435830207158,
"CZK|2020": 0.039339046051480316,
"CZK|2021": 0.03831734039192274,
"CZK|2022": 0.04,
"DKK|2010": 0.13459503043890436,
"DKK|2011": 0.134156233443214,
"DKK|2012": 0.13453601442604265,
Expand All @@ -35,6 +37,7 @@
"DKK|2019": 0.13394317218137594,
"DKK|2020": 0.13384930038111884,
"DKK|2021": 0.1350228955013212,
"DKK|2022": 0.13,
"GBP|2010": 1.1278323067367169,
"GBP|2011": 1.1640740786868093,
"GBP|2012": 1.1987647423960273,
Expand All @@ -47,6 +50,7 @@
"GBP|2019": 1.1127682803022187,
"GBP|2020": 1.181214492918793,
"GBP|2021": 1.1248800164080126,
"GBP|2022": 1.16,
"HRK|2010": 0.1372081522839623,
"HRK|2011": 0.13482231159457528,
"HRK|2012": 0.13248306882762317,
Expand All @@ -59,11 +63,13 @@
"HRK|2019": 0.13500541577693187,
"HRK|2020": 0.1342584057232302,
"HRK|2021": 0.13313282842994464,
"HRK|2022": 0.133,
"HUF|2014": 0.00337,
"HUF|2015": 0.00318,
"HUF|2019": 0.00316,
"HUF|2020": 0.00302,
"HUF|2021": 0.00274,
"HUF|2022": 0.0025,
"LTL|2015": 0.290,
"LTL|2016": 0.2962000840716404,
"PLN|2010": 0.24472230411770488,
Expand All @@ -78,6 +84,7 @@
"PLN|2019": 0.23320645985577051,
"PLN|2020": 0.23498886150033615,
"PLN|2021": 0.21916126987002402,
"PLN|2022": 0.212,
"RON|2010": 0.23632290263267156,
"RON|2011": 0.23292352769482222,
"RON|2012": 0.2310683776151275,
Expand All @@ -90,6 +97,7 @@
"RON|2019": 0.2149778784829439,
"RON|2020": 0.2090018605861726,
"RON|2021": 0.20654289372599233,
"RON|2022": 0.202,
"SEK|2010": 0.09767301777858403,
"SEK|2011": 0.11143100052829193,
"SEK|2012": 0.11217364055074694,
Expand All @@ -101,5 +109,6 @@
"SEK|2018": 0.10175969251091803,
"SEK|2019": 0.09839547948689177,
"SEK|2020": 0.09536620931226081,
"SEK|2021": 0.09948629205167554
"SEK|2021": 0.09948629205167554,
"SEK|2022": 0.0923
}
17 changes: 17 additions & 0 deletions farmsubsidy_store/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,23 @@ def get_env(name, default=None):
# Lifetime of API tokens; falls back to 60 * 24 minutes (one day) when the
# API_TOKEN_LIFETIME environment variable is not set.
API_TOKEN_LIFETIME = int(get_env("API_TOKEN_LIFETIME", 60 * 24)) # in minutes
# NOTE(review): presumably switches on HTTPS-specific behaviour and is off
# unless API_HTTPS is set -- confirm against as_bool() and the callers.
API_HTTPS = as_bool(get_env("API_HTTPS"), False)

# Metadata for the API documentation: api/app.py hands API_TITLE, API_CONTACT
# and API_DESCRIPTION to FastAPI(...) as title, contact and description.
# Title and contact fields can be overridden through the environment.
API_TITLE = get_env("API_TITLE", "Farmsubsidy.org API")
API_CONTACT = {
"name": get_env("API_CONTACT_NAME", "Farmsubsidy.org"),
"url": get_env("API_CONTACT_URL", "https://farmsubsidy.org"),
"email": get_env("API_CONTACT_EMAIL", "[email protected]")
}
# Fixed description text (not configurable via the environment); it contains
# Markdown-style links.
API_DESCRIPTION = """
This api exposes detailed data relating to payments and recipients of farm
subsidies in every EU member state based on EU's Common Agricultural Policy
(CAP).
The data is stored in a [clickhouse](https://clickhouse.com/) instance and feeds
the public website [farmsubsidy.org](https://farmsubsidy.org)
[data parsing and api repo on github](https://github.com/okfde/farmsubsidy-store)
"""

# List of year strings parsed from the comma-separated API_PUBLIC_YEARS
# variable; presumably the years readable without a research account.
# NOTE(review): the 2023-11-10 changelog entry states that 2021 and 2022 are
# the public years, but the fallback here is still "2020,2021" -- confirm that
# deployments set API_PUBLIC_YEARS, or update this default.
PUBLIC_YEARS = get_env("API_PUBLIC_YEARS", "2020,2021").split(",")
# Export location; falls back to the "exports" folder below DATA_ROOT.
EXPORT_DIRECTORY = get_env("EXPORT_DIRECTORY", os.path.join(DATA_ROOT, "exports"))
# NOTE(review): presumably the URL path under which exports are served --
# verify against the code that builds export links.
EXPORT_PUBLIC_PATH = get_env("EXPORT_PUBLIC_PATH", "/exports")
Expand Down
93 changes: 93 additions & 0 deletions generate_logins.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "ab5ea961-0ab0-4cf5-bcb3-fc0c99af0055",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import shortuuid\n",
"import bcrypt"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9308c8d-4b57-4ca8-989d-f05fe3860ae3",
"metadata": {},
"outputs": [],
"source": [
"HTPASSWD = \".htpasswd\"\n",
"CLEARFILE = \"fs_logins.csv\"\n",
"PREFIX = \"farmsubsidy\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "acd0247c-afac-4de7-916d-e1efc9f3a9d0",
"metadata": {},
"outputs": [],
"source": [
"def generate_logins(start, end, prefix):\n",
" for i in range(start, end):\n",
" pw = shortuuid.uuid()[:12]\n",
" hashed = bcrypt.hashpw(pw.encode(), bcrypt.gensalt()).decode()\n",
" yield f\"{prefix}-{str(i).zfill(4)}\", hashed, pw"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a018fa38-86b2-4397-b176-d72d039210c8",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(generate_logins(0, 100, PREFIX), columns=(\"username\", \"hashed\", \"password\"))\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e64ac6f1-5148-472e-8824-0cc6644aa3b7",
"metadata": {},
"outputs": [],
"source": [
"df[[\"username\", \"password\"]].to_csv(CLEARFILE, index=False)\n",
"df[[\"username\", \"hashed\"]].to_csv(HTPASSWD, index=False, header=False, sep=\":\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c49f125a-f41d-413a-8315-ad4d11d05164",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit 0dc5814

Please sign in to comment.