From 55aeeed45112fbf2b4bdf4341392e830e0052b6f Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 29 Feb 2024 11:58:55 +0000 Subject: [PATCH 01/27] Prefactor: Fix import --- hq_superset/views.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hq_superset/views.py b/hq_superset/views.py index c2aa231..dee0564 100644 --- a/hq_superset/views.py +++ b/hq_superset/views.py @@ -6,11 +6,11 @@ from flask_appbuilder import expose from flask_appbuilder.security.decorators import has_access, permission_name from superset import db -from superset.commands.dataset.delete import DeleteDatasetCommand -from superset.commands.dataset.exceptions import ( +from superset.commands.dataset.delete import ( DatasetDeleteFailedError, DatasetForbiddenError, DatasetNotFoundError, + DeleteDatasetCommand, ) from superset.connectors.sqla.models import SqlaTable from superset.views.base import BaseSupersetView From c02e52850e64c0dfc632dfede7f1a20bb57ac4cb Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 29 Feb 2024 12:07:21 +0000 Subject: [PATCH 02/27] Prefactor: Raise DatabaseMissing --- hq_superset/exceptions.py | 4 ++++ hq_superset/utils.py | 13 +++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/hq_superset/exceptions.py b/hq_superset/exceptions.py index 338c258..3d0e37c 100644 --- a/hq_superset/exceptions.py +++ b/hq_superset/exceptions.py @@ -1,3 +1,7 @@ +class DatabaseMissing(Exception): + pass + + class HQAPIException(Exception): pass diff --git a/hq_superset/utils.py b/hq_superset/utils.py index c8b9037..c2df76d 100644 --- a/hq_superset/utils.py +++ b/hq_superset/utils.py @@ -7,6 +7,8 @@ import sqlalchemy from flask_login import current_user +from .exceptions import DatabaseMissing + DOMAIN_PREFIX = "hqdomain_" SESSION_USER_DOMAINS_KEY = "user_hq_domains" SESSION_OAUTH_RESPONSE_KEY = "oauth_response" @@ -30,8 +32,15 @@ def get_hq_database(): from superset import db from superset.models.core import Database - # Todo; get actual DB once that's implemented - return db.session.query(Database).filter_by(database_name=HQ_DB_CONNECTION_NAME).one() + try: + return ( + db.session + .query(Database) + .filter_by(database_name=HQ_DB_CONNECTION_NAME) + .one() + ) + except sqlalchemy.orm.exc.NoResultFound as err: + raise DatabaseMissing('CommCare HQ database missing') from err def get_schema_name_for_domain(domain): From 8a97e4e031143762edbc1cfb59d709f350ed5b97 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 29 Feb 2024 17:46:44 +0000 Subject: [PATCH 03/27] Prefactor: Tweaks to `refresh_hq_datasource()` --- hq_superset/services.py | 42 ++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/hq_superset/services.py b/hq_superset/services.py index a22b4b6..9fa649f 100644 --- a/hq_superset/services.py +++ b/hq_superset/services.py @@ -61,37 +61,39 @@ def refresh_hq_datasource( # See `CsvToDatabaseView.form_post()` in # https://github.com/apache/superset/blob/master/superset/views/database/views.py + def dataframe_to_sql(df, replace=False): + """ + Upload Pandas DataFrame ``df`` to ``database``. + """ + database.db_engine_spec.df_to_sql( + database, + csv_table, + df, + to_sql_kwargs={ + "if_exists": "replace" if replace else "append", + "dtype": sql_converters, + "index": False, + }, + ) + database = get_hq_database() schema = get_schema_name_for_domain(domain) csv_table = Table(table=datasource_id, schema=schema) column_dtypes, date_columns, array_columns = get_column_dtypes( datasource_defn ) - converters = { column_name: convert_to_array for column_name in array_columns } - # TODO: can we assume all array values will be of type TEXT? - sqlconverters = { + sql_converters = { + # Assumes all array values will be of type TEXT column_name: postgresql.ARRAY(sqlalchemy.types.TEXT) for column_name in array_columns } - def to_sql(df, replace=False): - database.db_engine_spec.df_to_sql( - database, - csv_table, - df, - to_sql_kwargs={ - "if_exists": "replace" if replace else "append", - "dtype": sqlconverters, - }, - ) - try: with get_datasource_file(file_path) as csv_file: - - _iter = pandas.read_csv( + dataframes = pandas.read_csv( chunksize=10000, filepath_or_buffer=csv_file, encoding="utf-8", @@ -103,11 +105,9 @@ def to_sql(df, replace=False): iterator=True, low_memory=True, ) - - to_sql(next(_iter), replace=True) - - for df in _iter: - to_sql(df, replace=False) + dataframe_to_sql(next(dataframes), replace=True) + for df in dataframes: + dataframe_to_sql(df, replace=False) sqla_table = ( db.session.query(SqlaTable) From c2acbf9a52b52ce4b5d8fd3d443aa1be9f7b9f5f Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 29 Feb 2024 12:11:36 +0000 Subject: [PATCH 04/27] Add DataSetChangeAPI --- hq_superset/__init__.py | 3 ++- hq_superset/api.py | 47 ++++++++++++++++++++++++++++++++++ hq_superset/hq_domain.py | 6 ++++- hq_superset/models.py | 34 +++++++++++++++++++++++++ hq_superset/utils.py | 54 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 142 insertions(+), 2 deletions(-) create mode 100644 hq_superset/api.py create mode 100644 hq_superset/models.py diff --git a/hq_superset/__init__.py b/hq_superset/__init__.py index 1af425a..ad0de89 100644 --- a/hq_superset/__init__.py +++ b/hq_superset/__init__.py @@ -9,10 +9,11 @@ def flask_app_mutator(app): # return from superset.extensions import appbuilder - from . import hq_domain, views + from . import api, hq_domain, views appbuilder.add_view(views.HQDatasourceView, 'Update HQ Datasource', menu_cond=lambda *_: False) appbuilder.add_view(views.SelectDomainView, 'Select a Domain', menu_cond=lambda *_: False) + appbuilder.add_api(api.DataSetChangeAPI) app.before_request_funcs.setdefault(None, []).append( hq_domain.before_request_hook ) diff --git a/hq_superset/api.py b/hq_superset/api.py new file mode 100644 index 0000000..3f5b702 --- /dev/null +++ b/hq_superset/api.py @@ -0,0 +1,47 @@ +import json +from http import HTTPStatus + +from flask import request +from flask_appbuilder.api import BaseApi +from flask_appbuilder.baseviews import expose +from superset.superset_typing import FlaskResponse +from superset.views.base import ( + handle_api_exception, + json_error_response, + json_success, +) + +from .models import DataSetChange + + +class DataSetChangeAPI(BaseApi): + """ + Accepts changes to datasets from CommCare HQ data forwarding + """ + + MAX_REQUEST_LENGTH = 10 * 1024 * 1024 # reject JSON requests > 10MB + + def __init__(self): + self.route_base = '/hq_webhook' + self.default_view = 'post_dataset_change' + super().__init__() + + @expose('/change/', methods=('POST',)) + @handle_api_exception + def post_dataset_change(self) -> FlaskResponse: + if request.content_length > self.MAX_REQUEST_LENGTH: + return json_error_response( + HTTPStatus.REQUEST_ENTITY_TOO_LARGE.description, + status=HTTPStatus.REQUEST_ENTITY_TOO_LARGE.value, + ) + + try: + request_json = json.loads(request.get_data(as_text=True)) + change = DataSetChange(**request_json) + change.update_dataset() + return json_success('Dataset updated') + except json.JSONDecodeError: + return json_error_response( + 'Invalid JSON syntax', + status=HTTPStatus.BAD_REQUEST.value, + ) diff --git a/hq_superset/hq_domain.py b/hq_superset/hq_domain.py index 6fc5a12..d9cc02b 100644 --- a/hq_superset/hq_domain.py +++ b/hq_superset/hq_domain.py @@ -26,6 +26,7 @@ def after_request_hook(response): "AuthDBView.login", "SelectDomainView.list", "SelectDomainView.select", + "DataSetChangeAPI.post_dataset_change", "appbuilder.static", "static", ] @@ -39,7 +40,10 @@ def is_user_admin(): def ensure_domain_selected(): # Check if a hq_domain cookie is set # Ensure necessary roles, permissions and DB schemas are created for the domain - if is_user_admin() or (request.url_rule and request.url_rule.endpoint in DOMAIN_EXCLUDED_VIEWS): + if is_user_admin() or ( + request.url_rule + and request.url_rule.endpoint in DOMAIN_EXCLUDED_VIEWS + ): return hq_domain = request.cookies.get('hq_domain') valid_domains = user_domains() diff --git a/hq_superset/models.py b/hq_superset/models.py new file mode 100644 index 0000000..425420b --- /dev/null +++ b/hq_superset/models.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass +from typing import Any + +from .utils import cast_data_for_table, get_hq_database + + +@dataclass +class DataSetChange: + data_source_id: str + doc_id: str + data: list[dict[str, Any]] + + def update_dataset(self): + database = get_hq_database() + try: + sqla_table = next(( + table for table in database.tables + if table.table_name == self.data_source_id + )) + except StopIteration: + raise ValueError(f'{self.data_source_id} table not found.') + table = sqla_table.get_sqla_table_object() + + with ( + database.get_sqla_engine_with_context() as engine, + engine.connect() as connection, + connection.begin() # Commit on leaving context + ): + delete_stmt = table.delete().where(table.c.doc_id == self.doc_id) + connection.execute(delete_stmt) + if self.data: + rows = list(cast_data_for_table(self.data, table)) + insert_stmt = table.insert().values(rows) + connection.execute(insert_stmt) diff --git a/hq_superset/utils.py b/hq_superset/utils.py index c2df76d..5fdafbc 100644 --- a/hq_superset/utils.py +++ b/hq_superset/utils.py @@ -1,11 +1,15 @@ import ast +import sys from contextlib import contextmanager from datetime import date, datetime +from functools import partial +from typing import Any, Generator from zipfile import ZipFile import pandas import sqlalchemy from flask_login import current_user +from sqlalchemy.sql import TableClause from .exceptions import DatabaseMissing @@ -198,3 +202,53 @@ def array_is_falsy(array_values): return [] return array_values + + +def js_to_py_datetime(jsdt, preserve_tz=True): + """ + JavaScript UTC datetimes end in "Z". In Python < 3.11, + ``datetime.isoformat()`` doesn't like it, and raises + "ValueError: Invalid isoformat string" + + >>> jsdt = '2024-02-24T14:01:25.397469Z' + >>> js_to_py_datetime(jsdt) + datetime.datetime(2024, 2, 24, 14, 1, 25, 397469, tzinfo=datetime.timezone.utc) + >>> js_to_py_datetime(jsdt, preserve_tz=False) + datetime.datetime(2024, 2, 24, 14, 1, 25, 397469) + + """ + if preserve_tz: + if sys.version_info >= (3, 11): + return datetime.fromisoformat(jsdt) + pydt = jsdt.replace('Z', '+00:00') + else: + pydt = jsdt.replace('Z', '') + return datetime.fromisoformat(pydt) + + +def cast_data_for_table( + data: list[dict[str, Any]], + table: TableClause, +) -> Generator[dict[str, Any], None, None]: + """ + Returns ``data`` with values cast in the correct data types for + the columns of ``table``. + """ + cast_functions = { + # 'BIGINT': int, + # 'TEXT': str, + 'TIMESTAMP': partial(js_to_py_datetime, preserve_tz=False), + # TODO: What else? + } + + column_types = {c.name: str(c.type) for c in table.columns} + for row in data: + cast_row = {} + for column, value in row.items(): + type_name = column_types[column] + if type_name in cast_functions: + cast_func = cast_functions[type_name] + cast_row[column] = cast_func(value) + else: + cast_row[column] = value + yield cast_row From 9765307fa67a30eff9c24375728b5b581e20bbe0 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 29 Feb 2024 18:15:02 +0000 Subject: [PATCH 05/27] HQRequest helper class --- hq_superset/hq_requests.py | 30 ++++++++++++++++++++++++++++++ hq_superset/hq_url.py | 25 +++++++++++++++++++++++++ hq_superset/services.py | 12 +++++++----- hq_superset/tests/test_views.py | 22 +++++++++++++++++----- hq_superset/utils.py | 12 ------------ hq_superset/views.py | 30 +++++++++++++++++++++--------- 6 files changed, 100 insertions(+), 31 deletions(-) create mode 100644 hq_superset/hq_requests.py create mode 100644 hq_superset/hq_url.py diff --git a/hq_superset/hq_requests.py b/hq_superset/hq_requests.py new file mode 100644 index 0000000..26392c8 --- /dev/null +++ b/hq_superset/hq_requests.py @@ -0,0 +1,30 @@ +import superset +from hq_superset.oauth import get_valid_cchq_oauth_token + + +class HQRequest: + + def __init__(self, url): + self.url = url + + @property + def oauth_token(self): + return get_valid_cchq_oauth_token() + + @property + def commcare_provider(self): + return superset.appbuilder.sm.oauth_remotes["commcare"] + + @property + def api_base_url(self): + return self.commcare_provider.api_base_url + + @property + def absolute_url(self): + return f"{self.api_base_url}{self.url}" + + def get(self): + return self.commcare_provider.get(self.url, token=self.oauth_token) + + def post(self, data): + return self.commcare_provider.post(self.url, data=data, token=self.oauth_token) diff --git a/hq_superset/hq_url.py b/hq_superset/hq_url.py new file mode 100644 index 0000000..aca7e58 --- /dev/null +++ b/hq_superset/hq_url.py @@ -0,0 +1,25 @@ +""" +Functions that return URLs on CommCare HQ +""" + + +def datasource_details(domain, datasource_id): + return f"a/{domain}/api/v0.5/ucr_data_source/{datasource_id}/" + + +def datasource_list(domain): + return f"a/{domain}/api/v0.5/ucr_data_source/" + + +def datasource_export(domain, datasource_id): + return ( + f"a/{domain}/configurable_reports/data_sources/export/{datasource_id}/" + "?format=csv" + ) + + +def datasource_subscribe(domain, datasource_id): + return ( + f"a/{domain}/configurable_reports/data_sources/subscribe/" + f"{datasource_id}/" + ) diff --git a/hq_superset/services.py b/hq_superset/services.py index 9fa649f..ab4e887 100644 --- a/hq_superset/services.py +++ b/hq_superset/services.py @@ -11,11 +11,11 @@ from superset.extensions import cache_manager from superset.sql_parse import Table +from .hq_requests import HQRequest +from .hq_url import datasource_details, datasource_export, datasource_subscribe from .utils import ( convert_to_array, get_column_dtypes, - get_datasource_details_url, - get_datasource_export_url, get_datasource_file, get_hq_database, get_schema_name_for_domain, @@ -35,12 +35,14 @@ def download_datasource(provider, oauth_token, domain, datasource_id): with open(path, "wb") as f: f.write(response.content) + subscribe_to_hq_datasource(domain, datasource_id) + return path, len(response.content) -def get_datasource_defn(provider, oauth_token, domain, datasource_id): - url = get_datasource_details_url(domain, datasource_id) - response = provider.get(url, token=oauth_token) +def get_datasource_defn(domain, datasource_id): + hq_request = HQRequest(url=datasource_details(domain, datasource_id)) + response = hq_request.get() if response.status_code != 200: raise HQAPIException("Error downloading the UCR definition from HQ") return response.json() diff --git a/hq_superset/tests/test_views.py b/hq_superset/tests/test_views.py index 97d8137..904374c 100644 --- a/hq_superset/tests/test_views.py +++ b/hq_superset/tests/test_views.py @@ -36,6 +36,7 @@ class UserMock(): def get_id(self): return self.user_id + class OAuthMock(): def __init__(self): @@ -106,6 +107,7 @@ def get(self, url, token): a3, 2021-11-22, 2022-01-19, 10, 2022-03-20, some_other_text2 """ + class TestViews(HQDBTestCase): def setUp(self): @@ -276,18 +278,28 @@ def _test_sync_or_async(ds_size, routing_method, user_id): None ) - @patch('hq_superset.views.get_valid_cchq_oauth_token', return_value={}) - def test_download_datasource(self, *args): - from hq_superset.services import download_datasource + @patch('hq_superset.hq_requests.get_valid_cchq_oauth_token', return_value={}) + @patch('hq_superset.services.subscribe_to_hq_datasource') + @patch('hq_superset.hq_requests.HQRequest.get') + def test_download_datasource(self, hq_request_get_mock, subscribe_mock, *args): + from hq_superset.services import download_and_subscribe_to_datasource + hq_request_get_mock.return_value = MockResponse( + json_data=TEST_UCR_CSV_V1, + status_code=200, + ) ucr_id = self.oauth_mock.test1_datasources['objects'][0]['id'] - path, size = download_datasource(self.oauth_mock, '_', 'test1', ucr_id) + path, size = download_and_subscribe_to_datasource('test1', ucr_id) + subscribe_mock.assert_called_once_with( + 'test1', + ucr_id, + ) with open(path, 'rb') as f: self.assertEqual(pickle.load(f), TEST_UCR_CSV_V1) self.assertEqual(size, len(pickle.dumps(TEST_UCR_CSV_V1))) os.remove(path) - @patch('hq_superset.views.get_valid_cchq_oauth_token', return_value={}) + @patch('hq_superset.hq_requests.get_valid_cchq_oauth_token', return_value={}) def test_refresh_hq_datasource(self, *args): from hq_superset.services import refresh_hq_datasource diff --git a/hq_superset/utils.py b/hq_superset/utils.py index 5fdafbc..f40d6fc 100644 --- a/hq_superset/utils.py +++ b/hq_superset/utils.py @@ -19,18 +19,6 @@ HQ_DB_CONNECTION_NAME = "HQ Data" -def get_datasource_export_url(domain, datasource_id): - return f"a/{domain}/configurable_reports/data_sources/export/{datasource_id}/?format=csv" - - -def get_datasource_list_url(domain): - return f"a/{domain}/api/v0.5/ucr_data_source/" - - -def get_datasource_details_url(domain, datasource_id): - return f"a/{domain}/api/v0.5/ucr_data_source/{datasource_id}/" - - def get_hq_database(): # Todo; cache to avoid multiple lookups in single request from superset import db diff --git a/hq_superset/views.py b/hq_superset/views.py index dee0564..c231e91 100644 --- a/hq_superset/views.py +++ b/hq_superset/views.py @@ -1,6 +1,7 @@ import logging import os +import requests import superset from flask import Response, abort, flash, g, redirect, request, url_for from flask_appbuilder import expose @@ -17,6 +18,8 @@ from .hq_domain import user_domains from .oauth import get_valid_cchq_oauth_token +from .hq_url import datasource_list +from .hq_requests import HQRequest from .services import ( AsyncImportHelper, download_datasource, @@ -61,18 +64,27 @@ def create_or_update(self, datasource_id): @expose("/list/", methods=["GET"]) def list_hq_datasources(self): - datasource_list_url = get_datasource_list_url(g.hq_domain) - provider = superset.appbuilder.sm.oauth_remotes["commcare"] - oauth_token = get_valid_cchq_oauth_token() - response = provider.get(datasource_list_url, token=oauth_token) + hq_request = HQRequest(url=datasource_list(g.hq_domain)) + try: + response = hq_request.get() + except requests.exceptions.ConnectionError: + return Response( + "Unable to connect to CommCare HQ " + f"at {hq_request.absolute_url}", + status=400 + ) + if response.status_code == 403: return Response(status=403) if response.status_code != 200: - url = f"{provider.api_base_url}{datasource_list_url}" + try: + msg = response.json()['error'] + except: # pylint: disable=E722 + msg = '' return Response( - response="There was an error in fetching datasources from " - f"CommCare HQ at {url}", - status=400, + "There was an error in fetching datasources from CommCare HQ " + f"at {hq_request.absolute_url}: {response.status_code} {msg}", + status=400 ) hq_datasources = response.json() for ds in hq_datasources['objects']: @@ -83,7 +95,7 @@ def list_hq_datasources(self): "hq_datasource_list.html", hq_datasources=hq_datasources, ucr_id_to_pks=self._ucr_id_to_pks(), - hq_base_url=provider.api_base_url, + hq_base_url=hq_request.api_base_url ) @expose("/delete/", methods=["GET"]) From c05c4da4abbbff382a46ab9016631da649a422bc Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 29 Feb 2024 18:18:30 +0000 Subject: [PATCH 06/27] Add OAuth 2.0 server There is a lot happening in this commit. 1. Setting up the test database for datasets from HQ no longer needs a special function. The `tests.utils` module is renamed `tests.const` and the `UnitTestingOnly` exception is no longer needed. 2. The `oauth2_server` module is the heart of the implementation of an OAuth 2.0 server, based heavily on the sample implementation given at https://github.com/authlib/example-oauth2-server/ 3. The OAuth 2.0 server is exposed in `api.py`, and uses models defined in `models.py`. 4. The OAuth 2.0 server uses Fernet symmetric encryption so that OAuth 2.0 client secrets can be read for setting up syncing for multiple data sources. It uses a couple of utility functions defined in `utils.py`. --- hq_superset/__init__.py | 5 +- hq_superset/api.py | 28 +++++++- hq_superset/const.py | 4 ++ hq_superset/exceptions.py | 4 -- hq_superset/hq_domain.py | 1 + hq_superset/models.py | 83 +++++++++++++++++++++++- hq_superset/oauth2_server.py | 51 +++++++++++++++ hq_superset/services.py | 43 ++++++++++-- hq_superset/tasks.py | 3 - hq_superset/tests/base_test.py | 4 +- hq_superset/tests/config_for_tests.py | 9 ++- hq_superset/tests/{utils.py => const.py} | 33 ---------- hq_superset/tests/test_hq_domain.py | 4 +- hq_superset/tests/test_utils.py | 2 +- hq_superset/tests/test_views.py | 17 +++-- hq_superset/utils.py | 48 ++++++++++++-- hq_superset/views.py | 12 +--- superset_config.example.py | 29 ++++++++- 18 files changed, 299 insertions(+), 81 deletions(-) create mode 100644 hq_superset/const.py create mode 100644 hq_superset/oauth2_server.py rename hq_superset/tests/{utils.py => const.py} (74%) diff --git a/hq_superset/__init__.py b/hq_superset/__init__.py index ad0de89..11d6d1b 100644 --- a/hq_superset/__init__.py +++ b/hq_superset/__init__.py @@ -9,11 +9,14 @@ def flask_app_mutator(app): # return from superset.extensions import appbuilder - from . import api, hq_domain, views + from . import api, hq_domain, oauth2_server, views appbuilder.add_view(views.HQDatasourceView, 'Update HQ Datasource', menu_cond=lambda *_: False) appbuilder.add_view(views.SelectDomainView, 'Select a Domain', menu_cond=lambda *_: False) + appbuilder.add_api(api.OAuth) appbuilder.add_api(api.DataSetChangeAPI) + oauth2_server.config_oauth2(app) + app.before_request_funcs.setdefault(None, []).append( hq_domain.before_request_hook ) diff --git a/hq_superset/api.py b/hq_superset/api.py index 3f5b702..5473c58 100644 --- a/hq_superset/api.py +++ b/hq_superset/api.py @@ -1,9 +1,9 @@ import json from http import HTTPStatus -from flask import request -from flask_appbuilder.api import BaseApi -from flask_appbuilder.baseviews import expose +from flask import jsonify, request +from flask_appbuilder.api import BaseApi, expose +from sqlalchemy.orm.exc import NoResultFound from superset.superset_typing import FlaskResponse from superset.views.base import ( handle_api_exception, @@ -12,6 +12,27 @@ ) from .models import DataSetChange +from .oauth2_server import authorization, require_oauth + + +class OAuth(BaseApi): + + def __init__(self): + super().__init__() + self.route_base = "/oauth" + + @expose("/token", methods=('POST',)) + def issue_access_token(self): + try: + response = authorization.create_token_response() + except NoResultFound: + return jsonify({"error": "Invalid client"}), 401 + + if response.status_code >= 400: + return response + + data = json.loads(response.data.decode("utf-8")) + return jsonify(data) class DataSetChangeAPI(BaseApi): @@ -28,6 +49,7 @@ def __init__(self): @expose('/change/', methods=('POST',)) @handle_api_exception + @require_oauth() def post_dataset_change(self) -> FlaskResponse: if request.content_length > self.MAX_REQUEST_LENGTH: return json_error_response( diff --git a/hq_superset/const.py b/hq_superset/const.py new file mode 100644 index 0000000..1fde1b5 --- /dev/null +++ b/hq_superset/const.py @@ -0,0 +1,4 @@ +# The name of the database for storing data related to CommCare HQ +HQ_DATABASE_NAME = "HQ Data" + +OAUTH2_DATABASE_NAME = "oauth2-server-data" diff --git a/hq_superset/exceptions.py b/hq_superset/exceptions.py index 3d0e37c..3eb5f9d 100644 --- a/hq_superset/exceptions.py +++ b/hq_superset/exceptions.py @@ -8,7 +8,3 @@ class HQAPIException(Exception): class OAuthSessionExpired(Exception): pass - - -class UnitTestingOnly(Exception): - pass diff --git a/hq_superset/hq_domain.py b/hq_superset/hq_domain.py index d9cc02b..de137d4 100644 --- a/hq_superset/hq_domain.py +++ b/hq_superset/hq_domain.py @@ -26,6 +26,7 @@ def after_request_hook(response): "AuthDBView.login", "SelectDomainView.list", "SelectDomainView.select", + "OAuth.issue_access_token", "DataSetChangeAPI.post_dataset_change", "appbuilder.static", "static", diff --git a/hq_superset/models.py b/hq_superset/models.py index 425420b..01b17e6 100644 --- a/hq_superset/models.py +++ b/hq_superset/models.py @@ -1,7 +1,20 @@ +import secrets +import string +import time +import uuid from dataclasses import dataclass from typing import Any -from .utils import cast_data_for_table, get_hq_database +from authlib.integrations.sqla_oauth2 import ( + OAuth2ClientMixin, + OAuth2TokenMixin, +) +from cryptography.fernet import MultiFernet +from sqlalchemy import update +from superset import db + +from .const import OAUTH2_DATABASE_NAME +from .utils import cast_data_for_table, get_fernet_keys, get_hq_database @dataclass @@ -32,3 +45,71 @@ def update_dataset(self): rows = list(cast_data_for_table(self.data, table)) insert_stmt = table.insert().values(rows) connection.execute(insert_stmt) + + +class OAuth2Client(db.Model, OAuth2ClientMixin): + __bind_key__ = OAUTH2_DATABASE_NAME + __tablename__ = 'hq_oauth_client' + + domain = db.Column(db.String(255), primary_key=True) + client_secret = db.Column(db.String(255)) # more chars for encryption + + def get_client_secret(self): + keys = get_fernet_keys() + fernet = MultiFernet(keys) + + ciphertext_bytes = self.client_secret.encode('utf-8') + plaintext_bytes = fernet.decrypt(ciphertext_bytes) + return plaintext_bytes.decode('utf-8') + + def set_client_secret(self, plaintext): + keys = get_fernet_keys() + fernet = MultiFernet(keys) + + plaintext_bytes = plaintext.encode('utf-8') + ciphertext_bytes = fernet.encrypt(plaintext_bytes) + self.client_secret = ciphertext_bytes.decode('utf-8') + + def check_client_secret(self, plaintext): + return self.get_client_secret() == plaintext + + def revoke_tokens(self): + revoked_at = int(time.time()) + stmt = ( + update(OAuth2Token) + .where(OAuth2Token.client_id == self.client_id) + .where(OAuth2Token.access_token_revoked_at == 0) + .values(access_token_revoked_at=revoked_at) + ) + db.session.execute(stmt) + db.session.commit() + + @classmethod + def get_by_domain(cls, domain): + return db.session.query(OAuth2Client).filter_by(domain=domain).first() + + @classmethod + def create_domain_client(cls, domain: str): + alphabet = string.ascii_letters + string.digits + client_secret = ''.join(secrets.choice(alphabet) for i in range(64)) + client = OAuth2Client( + domain=domain, + client_id=str(uuid.uuid4()), + ) + client.set_client_secret(client_secret) + client.set_client_metadata({"grant_types": ["client_credentials"]}) + db.session.add(client) + db.session.commit() + return client + + +class OAuth2Token(db.Model, OAuth2TokenMixin): + __bind_key__ = OAUTH2_DATABASE_NAME + __tablename__ = 'hq_oauth_token' + + id = db.Column(db.Integer, primary_key=True) + + @property + def domain(self): + client = OAuth2Client.get_by_client_id(self.client_id) + return client.domain diff --git a/hq_superset/oauth2_server.py b/hq_superset/oauth2_server.py new file mode 100644 index 0000000..aca16d1 --- /dev/null +++ b/hq_superset/oauth2_server.py @@ -0,0 +1,51 @@ +from datetime import timedelta + +from authlib.integrations.flask_oauth2 import ( + AuthorizationServer, + ResourceProtector, +) +from authlib.integrations.sqla_oauth2 import ( + create_bearer_token_validator, + create_query_client_func, + create_revocation_endpoint, +) +from authlib.oauth2.rfc6749 import grants + +from .models import OAuth2Client, OAuth2Token, db + + +def save_token(token, request): + client = request.client + client.revoke_tokens() + + one_day = 24 * 60 * 60 + token = OAuth2Token( + client_id=client.client_id, + token_type=token['token_type'], + access_token=token['access_token'], + scope=client.domain, + expires_in=one_day, + ) + db.session.add(token) + db.session.commit() + + +query_client = create_query_client_func(db.session, OAuth2Client) +authorization = AuthorizationServer( + query_client=query_client, + save_token=save_token, +) +require_oauth = ResourceProtector() + + +def config_oauth2(app): + authorization.init_app(app) + authorization.register_grant(grants.ClientCredentialsGrant) + + # support revocation + revocation_cls = create_revocation_endpoint(db.session, OAuth2Token) + authorization.register_endpoint(revocation_cls) + + # protect resource + bearer_cls = create_bearer_token_validator(db.session, OAuth2Token) + require_oauth.register_token_validator(bearer_cls()) diff --git a/hq_superset/services.py b/hq_superset/services.py index ab4e887..a5c6ad9 100644 --- a/hq_superset/services.py +++ b/hq_superset/services.py @@ -1,10 +1,12 @@ +import logging import os from datetime import datetime +from urllib.parse import urljoin import pandas import sqlalchemy import superset -from flask import g +from flask import g, request, url_for from sqlalchemy.dialects import postgresql from superset import db from superset.connectors.sqla.models import SqlaTable @@ -13,6 +15,7 @@ from .hq_requests import HQRequest from .hq_url import datasource_details, datasource_export, datasource_subscribe +from .models import OAuth2Client from .utils import ( convert_to_array, get_column_dtypes, @@ -23,10 +26,13 @@ ) from .exceptions import HQAPIException +logger = logging.getLogger(__name__) + + +def download_and_subscribe_to_datasource(domain, datasource_id): + hq_request = HQRequest(url=datasource_export(domain, datasource_id)) + response = hq_request.get() -def download_datasource(provider, oauth_token, domain, datasource_id): - datasource_url = get_datasource_export_url(domain, datasource_id) - response = provider.get(datasource_url, token=oauth_token) if response.status_code != 200: raise HQAPIException("Error downloading the UCR export from HQ") @@ -145,6 +151,35 @@ def dataframe_to_sql(df, replace=False): raise ex +def subscribe_to_hq_datasource(domain, datasource_id): + hq_client = OAuth2Client.get_by_domain(domain) + if hq_client is None: + hq_client = OAuth2Client.create_domain_client(domain) + + hq_request = HQRequest(url=datasource_subscribe(domain, datasource_id)) + webhook_url = urljoin( + request.root_url, + url_for('DataSetChangeAPI.post_dataset_change'), + ) + token_url = urljoin(request.root_url, url_for('OAuth.issue_access_token')) + response = hq_request.post({ + 'webhook_url': webhook_url, + 'token_url': token_url, + 'client_id': hq_client.client_id, + 'client_secret': hq_client.get_client_secret(), + }) + if response.status_code == 201: + return + if response.status_code < 500: + logger.error( + f"Failed to subscribe to data source {datasource_id} due to the following issue: {response.data}" + ) + if response.status_code >= 500: + logger.exception( + f"Failed to subscribe to data source {datasource_id} due to a remote server error" + ) + + class AsyncImportHelper: def __init__(self, domain, datasource_id): self.domain = domain diff --git a/hq_superset/tasks.py b/hq_superset/tasks.py index 2710a17..66ed506 100644 --- a/hq_superset/tasks.py +++ b/hq_superset/tasks.py @@ -1,12 +1,9 @@ -import logging import os from superset.extensions import celery_app from .services import AsyncImportHelper, refresh_hq_datasource -logger = logging.getLogger(__name__) - @celery_app.task(name='refresh_hq_datasource_task') def refresh_hq_datasource_task(domain, datasource_id, display_name, export_path, datasource_defn, user_id): diff --git a/hq_superset/tests/base_test.py b/hq_superset/tests/base_test.py index eed23ee..b94e0be 100644 --- a/hq_superset/tests/base_test.py +++ b/hq_superset/tests/base_test.py @@ -12,8 +12,6 @@ from hq_superset.utils import DOMAIN_PREFIX, get_hq_database -from .utils import setup_hq_db - superset_test_home = os.path.join(os.path.dirname(__file__), ".test_superset") shutil.rmtree(superset_test_home, ignore_errors=True) os.environ["SUPERSET_HOME"] = superset_test_home @@ -35,7 +33,7 @@ class HQDBTestCase(SupersetTestCase): def setUp(self): super(HQDBTestCase, self).setUp() - self.hq_db = setup_hq_db() + self.hq_db = get_hq_database() def tearDown(self): # Drop HQ DB Schemas diff --git a/hq_superset/tests/config_for_tests.py b/hq_superset/tests/config_for_tests.py index 90dba45..aa7173f 100644 --- a/hq_superset/tests/config_for_tests.py +++ b/hq_superset/tests/config_for_tests.py @@ -16,17 +16,21 @@ from flask_appbuilder.security.manager import AUTH_OAUTH from hq_superset import flask_app_mutator, oauth +from hq_superset.const import OAUTH2_DATABASE_NAME WTF_CSRF_ENABLED = False TESTING = True SECRET_KEY = 'abc' +FERNET_KEYS = [ + '0fXurIGyQM4HQYoe7feuwV8c1Kz_88BdmCNutLKiO38=', # Don't reuse this! +] # Any other additional roles to be assigned to the user on top of the base role # Note: by design we cannot use AUTH_USER_REGISTRATION_ROLE to # specify more than one role AUTH_USER_ADDITIONAL_ROLES = ["sql_lab"] -HQ_DATA_DB = "postgresql://commcarehq:commcarehq@localhost:5432/test_superset_hq" +HQ_DATABASE_URI = "postgresql://commcarehq:commcarehq@localhost:5432/test_superset_hq" AUTH_TYPE = AUTH_OAUTH @@ -48,6 +52,9 @@ ] SQLALCHEMY_DATABASE_URI = "sqlite:///:memory:" +SQLALCHEMY_BINDS = { + OAUTH2_DATABASE_NAME: 'sqlite:///test_oauth2.db' +} SHARED_DIR = "shared_dir" ENABLE_ASYNC_UCR_IMPORTS = True CACHE_CONFIG = { diff --git a/hq_superset/tests/utils.py b/hq_superset/tests/const.py similarity index 74% rename from hq_superset/tests/utils.py rename to hq_superset/tests/const.py index b0aec8b..005ad7f 100644 --- a/hq_superset/tests/utils.py +++ b/hq_superset/tests/const.py @@ -1,30 +1,3 @@ -from functools import wraps - -from hq_superset.exceptions import UnitTestingOnly -from hq_superset.utils import HQ_DB_CONNECTION_NAME - - -def unit_testing_only(fn): - import superset - - @wraps(fn) - def inner(*args, **kwargs): - if not superset.app.config.get('TESTING'): - raise UnitTestingOnly( - 'You may only call {} during unit testing'.format(fn.__name__)) - return fn(*args, **kwargs) - return inner - - -@unit_testing_only -def setup_hq_db(): - import superset - from superset.utils.database import get_or_create_db - - db_uri = superset.app.config['HQ_DATA_DB'] - return get_or_create_db(HQ_DB_CONNECTION_NAME, db_uri) - - TEST_DATASOURCE = { "configured_filter": { "filters": [ @@ -135,9 +108,3 @@ def setup_hq_db(): "id": "test1_ucr1", "resource_uri": "/a/demo/api/v0.5/ucr_data_source/52a134da12c9b801bd85d2122901b30c/" } - -TEST_UCR_CSV = """\ -doc_id,inserted_at,data_visit_date_eaece89e,data_visit_number_33d63739,data_lmp_date_5e24b993,data_visit_comment_fb984fda -a1, 2021-12-20, 2022-01-19, 100, 2022-02-20, some_text -a2, 2021-12-22, 2022-02-19, 10, 2022-03-20, some_other_text -""" diff --git a/hq_superset/tests/test_hq_domain.py b/hq_superset/tests/test_hq_domain.py index 23b89e9..68cd5ed 100644 --- a/hq_superset/tests/test_hq_domain.py +++ b/hq_superset/tests/test_hq_domain.py @@ -13,11 +13,11 @@ from hq_superset.utils import ( SESSION_USER_DOMAINS_KEY, DomainSyncUtil, + get_hq_database, get_schema_name_for_domain, ) from .base_test import HQDBTestCase, SupersetTestCase -from .utils import setup_hq_db MOCK_DOMAIN_SESSION = { SESSION_USER_DOMAINS_KEY:[ @@ -116,7 +116,7 @@ class TestDomainSyncUtil(HQDBTestCase): def setUp(self): super(TestDomainSyncUtil, self).setUp() self.domain = 'test-domain' - setup_hq_db() + get_hq_database() def test_schema_gets_created(self): schema_name = get_schema_name_for_domain(self.domain) diff --git a/hq_superset/tests/test_utils.py b/hq_superset/tests/test_utils.py index 548fea4..2802c61 100644 --- a/hq_superset/tests/test_utils.py +++ b/hq_superset/tests/test_utils.py @@ -2,7 +2,7 @@ from hq_superset.utils import get_column_dtypes -from .utils import TEST_DATASOURCE +from .const import TEST_DATASOURCE def test_get_column_dtypes(): diff --git a/hq_superset/tests/test_views.py b/hq_superset/tests/test_views.py index 904374c..9c15fe0 100644 --- a/hq_superset/tests/test_views.py +++ b/hq_superset/tests/test_views.py @@ -14,7 +14,7 @@ ) from .base_test import HQDBTestCase -from .utils import TEST_DATASOURCE +from .const import TEST_DATASOURCE class MockResponse: @@ -199,7 +199,7 @@ def test_non_user_domain_cant_be_selected(self): self.assertTrue('/domain/list' in response.request.path) self.logout(client) - @patch('hq_superset.views.get_valid_cchq_oauth_token', return_value={}) + @patch('hq_superset.hq_requests.get_valid_cchq_oauth_token', return_value={}) def test_datasource_list(self, *args): def _do_assert(datasources): self.assert_template_used("hq_datasource_list.html") @@ -231,7 +231,8 @@ def test_datasource_upload(self, *args): 'ds1' ) - @patch('hq_superset.views.get_valid_cchq_oauth_token', return_value={}) + @patch('hq_superset.hq_requests.get_valid_cchq_oauth_token', return_value={}) + @patch('hq_superset.services.subscribe_to_hq_datasource') @patch('hq_superset.views.os.remove') def test_trigger_datasource_refresh(self, *args): from hq_superset.views import ( @@ -246,10 +247,12 @@ def test_trigger_datasource_refresh(self, *args): def _test_sync_or_async(ds_size, routing_method, user_id): - with patch("hq_superset.views.download_datasource") as download_ds_mock, \ - patch("hq_superset.views.get_datasource_defn") as ds_defn_mock, \ - patch(routing_method) as refresh_mock, \ - patch("hq_superset.views.g") as mock_g: + with ( + patch("hq_superset.views.download_and_subscribe_to_datasource") as download_ds_mock, + patch("hq_superset.views.get_datasource_defn") as ds_defn_mock, + patch(routing_method) as refresh_mock, + patch("hq_superset.views.g") as mock_g + ): mock_g.user = UserMock() download_ds_mock.return_value = file_path, ds_size ds_defn_mock.return_value = TEST_DATASOURCE diff --git a/hq_superset/utils.py b/hq_superset/utils.py index f40d6fc..1383907 100644 --- a/hq_superset/utils.py +++ b/hq_superset/utils.py @@ -8,31 +8,42 @@ import pandas import sqlalchemy +from cryptography.fernet import Fernet +from flask import current_app from flask_login import current_user from sqlalchemy.sql import TableClause +from superset.utils.database import get_or_create_db +from .const import HQ_DATABASE_NAME from .exceptions import DatabaseMissing DOMAIN_PREFIX = "hqdomain_" SESSION_USER_DOMAINS_KEY = "user_hq_domains" SESSION_OAUTH_RESPONSE_KEY = "oauth_response" -HQ_DB_CONNECTION_NAME = "HQ Data" def get_hq_database(): - # Todo; cache to avoid multiple lookups in single request - from superset import db + """ + Returns the user-created database for datasets imported from + CommCare HQ. If it has not been created and its URI is set in + ``superset_config``, it will create it. Otherwise, it will raise a + ``DatabaseMissing`` exception. + """ + from superset import app, db from superset.models.core import Database try: return ( db.session .query(Database) - .filter_by(database_name=HQ_DB_CONNECTION_NAME) + .filter_by(database_name=HQ_DATABASE_NAME) .one() ) - except sqlalchemy.orm.exc.NoResultFound as err: - raise DatabaseMissing('CommCare HQ database missing') from err + except sqlalchemy.orm.exc.NoResultFound: + db_uri = app.config.get('HQ_DATABASE_URI') + if db_uri: + return get_or_create_db(HQ_DATABASE_NAME, db_uri) + raise DatabaseMissing('CommCare HQ database missing') def get_schema_name_for_domain(domain): @@ -158,6 +169,31 @@ def get_datasource_file(path): yield zipfile.open(filename) +def get_fernet_keys(): + return [ + Fernet(encoded(key, 'ascii')) + for key in current_app.config['FERNET_KEYS'] + ] + + +def encoded(string_maybe, encoding): + """ + Returns ``string_maybe`` encoded with ``encoding``, otherwise + returns it unchanged. + + >>> encoded('abc', 'utf-8') + b'abc' + >>> encoded(b'abc', 'ascii') + b'abc' + >>> encoded(123, 'utf-8') + 123 + + """ + if hasattr(string_maybe, 'encode'): + return string_maybe.encode(encoding) + return string_maybe + + def convert_to_array(string_array): """ Converts the string representation of a list to a list. diff --git a/hq_superset/views.py b/hq_superset/views.py index c231e91..c3106f9 100644 --- a/hq_superset/views.py +++ b/hq_superset/views.py @@ -17,19 +17,17 @@ from superset.views.base import BaseSupersetView from .hq_domain import user_domains -from .oauth import get_valid_cchq_oauth_token from .hq_url import datasource_list from .hq_requests import HQRequest from .services import ( AsyncImportHelper, - download_datasource, + download_and_subscribe_to_datasource, get_datasource_defn, refresh_hq_datasource, ) from .tasks import refresh_hq_datasource_task from .utils import ( DomainSyncUtil, - get_datasource_list_url, get_hq_database, get_schema_name_for_domain, ) @@ -126,12 +124,8 @@ def trigger_datasource_refresh(domain, datasource_id, display_name): ) return redirect("/tablemodelview/list/") - provider = superset.appbuilder.sm.oauth_remotes["commcare"] - token = get_valid_cchq_oauth_token() - path, size = download_datasource(provider, token, domain, datasource_id) - datasource_defn = get_datasource_defn( - provider, token, domain, datasource_id - ) + path, size = download_and_subscribe_to_datasource(domain, datasource_id) + datasource_defn = get_datasource_defn(domain, datasource_id) if size < ASYNC_DATASOURCE_IMPORT_LIMIT_IN_BYTES: refresh_hq_datasource( domain, datasource_id, display_name, path, datasource_defn, None diff --git a/superset_config.example.py b/superset_config.example.py index b899d4e..7d6ef37 100644 --- a/superset_config.example.py +++ b/superset_config.example.py @@ -13,17 +13,36 @@ from sentry_sdk.integrations.flask import FlaskIntegration from hq_superset import flask_app_mutator, oauth - +from hq_superset.const import OAUTH2_DATABASE_NAME # Use a tool to generate a sufficiently random string, e.g. # $ openssl rand -base64 42 # SECRET_KEY = ... +# [Fernet](https://cryptography.io/en/latest/fernet/) (symmetric +# encryption) is used to encrypt and decrypt client secrets so that the +# same credentials can be used to subscribe to many data sources. +# +# FERNET_KEYS is a list of keys where the first key is the current one, +# the second is the previous one, etc. Encryption uses the first key. +# Decryption is attempted with each key in turn. +# +# To generate a key: +# >>> from cryptography.fernet import Fernet +# >>> Fernet.generate_key() +# Keys can be bytes or strings. +# FERNET_KEYS = [...] + AUTH_TYPE = AUTH_OAUTH # Authenticate with CommCare HQ # AUTH_TYPE = AUTH_DB # Authenticate with Superset user DB -# Override this to reflect your local Postgres DB -SQLALCHEMY_DATABASE_URI = 'postgresql://postgres:postgres@localhost:5433/superset_meta' +# Override these for your databases for Superset and HQ Data +SQLALCHEMY_DATABASE_URI = 'postgresql://postgres:postgres@localhost:5432/superset' +SQLALCHEMY_BINDS = { + OAUTH2_DATABASE_NAME: 'postgresql://postgres:postgres@localhost:5432/superset_oauth2' +} + +HQ_DATABASE_URI = "postgresql://commcarehq:commcarehq@localhost:5432/superset_hq_data" # Populate with oauth credentials from your local CommCareHQ OAUTH_PROVIDERS = [ @@ -123,6 +142,10 @@ class CeleryConfig: 'pt': {'flag':'pt', 'name':'Portuguese'} } +OAUTH2_TOKEN_EXPIRES_IN = { + 'client_credentials': 86400, +} + # CommCare Analytics extensions FLASK_APP_MUTATOR = flask_app_mutator CUSTOM_SECURITY_MANAGER = oauth.CommCareSecurityManager From cffc0db379a5558b5856e675a1a117af3ceed1a1 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Thu, 29 Feb 2024 17:30:35 +0000 Subject: [PATCH 07/27] Add migration for OAuth 2.0 server models --- README.md | 90 ++++++++++---- hq_superset/migrations/README | 1 + hq_superset/migrations/alembic.ini | 115 ++++++++++++++++++ hq_superset/migrations/env.py | 77 ++++++++++++ hq_superset/migrations/script.py.mako | 26 ++++ ...4_23-53_56d0467ff6ff_added_oauth_tables.py | 67 ++++++++++ 6 files changed, 349 insertions(+), 27 deletions(-) create mode 100644 hq_superset/migrations/README create mode 100644 hq_superset/migrations/alembic.ini create mode 100644 hq_superset/migrations/env.py create mode 100644 hq_superset/migrations/script.py.mako create mode 100644 hq_superset/migrations/versions/2024-02-24_23-53_56d0467ff6ff_added_oauth_tables.py diff --git a/README.md b/README.md index 0e590b1..2d86b7c 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,14 @@ This is a Python package that integrates Superset and CommCare HQ. Local Development ----------------- -Follow below instructions. +### Preparing CommCare HQ -### Setup env +The 'User configurable reports UI' feature flag must be enabled for the +domain in CommCare HQ, even if the data sources to be imported were +created by Report Builder, not a UCR. + + +### Setting up a dev environment While doing development on top of this integration, it's useful to install this via `pip -e` option so that any changes made get reflected @@ -51,11 +56,12 @@ directly without another `pip install`. Read through the initialization instructions at https://superset.apache.org/docs/installation/installing-superset-from-scratch/#installing-and-initializing-superset. -Create the database. These instructions assume that PostgreSQL is -running on localhost, and that its user is "commcarehq". Adapt -accordingly: +Create a database for Superset, and a database for storing data from +CommCare HQ. Adapt the username and database names to suit your +environment. ```bash -$ createdb -h localhost -p 5432 -U commcarehq superset_meta +$ createdb -h localhost -p 5432 -U postgres superset +$ createdb -h localhost -p 5432 -U postgres superset_hq_data ``` Set the following environment variables: @@ -64,10 +70,17 @@ $ export FLASK_APP=superset $ export SUPERSET_CONFIG_PATH=/path/to/superset_config.py ``` -Initialize the database. Create an administrator. Create default roles +Set this environment variable to allow OAuth 2.0 authentication with +CommCare HQ over insecure HTTP. (DO NOT USE THIS IN PRODUCTION.) +```bash +$ export AUTHLIB_INSECURE_TRANSPORT=1 +``` + +Initialize the databases. Create an administrator. Create default roles and permissions: ```bash $ superset db upgrade +$ superset db upgrade --directory hq_superset/migrations/ $ superset fab create-admin $ superset load_examples # (Optional) $ superset init @@ -78,28 +91,16 @@ You should now be able to run superset using the `superset run` command: ```bash $ superset run -p 8088 --with-threads --reload --debugger ``` -However, OAuth login does not work yet as hq-superset needs a Postgres -database created to store CommCare HQ data. - -### Create a Postgres Database Connection for storing HQ data - -- Create a Postgres database. e.g. - ```bash - $ createdb -h localhost -p 5432 -U commcarehq hq_data - ``` -- Log into Superset as the admin user created in the Superset - installation and initialization. Note that you will need to update - `AUTH_TYPE = AUTH_DB` to log in as admin user. `AUTH_TYPE` should be - otherwise set to `AUTH_OAUTH`. -- Go to 'Data' -> 'Databases' or http://127.0.0.1:8088/databaseview/list/ -- Create a database connection by clicking '+ DATABASE' button at the top. -- The name of the DISPLAY NAME should be 'HQ Data' exactly, as this is - the name by which this codebase refers to the Postgres DB. - -OAuth integration should now be working. You can log in as a CommCare -HQ web user. +You can now log in as a CommCare HQ web user. +In order for CommCare HQ to sync data source changes, you will need to +allow OAuth 2.0 authentication over insecure HTTP. (DO NOT USE THIS IN +PRODUCTION.) Set this environment variable in your CommCare HQ Django +server. (Yes, it's "OAUTHLIB" this time, not "AUTHLIB" as before.) +```bash +$ export OAUTHLIB_INSECURE_TRANSPORT=1 +``` ### Importing UCRs using Redis and Celery @@ -129,6 +130,41 @@ code you want to test will need to be in a module whose dependencies don't include Superset. +### Creating a migration + +You will need to create an Alembic migration for any new SQLAlchemy +models that you add. The Superset CLI should allow you to do this: + +```shell +$ superset db revision --autogenerate -m "Add table for Foo model" +``` + +However, problems with this approach have occurred in the past. You +might have more success by using Alembic directly. You will need to +modify the configuration a little to do this: + +1. Copy the "HQ_DATA" database URI from `superset_config.py`. + +2. Paste it as the value of `sqlalchemy.url` in + `hq_superset/migrations/alembic.ini`. + +3. Edit `env.py` and comment out the following lines: + ``` + hq_data_uri = current_app.config['SQLALCHEMY_BINDS'][HQ_DATA] + decoded_uri = urllib.parse.unquote(hq_data_uri) + config.set_main_option('sqlalchemy.url', decoded_uri) + ``` + +Those changes will allow Alembic to connect to the "HD Data" database +without the need to instantiate Superset's Flask app. You can now +autogenerate your new table with: + +```shell +$ cd hq_superset/migrations/ +$ alembic revision --autogenerate -m "Add table for Foo model" +``` + + Upgrading Superset ------------------ diff --git a/hq_superset/migrations/README b/hq_superset/migrations/README new file mode 100644 index 0000000..98e4f9c --- /dev/null +++ b/hq_superset/migrations/README @@ -0,0 +1 @@ +Generic single-database configuration. \ No newline at end of file diff --git a/hq_superset/migrations/alembic.ini b/hq_superset/migrations/alembic.ini new file mode 100644 index 0000000..f01502d --- /dev/null +++ b/hq_superset/migrations/alembic.ini @@ -0,0 +1,115 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = . + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +file_template = %%(year)d-%%(month).2d-%%(day).2d_%%(hour).2d-%%(minute).2d_%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. +# prepend_sys_path = . + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the python>=3.9 or backports.zoneinfo library. +# Any required deps can installed by adding `alembic[tz]` to the pip requirements +# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to hq_superset/migrations/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "version_path_separator" below. +# version_locations = %(here)s/bar:%(here)s/bat:hq_superset/migrations/versions + +# version path separator; As mentioned above, this is the character used to split +# version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. +# If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. +# Valid values for version_path_separator are: +# +# version_path_separator = : +# version_path_separator = ; +# version_path_separator = space +version_path_separator = os # Use os.pathsep. Default configuration used for new projects. + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = driver://user:pass@localhost/dbname + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the exec runner, execute a binary +# hooks = ruff +# ruff.type = exec +# ruff.executable = %(here)s/.venv/bin/ruff +# ruff.options = --fix REVISION_SCRIPT_FILENAME + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/hq_superset/migrations/env.py b/hq_superset/migrations/env.py new file mode 100644 index 0000000..c89f28b --- /dev/null +++ b/hq_superset/migrations/env.py @@ -0,0 +1,77 @@ +import urllib.parse +from logging.config import fileConfig + +from alembic import context +from flask import current_app +from sqlalchemy import engine_from_config, pool + +from hq_superset.const import OAUTH2_DATABASE_NAME +from hq_superset.models import HQClient + +config = context.config +if config.config_file_name is not None: + fileConfig(config.config_file_name) + +db_uri = current_app.config['SQLALCHEMY_BINDS'][OAUTH2_DATABASE_NAME] +decoded_uri = urllib.parse.unquote(db_uri) +config.set_main_option('sqlalchemy.url', decoded_uri) + +# add your model's MetaData object here for 'autogenerate' support +target_metadata = HQClient.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline() -> None: + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online() -> None: + """Run migrations in 'online' mode. + + In this scenario we need to create an Engine + and associate a connection with the context. + + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section, {}), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + context.configure( + connection=connection, target_metadata=target_metadata + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/hq_superset/migrations/script.py.mako b/hq_superset/migrations/script.py.mako new file mode 100644 index 0000000..fbc4b07 --- /dev/null +++ b/hq_superset/migrations/script.py.mako @@ -0,0 +1,26 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision: str = ${repr(up_revision)} +down_revision: Union[str, None] = ${repr(down_revision)} +branch_labels: Union[str, Sequence[str], None] = ${repr(branch_labels)} +depends_on: Union[str, Sequence[str], None] = ${repr(depends_on)} + + +def upgrade() -> None: + ${upgrades if upgrades else "pass"} + + +def downgrade() -> None: + ${downgrades if downgrades else "pass"} diff --git a/hq_superset/migrations/versions/2024-02-24_23-53_56d0467ff6ff_added_oauth_tables.py b/hq_superset/migrations/versions/2024-02-24_23-53_56d0467ff6ff_added_oauth_tables.py new file mode 100644 index 0000000..0b962f5 --- /dev/null +++ b/hq_superset/migrations/versions/2024-02-24_23-53_56d0467ff6ff_added_oauth_tables.py @@ -0,0 +1,67 @@ +"""Added OAuth tables + +Revision ID: 56d0467ff6ff +Revises: +Create Date: 2024-02-24 23:53:10.289606 +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = '56d0467ff6ff' +down_revision: Union[str, None] = None +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.create_table( + 'hq_oauth_client', + sa.Column('client_id', sa.String(length=48), nullable=True), + sa.Column('client_id_issued_at', sa.Integer(), nullable=False), + sa.Column('client_secret_expires_at', sa.Integer(), nullable=False), + sa.Column('client_metadata', sa.Text(), nullable=True), + sa.Column('domain', sa.String(length=255), nullable=False), + sa.Column('client_secret', sa.String(length=255), nullable=True), + sa.PrimaryKeyConstraint('domain'), + info={'bind_key': 'oauth2-server-data'}, + ) + op.create_index( + op.f('ix_hq_oauth_client_client_id'), + 'hq_oauth_client', + ['client_id'], + unique=False, + ) + op.create_table( + 'hq_oauth_token', + sa.Column('client_id', sa.String(length=48), nullable=True), + sa.Column('token_type', sa.String(length=40), nullable=True), + sa.Column('access_token', sa.String(length=255), nullable=False), + sa.Column('refresh_token', sa.String(length=255), nullable=True), + sa.Column('scope', sa.Text(), nullable=True), + sa.Column('issued_at', sa.Integer(), nullable=False), + sa.Column('access_token_revoked_at', sa.Integer(), nullable=False), + sa.Column('refresh_token_revoked_at', sa.Integer(), nullable=False), + sa.Column('expires_in', sa.Integer(), nullable=False), + sa.Column('id', sa.Integer(), nullable=False), + sa.PrimaryKeyConstraint('id'), + sa.UniqueConstraint('access_token'), + info={'bind_key': 'oauth2-server-data'}, + ) + op.create_index( + op.f('ix_hq_oauth_token_refresh_token'), + 'hq_oauth_token', + ['refresh_token'], + unique=False, + ) + + +def downgrade() -> None: + op.drop_table('hq_oauth_token') + op.drop_index( + op.f('ix_hq_oauth_client_client_id'), table_name='hq_oauth_client' + ) + op.drop_table('hq_oauth_client') From f0276dd445030e0866b83283d38a752481a3f940 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 4 Mar 2024 17:37:10 +0000 Subject: [PATCH 08/27] Rename endpoint --- hq_superset/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hq_superset/api.py b/hq_superset/api.py index 5473c58..8e0d932 100644 --- a/hq_superset/api.py +++ b/hq_superset/api.py @@ -43,7 +43,7 @@ class DataSetChangeAPI(BaseApi): MAX_REQUEST_LENGTH = 10 * 1024 * 1024 # reject JSON requests > 10MB def __init__(self): - self.route_base = '/hq_webhook' + self.route_base = '/commcarehq_dataset' self.default_view = 'post_dataset_change' super().__init__() From 9f4b053d4afefc203a0774ee87bf7d19a4a3af09 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 4 Mar 2024 17:37:59 +0000 Subject: [PATCH 09/27] Move `create_domain_client()` into a function --- hq_superset/models.py | 21 --------------------- hq_superset/services.py | 30 ++++++++++++++++++++++++------ 2 files changed, 24 insertions(+), 27 deletions(-) diff --git a/hq_superset/models.py b/hq_superset/models.py index 01b17e6..0f5799a 100644 --- a/hq_superset/models.py +++ b/hq_superset/models.py @@ -1,7 +1,4 @@ -import secrets -import string import time -import uuid from dataclasses import dataclass from typing import Any @@ -84,24 +81,6 @@ def revoke_tokens(self): db.session.execute(stmt) db.session.commit() - @classmethod - def get_by_domain(cls, domain): - return db.session.query(OAuth2Client).filter_by(domain=domain).first() - - @classmethod - def create_domain_client(cls, domain: str): - alphabet = string.ascii_letters + string.digits - client_secret = ''.join(secrets.choice(alphabet) for i in range(64)) - client = OAuth2Client( - domain=domain, - client_id=str(uuid.uuid4()), - ) - client.set_client_secret(client_secret) - client.set_client_metadata({"grant_types": ["client_credentials"]}) - db.session.add(client) - db.session.commit() - return client - class OAuth2Token(db.Model, OAuth2TokenMixin): __bind_key__ = OAUTH2_DATABASE_NAME diff --git a/hq_superset/services.py b/hq_superset/services.py index a5c6ad9..63824b8 100644 --- a/hq_superset/services.py +++ b/hq_superset/services.py @@ -1,5 +1,8 @@ import logging import os +import secrets +import string +import uuid from datetime import datetime from urllib.parse import urljoin @@ -152,10 +155,7 @@ def dataframe_to_sql(df, replace=False): def subscribe_to_hq_datasource(domain, datasource_id): - hq_client = OAuth2Client.get_by_domain(domain) - if hq_client is None: - hq_client = OAuth2Client.create_domain_client(domain) - + client = _get_or_create_oauth2client(domain) hq_request = HQRequest(url=datasource_subscribe(domain, datasource_id)) webhook_url = urljoin( request.root_url, @@ -165,8 +165,8 @@ def subscribe_to_hq_datasource(domain, datasource_id): response = hq_request.post({ 'webhook_url': webhook_url, 'token_url': token_url, - 'client_id': hq_client.client_id, - 'client_secret': hq_client.get_client_secret(), + 'client_id': client.client_id, + 'client_secret': client.get_client_secret(), }) if response.status_code == 201: return @@ -180,6 +180,24 @@ def subscribe_to_hq_datasource(domain, datasource_id): ) +def _get_or_create_oauth2client(domain): + client = db.session.query(OAuth2Client).filter_by(domain=domain).first() + if client: + return client + + alphabet = string.ascii_letters + string.digits + client_secret = ''.join(secrets.choice(alphabet) for i in range(64)) + client = OAuth2Client( + domain=domain, + client_id=str(uuid.uuid4()), + ) + client.set_client_secret(client_secret) + client.set_client_metadata({"grant_types": ["client_credentials"]}) + db.session.add(client) + db.session.commit() + return client + + class AsyncImportHelper: def __init__(self, domain, datasource_id): self.domain = domain From d30cc792ecd236157356504bda5270fd9430a461 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 4 Mar 2024 17:51:52 +0000 Subject: [PATCH 10/27] Generate secret in its own function --- hq_superset/services.py | 9 +++------ hq_superset/utils.py | 7 +++++++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/hq_superset/services.py b/hq_superset/services.py index 63824b8..9dd7ff9 100644 --- a/hq_superset/services.py +++ b/hq_superset/services.py @@ -1,7 +1,5 @@ import logging import os -import secrets -import string import uuid from datetime import datetime from urllib.parse import urljoin @@ -16,6 +14,7 @@ from superset.extensions import cache_manager from superset.sql_parse import Table +from .exceptions import HQAPIException from .hq_requests import HQRequest from .hq_url import datasource_details, datasource_export, datasource_subscribe from .models import OAuth2Client @@ -25,9 +24,9 @@ get_datasource_file, get_hq_database, get_schema_name_for_domain, + generate_secret, parse_date, ) -from .exceptions import HQAPIException logger = logging.getLogger(__name__) @@ -185,13 +184,11 @@ def _get_or_create_oauth2client(domain): if client: return client - alphabet = string.ascii_letters + string.digits - client_secret = ''.join(secrets.choice(alphabet) for i in range(64)) client = OAuth2Client( domain=domain, client_id=str(uuid.uuid4()), ) - client.set_client_secret(client_secret) + client.set_client_secret(generate_secret()) client.set_client_metadata({"grant_types": ["client_credentials"]}) db.session.add(client) db.session.commit() diff --git a/hq_superset/utils.py b/hq_superset/utils.py index 1383907..aa8ceab 100644 --- a/hq_superset/utils.py +++ b/hq_superset/utils.py @@ -1,4 +1,6 @@ import ast +import secrets +import string import sys from contextlib import contextmanager from datetime import date, datetime @@ -276,3 +278,8 @@ def cast_data_for_table( else: cast_row[column] = value yield cast_row + + +def generate_secret(): + alphabet = string.ascii_letters + string.digits + return ''.join(secrets.choice(alphabet) for __ in range(64)) From ba6286d124b483fe03eb527e0971cfa59e8502b0 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 4 Mar 2024 18:09:59 +0000 Subject: [PATCH 11/27] Use `current_app.url_for()` --- hq_superset/services.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/hq_superset/services.py b/hq_superset/services.py index 9dd7ff9..83ef3a9 100644 --- a/hq_superset/services.py +++ b/hq_superset/services.py @@ -2,12 +2,11 @@ import os import uuid from datetime import datetime -from urllib.parse import urljoin import pandas import sqlalchemy import superset -from flask import g, request, url_for +from flask import g, current_app from sqlalchemy.dialects import postgresql from superset import db from superset.connectors.sqla.models import SqlaTable @@ -156,11 +155,11 @@ def dataframe_to_sql(df, replace=False): def subscribe_to_hq_datasource(domain, datasource_id): client = _get_or_create_oauth2client(domain) hq_request = HQRequest(url=datasource_subscribe(domain, datasource_id)) - webhook_url = urljoin( - request.root_url, - url_for('DataSetChangeAPI.post_dataset_change'), + webhook_url = current_app.url_for( + 'DataSetChangeAPI.post_dataset_change', + _external=True, ) - token_url = urljoin(request.root_url, url_for('OAuth.issue_access_token')) + token_url = current_app.url_for('OAuth.issue_access_token', _external=True) response = hq_request.post({ 'webhook_url': webhook_url, 'token_url': token_url, From 875ff55342997eb94e7dfc76fb3a4ebcaa862129 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 4 Mar 2024 18:47:56 +0000 Subject: [PATCH 12/27] Fix migration --- hq_superset/migrations/env.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hq_superset/migrations/env.py b/hq_superset/migrations/env.py index c89f28b..7ee95b2 100644 --- a/hq_superset/migrations/env.py +++ b/hq_superset/migrations/env.py @@ -6,7 +6,7 @@ from sqlalchemy import engine_from_config, pool from hq_superset.const import OAUTH2_DATABASE_NAME -from hq_superset.models import HQClient +from hq_superset.models import OAuth2Client config = context.config if config.config_file_name is not None: @@ -17,7 +17,7 @@ config.set_main_option('sqlalchemy.url', decoded_uri) # add your model's MetaData object here for 'autogenerate' support -target_metadata = HQClient.metadata +target_metadata = OAuth2Client.metadata # other values from the config, defined by the needs of env.py, # can be acquired: From 0974df5717c0c4efb5c2502d67f89ce38c2f084f Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 4 Mar 2024 18:51:44 +0000 Subject: [PATCH 13/27] Delete Alembic-generated README --- hq_superset/migrations/README | 1 - 1 file changed, 1 deletion(-) delete mode 100644 hq_superset/migrations/README diff --git a/hq_superset/migrations/README b/hq_superset/migrations/README deleted file mode 100644 index 98e4f9c..0000000 --- a/hq_superset/migrations/README +++ /dev/null @@ -1 +0,0 @@ -Generic single-database configuration. \ No newline at end of file From 3ea55a6dbe4d2393f96ae703f87a206be8e74889 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 4 Mar 2024 20:24:24 +0000 Subject: [PATCH 14/27] Raise TableMissing exception --- hq_superset/exceptions.py | 4 ++++ hq_superset/models.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/hq_superset/exceptions.py b/hq_superset/exceptions.py index 3eb5f9d..ffbd888 100644 --- a/hq_superset/exceptions.py +++ b/hq_superset/exceptions.py @@ -8,3 +8,7 @@ class HQAPIException(Exception): class OAuthSessionExpired(Exception): pass + + +class TableMissing(Exception): + pass diff --git a/hq_superset/models.py b/hq_superset/models.py index 0f5799a..a8402d8 100644 --- a/hq_superset/models.py +++ b/hq_superset/models.py @@ -11,6 +11,7 @@ from superset import db from .const import OAUTH2_DATABASE_NAME +from .exceptions import TableMissing from .utils import cast_data_for_table, get_fernet_keys, get_hq_database @@ -28,7 +29,7 @@ def update_dataset(self): if table.table_name == self.data_source_id )) except StopIteration: - raise ValueError(f'{self.data_source_id} table not found.') + raise TableMissing(f'{self.data_source_id} table not found.') table = sqla_table.get_sqla_table_object() with ( From aedcb42c4c4740db82364c462eadb3713ddf7d37 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 4 Mar 2024 20:32:39 +0000 Subject: [PATCH 15/27] Sort `DOMAIN_EXCLUDED_VIEWS` --- hq_superset/hq_domain.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/hq_superset/hq_domain.py b/hq_superset/hq_domain.py index de137d4..0229124 100644 --- a/hq_superset/hq_domain.py +++ b/hq_superset/hq_domain.py @@ -19,17 +19,17 @@ def after_request_hook(response): DOMAIN_EXCLUDED_VIEWS = [ - "AuthOAuthView.login", - "AuthOAuthView.logout", - "AuthOAuthView.oauth_authorized", - "AuthDBView.logout", - "AuthDBView.login", - "SelectDomainView.list", - "SelectDomainView.select", - "OAuth.issue_access_token", - "DataSetChangeAPI.post_dataset_change", - "appbuilder.static", - "static", + 'AuthDBView.login', + 'AuthDBView.logout', + 'AuthOAuthView.login', + 'AuthOAuthView.logout', + 'AuthOAuthView.oauth_authorized', + 'DataSetChangeAPI.post_dataset_change', + 'OAuth.issue_access_token', + 'SelectDomainView.list', + 'SelectDomainView.select', + 'appbuilder.static', + 'static', ] From af4878fd749de71fedc0571b2c53347e54a07a0c Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Tue, 5 Mar 2024 14:52:50 +0000 Subject: [PATCH 16/27] README: Setting local user authentication --- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2d86b7c..af60543 100644 --- a/README.md +++ b/README.md @@ -102,8 +102,19 @@ server. (Yes, it's "OAUTHLIB" this time, not "AUTHLIB" as before.) $ export OAUTHLIB_INSECURE_TRANSPORT=1 ``` -### Importing UCRs using Redis and Celery +### Logging in as a local admin user + +There might be situations where you need to log into Superset as a local +admin user, for example, to add a database connection. To enable local +user authentication, in `superset_config.py`, set +`AUTH_TYPE = AUTH_DB`. + +To return to allowing CommCare HQ users to log in, set it back to +`AUTH_TYPE = AUTH_OAUTH`. + + +### Importing UCRs using Redis and Celery Celery is used to import UCRs that are larger than `hq_superset.views.ASYNC_DATASOURCE_IMPORT_LIMIT_IN_BYTES`. If you need From 2c0c766c44a87f1014008d7bf89815657bb28c8b Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 6 Mar 2024 16:25:35 +0000 Subject: [PATCH 17/27] Docstring to explain how deletes are represented --- hq_superset/models.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hq_superset/models.py b/hq_superset/models.py index a8402d8..5af2616 100644 --- a/hq_superset/models.py +++ b/hq_superset/models.py @@ -22,6 +22,13 @@ class DataSetChange: data: list[dict[str, Any]] def update_dataset(self): + """ + Updates a dataset with ``self.data``. + + ``self.data`` represents the current state of a UCR data source + for a form or a case, which is identified by ``self.doc_id``. If + the form or case has been deleted, then the list will be empty. + """ database = get_hq_database() try: sqla_table = next(( From ceb8f800f5313674c94eed74b932363431cf277a Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 6 Mar 2024 16:39:49 +0000 Subject: [PATCH 18/27] Add comment to README and config about AUTH_TYPE --- README.md | 4 ++++ superset_config.example.py | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index af60543..0083143 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,10 @@ admin user, for example, to add a database connection. To enable local user authentication, in `superset_config.py`, set `AUTH_TYPE = AUTH_DB`. +Doing this will prevent CommCare HQ users from logging in, so it should +only be done in production environments when CommCare Analytics is not +in use. + To return to allowing CommCare HQ users to log in, set it back to `AUTH_TYPE = AUTH_OAUTH`. diff --git a/superset_config.example.py b/superset_config.example.py index 7d6ef37..936a680 100644 --- a/superset_config.example.py +++ b/superset_config.example.py @@ -33,8 +33,9 @@ # Keys can be bytes or strings. # FERNET_KEYS = [...] -AUTH_TYPE = AUTH_OAUTH # Authenticate with CommCare HQ -# AUTH_TYPE = AUTH_DB # Authenticate with Superset user DB +# Authentication backend +AUTH_TYPE = AUTH_OAUTH # Authenticate with CommCare HQ (only) +# AUTH_TYPE = AUTH_DB # Authenticate with Superset user DB (only) # Override these for your databases for Superset and HQ Data SQLALCHEMY_DATABASE_URI = 'postgresql://postgres:postgres@localhost:5432/superset' From a017594503bd3475e21b7c30c331ff904d731649 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 6 Mar 2024 16:41:10 +0000 Subject: [PATCH 19/27] Clean up unused setting --- superset_config.example.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/superset_config.example.py b/superset_config.example.py index 936a680..7dda59b 100644 --- a/superset_config.example.py +++ b/superset_config.example.py @@ -143,10 +143,6 @@ class CeleryConfig: 'pt': {'flag':'pt', 'name':'Portuguese'} } -OAUTH2_TOKEN_EXPIRES_IN = { - 'client_credentials': 86400, -} - # CommCare Analytics extensions FLASK_APP_MUTATOR = flask_app_mutator CUSTOM_SECURITY_MANAGER = oauth.CommCareSecurityManager From ab8d89f3f2614164e3bd8cb8c39b479d301e7ab2 Mon Sep 17 00:00:00 2001 From: mkangia Date: Fri, 8 Mar 2024 14:35:48 +0530 Subject: [PATCH 20/27] pass scheme to url_for --- hq_superset/services.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hq_superset/services.py b/hq_superset/services.py index 83ef3a9..280a29e 100644 --- a/hq_superset/services.py +++ b/hq_superset/services.py @@ -155,11 +155,13 @@ def dataframe_to_sql(df, replace=False): def subscribe_to_hq_datasource(domain, datasource_id): client = _get_or_create_oauth2client(domain) hq_request = HQRequest(url=datasource_subscribe(domain, datasource_id)) + scheme = 'http' if current_app.config.get('OAUTHLIB_INSECURE_TRANSPORT') else 'https' webhook_url = current_app.url_for( 'DataSetChangeAPI.post_dataset_change', _external=True, + _scheme=scheme, ) - token_url = current_app.url_for('OAuth.issue_access_token', _external=True) + token_url = current_app.url_for('OAuth.issue_access_token', _external=True, _scheme=scheme) response = hq_request.post({ 'webhook_url': webhook_url, 'token_url': token_url, From 029fd9db9255c01be8f0235a5a6d6d308e8e90fd Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 11 Mar 2024 15:37:36 +0000 Subject: [PATCH 21/27] Get URL scheme from request --- hq_superset/services.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/hq_superset/services.py b/hq_superset/services.py index 280a29e..5013aad 100644 --- a/hq_superset/services.py +++ b/hq_superset/services.py @@ -6,7 +6,7 @@ import pandas import sqlalchemy import superset -from flask import g, current_app +from flask import g, current_app, request from sqlalchemy.dialects import postgresql from superset import db from superset.connectors.sqla.models import SqlaTable @@ -155,13 +155,16 @@ def dataframe_to_sql(df, replace=False): def subscribe_to_hq_datasource(domain, datasource_id): client = _get_or_create_oauth2client(domain) hq_request = HQRequest(url=datasource_subscribe(domain, datasource_id)) - scheme = 'http' if current_app.config.get('OAUTHLIB_INSECURE_TRANSPORT') else 'https' webhook_url = current_app.url_for( 'DataSetChangeAPI.post_dataset_change', _external=True, - _scheme=scheme, + _scheme=request.scheme, + ) + token_url = current_app.url_for( + 'OAuth.issue_access_token', + _external=True, + _scheme=request.scheme, ) - token_url = current_app.url_for('OAuth.issue_access_token', _external=True, _scheme=scheme) response = hq_request.post({ 'webhook_url': webhook_url, 'token_url': token_url, From 9042fb6e37d1e5a2f9d9a19e264f62c615ce09fd Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Mon, 11 Mar 2024 22:19:33 +0000 Subject: [PATCH 22/27] Determine URL scheme from `request.server` --- hq_superset/services.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/hq_superset/services.py b/hq_superset/services.py index 5013aad..0626d3f 100644 --- a/hq_superset/services.py +++ b/hq_superset/services.py @@ -155,15 +155,16 @@ def dataframe_to_sql(df, replace=False): def subscribe_to_hq_datasource(domain, datasource_id): client = _get_or_create_oauth2client(domain) hq_request = HQRequest(url=datasource_subscribe(domain, datasource_id)) + scheme = _get_url_scheme() webhook_url = current_app.url_for( 'DataSetChangeAPI.post_dataset_change', _external=True, - _scheme=request.scheme, + _scheme=scheme, ) token_url = current_app.url_for( 'OAuth.issue_access_token', _external=True, - _scheme=request.scheme, + _scheme=scheme, ) response = hq_request.post({ 'webhook_url': webhook_url, @@ -183,6 +184,17 @@ def subscribe_to_hq_datasource(domain, datasource_id): ) +def _get_url_scheme(): + scheme = 'https' + # Allow "http" for localhost only. Use request.server because + # request.scheme can return "http" for HTTPS requests. :facepalm: + if request.server: + host, port = request.server + if host == '127.0.0.1': # Also True if hostname is "localhost" + scheme = 'http' + return scheme + + def _get_or_create_oauth2client(domain): client = db.session.query(OAuth2Client).filter_by(domain=domain).first() if client: From d5830d97a990e0bf811d68de1fe71777aec8e526 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 13 Mar 2024 17:47:19 +0000 Subject: [PATCH 23/27] More detail in exception message --- hq_superset/services.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/hq_superset/services.py b/hq_superset/services.py index 0626d3f..e302d1a 100644 --- a/hq_superset/services.py +++ b/hq_superset/services.py @@ -51,7 +51,10 @@ def get_datasource_defn(domain, datasource_id): hq_request = HQRequest(url=datasource_details(domain, datasource_id)) response = hq_request.get() if response.status_code != 200: - raise HQAPIException("Error downloading the UCR definition from HQ") + raise HQAPIException( + "Error downloading the UCR definition from HQ: " + f"HTTP status {response.status_code}: {response.content}" + ) return response.json() From 96858c4214348acaa08f2a83ca8bc9542b060c5c Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 13 Mar 2024 17:48:12 +0000 Subject: [PATCH 24/27] Debug logging for authlib --- hq_superset/oauth2_server.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/hq_superset/oauth2_server.py b/hq_superset/oauth2_server.py index aca16d1..fb5ce0a 100644 --- a/hq_superset/oauth2_server.py +++ b/hq_superset/oauth2_server.py @@ -1,4 +1,5 @@ -from datetime import timedelta +import logging +import sys from authlib.integrations.flask_oauth2 import ( AuthorizationServer, @@ -39,6 +40,10 @@ def save_token(token, request): def config_oauth2(app): + authlib_logger = logging.getLogger('authlib') + authlib_logger.addHandler(logging.StreamHandler(sys.stdout)) + authlib_logger.setLevel(logging.DEBUG) + authorization.init_app(app) authorization.register_grant(grants.ClientCredentialsGrant) From 35d447e1a7aaae49666c38248f770a1d013a6ab7 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 13 Mar 2024 21:31:36 +0000 Subject: [PATCH 25/27] What is `request`? --- hq_superset/oauth2_server.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hq_superset/oauth2_server.py b/hq_superset/oauth2_server.py index fb5ce0a..b1dab2b 100644 --- a/hq_superset/oauth2_server.py +++ b/hq_superset/oauth2_server.py @@ -5,6 +5,7 @@ AuthorizationServer, ResourceProtector, ) +from authlib.integrations.flask_oauth2.requests import FlaskOAuth2Request from authlib.integrations.sqla_oauth2 import ( create_bearer_token_validator, create_query_client_func, @@ -15,7 +16,7 @@ from .models import OAuth2Client, OAuth2Token, db -def save_token(token, request): +def save_token(token: dict, request: FlaskOAuth2Request) -> None: client = request.client client.revoke_tokens() From 4c8977a371132f21ff2312f34742c217e44dd778 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 13 Mar 2024 22:25:23 +0000 Subject: [PATCH 26/27] There is a method for that --- hq_superset/__init__.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/hq_superset/__init__.py b/hq_superset/__init__.py index 11d6d1b..ba8390a 100644 --- a/hq_superset/__init__.py +++ b/hq_superset/__init__.py @@ -17,12 +17,8 @@ def flask_app_mutator(app): appbuilder.add_api(api.DataSetChangeAPI) oauth2_server.config_oauth2(app) - app.before_request_funcs.setdefault(None, []).append( - hq_domain.before_request_hook - ) - app.after_request_funcs.setdefault(None, []).append( - hq_domain.after_request_hook - ) + app.before_request(hq_domain.before_request_hook) + app.after_request(hq_domain.after_request_hook) app.strict_slashes = False override_jinja2_template_loader(app) From 49d575aa47411ba211e1dda3ec260600c6dd64c4 Mon Sep 17 00:00:00 2001 From: Norman Hooper Date: Wed, 13 Mar 2024 22:32:44 +0000 Subject: [PATCH 27/27] Set AUTHLIB_INSECURE_TRANSPORT (Urgh!) --- hq_superset/__init__.py | 7 +++++++ hq_superset/services.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/hq_superset/__init__.py b/hq_superset/__init__.py index ba8390a..4e0c7c2 100644 --- a/hq_superset/__init__.py +++ b/hq_superset/__init__.py @@ -22,6 +22,13 @@ def flask_app_mutator(app): app.strict_slashes = False override_jinja2_template_loader(app) + # A proxy (maybe) is changing the URL scheme from "https" to "http" + # on commcare-analytics-staging.dimagi.com, which breaks the OAuth + # 2.0 secure transport check despite transport being over HTTPS. I + # hate to do this, but werkzeug.contrib.fixers.ProxyFix didn't fix + # it. So I've run out of better options. (Norman 2024-03-13) + os.environ['AUTHLIB_INSECURE_TRANSPORT'] = '1' + def override_jinja2_template_loader(app): # Allow loading templates from the templates directory in this project as well diff --git a/hq_superset/services.py b/hq_superset/services.py index e302d1a..758068d 100644 --- a/hq_superset/services.py +++ b/hq_superset/services.py @@ -190,7 +190,7 @@ def subscribe_to_hq_datasource(domain, datasource_id): def _get_url_scheme(): scheme = 'https' # Allow "http" for localhost only. Use request.server because - # request.scheme can return "http" for HTTPS requests. :facepalm: + # request.scheme can return "http" for HTTPS requests (because proxy?) if request.server: host, port = request.server if host == '127.0.0.1': # Also True if hostname is "localhost"