From 29a4a084806320187bc49399c547f2a1cc8aa6c9 Mon Sep 17 00:00:00 2001 From: Jack Greenlee Date: Thu, 19 Dec 2024 10:20:55 -0500 Subject: [PATCH] improve batch loading of UUIDs stats https://dash.plotly.com/background-callbacks#using-set_props-within-a-callback Implements a 'background' callback to improve the batching of add_user_stats. This avoids relying on a dcc.Interval with a fixed time between batches The new callback, which has background=True, continues running continuously on the server until all of the chunks have been processed. At the end of each chunk it calls set_props, which sends an update to the UI with the latest data for 'store-loaded-uuids' Via `running=`, the loading spinner is disabled during execution of the callback and reinstated when the callback finishes or cancels. When there are 0 loaded-uuids-stats and >0 total uuids (i.e. while we are loading the first batch of 10) a secondary loading spinner is shown in place of the table This should speed things up further and prevent any glitches caused by the fixed interval The Output of the callback is 'all-uuids-stats-loaded', a new store which is not used anywhere. This seems pointless, but is necessary because the callback's initialization depends on its Output. if it has no Output, it initializes on the Overview page causing errors. If the output is 'loaded-uuids-stats', it waits for the callback to finish before showing anything, defeating the purpose of batch loading. Thus, I resorted to making a new store for it --- app_sidebar_collapsible.py | 9 ++- pages/data.py | 140 +++++++++++++++---------------------- requirements.txt | 5 ++ 3 files changed, 70 insertions(+), 84 deletions(-) diff --git a/app_sidebar_collapsible.py b/app_sidebar_collapsible.py index 5c77e5d..c06867f 100644 --- a/app_sidebar_collapsible.py +++ b/app_sidebar_collapsible.py @@ -14,7 +14,7 @@ import dash import dash_bootstrap_components as dbc -from dash import Input, Output, dcc, html, Dash +from dash import Input, Output, dcc, html, Dash, DiskcacheManager import dash_auth import logging import base64 @@ -42,10 +42,17 @@ 'hello': 'world' } +# diskcache manager; required for 'background' callbacks +# https: // dash.plotly.com/background-callbacks +import diskcache +cache = diskcache.Cache("./cache") +background_callback_manager = DiskcacheManager(cache) + app = Dash( external_stylesheets=[dbc.themes.BOOTSTRAP, dbc.icons.FONT_AWESOME], suppress_callback_exceptions=True, use_pages=True, + background_callback_manager=background_callback_manager ) server = app.server # expose server variable for Procfile diff --git a/pages/data.py b/pages/data.py index ca45aec..0ea5ad1 100644 --- a/pages/data.py +++ b/pages/data.py @@ -3,7 +3,7 @@ Since the dcc.Location component is not in the layout when navigating to this page, it triggers the callback. The workaround is to check if the input value is None. """ -from dash import dcc, html, Input, Output, callback, register_page, dash_table, State +from dash import dcc, html, Input, Output, callback, register_page, dash_table, State, set_props # Etc import logging import pandas as pd @@ -31,9 +31,9 @@ ]), html.Div(id='tabs-content'), dcc.Store(id='selected-tab', data='tab-uuids-datatable'), # Store to hold selected tab - dcc.Interval(id='interval-load-more', interval=24000, n_intervals=0), # Interval for loading more data dcc.Store(id='store-uuids', data=[]), # Store to hold the original UUIDs data - dcc.Store(id='store-loaded-uuids', data={'data': [], 'loaded': False}), # Store to track loaded data + dcc.Store(id='loaded-uuids-stats', data=[]), + dcc.Store(id='all-uuids-stats-loaded', data=False), dcc.Store(id='uuids-page-current', data=0), # Store to track current page for UUIDs DataTable # RadioItems for key list switch, wrapped in a div that can hide/show html.Div( @@ -143,10 +143,37 @@ def update_uuids_page_current(page_current, selected_tab): raise PreventUpdate +@callback( + Output('all-uuids-stats-loaded', 'data'), + Input('tabs-datatable', 'value'), + Input('store-uuids', 'data'), + background=True, + # hide the global spinner while callback is running + running=[Output('global-loading', 'display'), 'hide', 'auto'], + # if page changes or tab changes while callback is running, cancel + cancel=[ + Input('url', 'pathname'), + Input('tabs-datatable', 'value') + ], +) +def load_uuids_stats(tab, uuids): + logging.debug("loading uuids stats for tab %s" % tab) + if tab != 'tab-uuids-datatable': + return + + # slice uuids into chunks of 10 + uuids_chunks = [uuids['data'][i:i+10] for i in range(0, len(uuids['data']), 10)] + loaded_stats = [] + for uuids in uuids_chunks: + processed_uuids = db_utils.add_user_stats(uuids, 10) + loaded_stats.extend(processed_uuids) + logging.debug("loaded %s uuids stats: %s" % (len(loaded_stats), loaded_stats)) + set_props('loaded-uuids-stats', {'data': loaded_stats}) + return True + + @callback( Output('tabs-content', 'children'), - Output('store-loaded-uuids', 'data'), - Output('interval-load-more', 'disabled'), # Disable interval when all data is loaded Input('tabs-datatable', 'value'), Input('store-uuids', 'data'), Input('store-excluded-uuids', 'data'), @@ -156,13 +183,11 @@ def update_uuids_page_current(page_current, selected_tab): Input('date-picker', 'start_date'), Input('date-picker', 'end_date'), Input('date-picker-timezone', 'value'), - Input('interval-load-more', 'n_intervals'), # Interval to trigger the loading of more data Input('keylist-switch', 'value'), # Add keylist-switch to trigger data refresh on change Input('uuids-page-current', 'data'), # Current page number for UUIDs DataTable - State('store-loaded-uuids', 'data'), # Use State to track already loaded data - State('store-loaded-uuids', 'loaded') # Keep track if we have finished loading all data + Input('loaded-uuids-stats', 'data'), ) -def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_demographics, store_trajectories, start_date, end_date, timezone, n_intervals, key_list, current_page, loaded_uuids_store, all_data_loaded): +def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_demographics, store_trajectories, start_date, end_date, timezone, key_list, current_page, loaded_uuids): with ect.Timer() as total_timer: initial_batch_size = 10 # Define the batch size for loading UUIDs @@ -172,74 +197,35 @@ def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_de # Initialize return variables content = None - updated_loaded_uuids_store = loaded_uuids_store.copy() if loaded_uuids_store else {'data': [], 'loaded': False} - interval_disabled = True # Handle the UUIDs tab without fullscreen loading spinner if tab == 'tab-uuids-datatable': with ect.Timer() as handle_uuids_timer: - logging.debug(f"Callback - {selected_tab} Stage 2: Handling UUIDs tab.") + # Prepare the data to be displayed + columns = perm_utils.get_uuids_columns() # Get the relevant columns + df = pd.DataFrame(loaded_uuids) - # Ensure store_uuids contains the key 'data' which is a list of dictionaries - if not isinstance(store_uuids, dict) or 'data' not in store_uuids: - logging.error(f"Expected store_uuids to be a dict with a 'data' key, but got {type(store_uuids)}") - content = html.Div([html.P("Data structure error.")]) - interval_disabled = True + if not perm_utils.has_permission('data_uuids'): + logging.debug(f"Callback - {selected_tab} insufficient permission.") + content = html.Div([html.P("No data available or you don't have permission.")]) else: - uuids_list = store_uuids['data'] - - # Ensure uuids_list is a list for slicing - if not isinstance(uuids_list, list): - logging.error(f"Expected store_uuids['data'] to be a list but got {type(uuids_list)}") - content = html.Div([html.P("Data structure error.")]) - interval_disabled = True + if df.empty and len(store_uuids['data']) > 0: + logging.debug(f"Callback - {selected_tab} loaded_uuids is empty.") + content = html.Div( + [dcc.Loading(id='uuids-loading', display='show')], + style={'margin-top': '36px'} + ) else: - loaded_data = updated_loaded_uuids_store.get('data', []) - total_loaded = len(loaded_data) - - # Handle lazy loading - if not updated_loaded_uuids_store.get('loaded', False): - total_to_load = total_loaded + initial_batch_size - total_to_load = min(total_to_load, len(uuids_list)) # Avoid loading more than available - - logging.debug(f"Callback - {selected_tab} Stage 3: Loading next batch of UUIDs from {total_loaded} to {total_to_load}.") - - new_data = uuids_list[total_loaded:total_to_load] - - if new_data: - # Process and append the new data to the loaded store - processed_data = db_utils.add_user_stats(new_data, initial_batch_size) - loaded_data.extend(processed_data) - - # Update the store with the new data by creating a new dict - updated_loaded_uuids_store = { - 'data': loaded_data, - 'loaded': len(loaded_data) >= len(uuids_list) - } - - logging.debug(f"Callback - {selected_tab} Stage 4: New batch loaded. Total loaded: {len(loaded_data)}.") - - # Prepare the data to be displayed - columns = perm_utils.get_uuids_columns() # Get the relevant columns - df = pd.DataFrame(updated_loaded_uuids_store['data']) - - if df.empty or not perm_utils.has_permission('data_uuids'): - logging.debug(f"Callback - {selected_tab} Error Stage: No data available or permission issues.") - content = html.Div([html.P("No data available or you don't have permission.")]) - interval_disabled = True - else: - df = df.drop(columns=[col for col in df.columns if col not in columns]) - - logging.debug(f"Callback - {selected_tab} Stage 5: Returning appended data to update the UI.") - content = html.Div([ - populate_datatable(df, table_id='uuid-table', page_current=current_page), # Pass current_page - html.P( - f"Showing {len(updated_loaded_uuids_store['data'])} of {len(uuids_list)} UUIDs." + - (f" Loading {initial_batch_size} more..." if not updated_loaded_uuids_store.get('loaded', False) else ""), - style={'margin': '15px 5px'} - ) - ]) - interval_disabled = updated_loaded_uuids_store.get('loaded', False) + df = df.drop(columns=[col for col in df.columns if col not in columns]) + logging.debug(f"Callback - {selected_tab} Stage 5: Returning appended data to update the UI.") + content = html.Div([ + populate_datatable(df, table_id='uuid-table', page_current=current_page), # Pass current_page + html.P( + f"Showing {len(loaded_uuids)} of {len(store_uuids['data'])} UUIDs." + + (f" Loading {initial_batch_size} more..." if len(loaded_uuids) < len(store_uuids['data']) else ""), + style={'margin': '15px 5px'} + ) + ]) # Store timing after handling UUIDs tab esdsq.store_dashboard_time( @@ -262,7 +248,6 @@ def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_de if df.empty or not has_perm: logging.debug(f"Callback - {selected_tab} Error Stage: No data available or permission issues.") content = None - interval_disabled = True else: df = df.drop(columns=[col for col in df.columns if col not in columns]) df = clean_location_data(df) @@ -273,7 +258,6 @@ def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_de html.Button('Display columns with raw units', id='button-clicked', n_clicks=0, style={'marginLeft': '5px'}), trips_table ]) - interval_disabled = True # Store timing after handling Trips tab esdsq.store_dashboard_time( @@ -294,14 +278,11 @@ def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_de df = pd.DataFrame(data) if df.empty: content = None - interval_disabled = True else: content = populate_datatable(df) - interval_disabled = True elif len(data) > 1: if not has_perm: content = None - interval_disabled = True else: content = html.Div([ dcc.Tabs(id='subtabs-demographics', value=list(data.keys())[0], children=[ @@ -309,10 +290,8 @@ def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_de ]), html.Div(id='subtabs-demographics-content') ]) - interval_disabled = True else: content = None - interval_disabled = True # Store timing after handling Demographics tab esdsq.store_dashboard_time( @@ -338,17 +317,14 @@ def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_de if df.empty or not has_perm: logging.debug(f"Callback - {selected_tab} Error Stage: No data available or permission issues.") content = None - interval_disabled = True else: df = df.drop(columns=[col for col in df.columns if col not in columns]) datatable = populate_datatable(df) content = datatable - interval_disabled = True else: content = None - interval_disabled = True # Store timing after handling Trajectories tab esdsq.store_dashboard_time( @@ -360,7 +336,6 @@ def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_de else: logging.debug(f"Callback - {selected_tab} Error Stage: No data loaded or unhandled tab.") content = None - interval_disabled = True # Store total timing after all stages esdsq.store_dashboard_time( @@ -368,8 +343,7 @@ def render_content(tab, store_uuids, store_excluded_uuids, store_trips, store_de total_timer ) - return content, updated_loaded_uuids_store, interval_disabled - + return content # Handle subtabs for demographic table when there are multiple surveys @callback( diff --git a/requirements.txt b/requirements.txt index 6182fe8..f3460a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,8 @@ flask-talisman==1.0.0 dash_auth==2.0.0 arrow==1.3.0 dash-leaflet==1.0.7 +# dash[diskcache] dependencies +# from https://github.com/plotly/dash/blob/dev/requirements/diskcache.txt +diskcache>=5.2.1 +multiprocess>=0.70.12 +psutil>=5.8.0