def _get_dataset_files():
    """Return the cached per-dataset metadata files that exist on disk.

    For every dataset id reported by ``get_dataset_list()`` a candidate path
    is built inside the folder returned by ``_create_insee_folder()``; the
    file name is ``_hash("idbank_list" + <dataset id>)``.  Only the candidates
    that are actually present on disk are returned, in dataset order.
    """
    dataset_ids = get_dataset_list().id.unique()
    cache_dir = _create_insee_folder()

    existing_files = []
    for dataset_id in dataset_ids:
        candidate = os.path.join(cache_dir, _hash("idbank_list" + dataset_id))
        if os.path.exists(candidate):
            existing_files.append(candidate)

    return existing_files
pynsee.utils._create_insee_folder import _create_insee_folder -from pynsee.utils._make_dataframe_from_dict import _make_dataframe_from_dict - -import zipfile -import os -import pkg_resources -import pandas as pd - -import logging -logger = logging.getLogger(__name__) - -@lru_cache(maxsize=None) -def _warning_definition_internal_data(): - logger.info( - "Internal package data has been used !\n" - "If some data is missing, please use get_definition !" - ) - - -@lru_cache(maxsize=None) -def get_definition_list(): - """Get a list of concept definitions - - Examples: - >>> from pynsee.metadata import get_definition_list - >>> definition = get_definition_list() - """ - - insee_folder = _create_insee_folder() - - insee_folder_local_def = insee_folder + "/" + "definition" - - if not os.path.exists(insee_folder_local_def): - os.mkdir(insee_folder_local_def) - - list_expected_files = ["all_definitions.csv"] - - list_expected_files = [ - insee_folder + "/definition/" + f for f in list_expected_files - ] - - list_available_file = [not os.path.exists(f) for f in list_expected_files] - - # unzipping raw files - if any(list_available_file): - - zip_file = pkg_resources.resource_stream(__name__, "data/definition.zip") - - with zipfile.ZipFile(zip_file, "r") as zip_ref: - zip_ref.extractall(insee_folder) - - link = "https://api.insee.fr/metadonnees/V1/concepts/definitions" - - request = _request_insee(api_url=link, file_format="application/json") - - data_request = request.json() - - list_data = [] - - for i in range(len(data_request)): - df = _make_dataframe_from_dict(data_request[i]) - df = df.iloc[:, 0:3].reset_index(drop=True).drop_duplicates() - list_data.append(df) - - data = pd.concat(list_data, axis=0) - data = data.reset_index(drop=True) - data.columns = ["ID", "URI", "TITLE_FR"] - - if os.path.exists(list_expected_files[0]): - all_data = pd.read_csv(list_expected_files[0]) - all_data = all_data.iloc[:, 1:10] - all_data = all_data.drop(columns={"URI", "TITLE_FR"}) - data = 
# -*- coding: utf-8 -*-
# Copyright : INSEE, 2021

from functools import lru_cache
from pynsee.utils._request_insee import _request_insee
from pynsee.utils._create_insee_folder import _create_insee_folder
from pynsee.utils._make_dataframe_from_dict import _make_dataframe_from_dict

import zipfile
import os
import pkg_resources
import pandas as pd

import logging
logger = logging.getLogger(__name__)


@lru_cache(maxsize=None)
def _warning_definition_internal_data():
    """Log that packaged definition data was used.

    The ``lru_cache`` decorator means the message is only emitted on the
    first call (until the cache is cleared).
    """
    logger.info(
        "Internal package data has been used !\n"
        "If some data is missing, please use get_definition !"
    )


@lru_cache(maxsize=None)
def get_definition_list():
    """Get a list of concept definitions

    The list of definitions is requested from the INSEE metadata API and,
    when the packaged ``all_definitions.csv`` file is available locally, it
    is left-merged on ``ID`` with the extra columns found in that file.

    Returns:
        pandas.DataFrame: one row per definition with the columns ``ID``,
        ``URI`` and ``TITLE_FR``, plus the columns merged from the packaged
        CSV file when it exists.

    Examples:
        >>> from pynsee.metadata import get_definition_list
        >>> definition = get_definition_list()
    """

    insee_folder = _create_insee_folder()

    insee_folder_local_def = os.path.join(insee_folder, "definition")

    # exist_ok avoids the check-then-create race of exists() + mkdir()
    os.makedirs(insee_folder_local_def, exist_ok=True)

    # built with os.path.join, like every other cache path in the package
    list_expected_files = [
        os.path.join(insee_folder_local_def, f)
        for f in ["all_definitions.csv"]
    ]

    # unzipping raw files when at least one expected file is missing
    if not all(os.path.exists(f) for f in list_expected_files):
        # both the resource stream and the archive are closed on exit
        with pkg_resources.resource_stream(
            __name__, "data/definition.zip"
        ) as zip_file, zipfile.ZipFile(zip_file, "r") as zip_ref:
            zip_ref.extractall(insee_folder)

    link = "https://api.insee.fr/metadonnees/V1/concepts/definitions"

    request = _request_insee(api_url=link, file_format="application/json")

    data_request = request.json()

    # keep the first three columns of every record (renamed below)
    list_data = [
        _make_dataframe_from_dict(item)
        .iloc[:, 0:3]
        .reset_index(drop=True)
        .drop_duplicates()
        for item in data_request
    ]

    data = pd.concat(list_data, axis=0)
    data = data.reset_index(drop=True)
    data.columns = ["ID", "URI", "TITLE_FR"]

    if os.path.exists(list_expected_files[0]):
        all_data = pd.read_csv(list_expected_files[0])
        # NOTE(review): the first CSV column is skipped, presumably a saved
        # index - confirm against the packaged file
        all_data = all_data.iloc[:, 1:10]
        # URI and TITLE_FR already come from the API response
        all_data = all_data.drop(columns=["URI", "TITLE_FR"])
        data = data.merge(all_data, on="ID", how="left")

    _warning_definition_internal_data()

    return data
a/pynsee/utils/_request_insee.py +++ b/pynsee/utils/_request_insee.py @@ -36,9 +36,6 @@ def _request_insee( api_url=None, sdmx_url=None, file_format="application/xml", print_msg=True ): - # sdmx_url = "https://bdm.insee.fr/series/sdmx/data/SERIES_BDM/001688370" - # api_url = "https://api.insee.fr/series/BDM/V1/data/SERIES_BDM/001688370" - # api_url = 'https://api.insee.fr/series/BDM/V1/data/CLIMAT-AFFAIRES/?firstNObservations=4&lastNObservations=1' urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) if api_url is not None: @@ -67,14 +64,10 @@ def _request_insee( # 2- if the api request fails # if api url is missing sdmx url is used - if api_url is not None: token = pynsee.get_config("insee_token") - try: - username = os.environ['USERNAME'] - except Exception: - username = "username" + username = os.environ.get("USERNAME", "username") if token: headers = { @@ -113,54 +106,52 @@ def _request_insee( if success is True: return results - else: - msg = ( - "An error occurred !\n" - "Query : {api_url}\n" - f"{results.text}\n" - "Make sure you have subscribed to all APIs !\n" - "Click on all APIs' icons one by one, select your " - "application, and click on Subscribe" - ) - raise requests.exceptions.RequestException(msg) - else: - # token is None - commands = "\n\ninit_conn(insee_key='my_insee_key', insee_secret='my_insee_secret')\n" msg = ( - "Token missing, please check your credentials " - "on api.insee.fr !\n" - "Please do the following to use your " - f"credentials: {commands}\n\n" - "If your token still does not work, please try to clear " - "the cache :\n " - "from pynsee.utils import clear_all_cache; clear_all_cache()\n" + "An error occurred !\n" + "Query : {api_url}\n" + f"{results.text}\n" + "Make sure you have subscribed to all APIs !\n" + "Click on all APIs' icons one by one, select your " + "application, and click on Subscribe" ) - if sdmx_url is not None: - msg2 = "\nSDMX web service used instead of API" - if print_msg: - logger.critical(msg + 
msg2) + raise requests.exceptions.RequestException(msg) - results = requests.get(sdmx_url, proxies=proxies, verify=False) + # token is None + commands = "\n\ninit_conn(insee_key='my_insee_key', insee_secret='my_insee_secret')\n" + msg = ( + "Token missing, please check your credentials " + "on api.insee.fr !\n" + "Please do the following to use your " + f"credentials: {commands}\n\n" + "If your token still does not work, please try to clear " + "the cache :\n " + "from pynsee.utils import clear_all_cache; clear_all_cache()\n" + ) - if results.status_code == 200: - return results - else: - raise ValueError(results.text + "\n" + sdmx_url) - - else: - raise ValueError(msg) - else: - # api_url is None if sdmx_url is not None: + msg2 = "\nSDMX web service used instead of API" + if print_msg: + logger.critical(msg + msg2) + results = requests.get(sdmx_url, proxies=proxies, verify=False) - print(sdmx_url, results.status_code) if results.status_code == 200: return results - else: - raise ValueError(results.text + "\n" + sdmx_url) - else: - raise ValueError("!!! 
def _request_with_429_error_catch(
    url, proxies, headers, verify, session, retry_delay=10, max_retries=1
):
    """Send a GET request, waiting and retrying on HTTP 429 responses.

    Workaround for rate limitations: when the server answers with status
    code 429, the function sleeps and sends the same request again.

    Args:
        url: address to request.
        proxies: forwarded to ``session.get`` as ``proxies``.
        headers: forwarded to ``session.get`` as ``headers``.
        verify: forwarded to ``session.get`` as ``verify``.
        session: object exposing a ``get`` method, e.g. a
            ``requests.Session``; it is neither created nor closed here.
        retry_delay: seconds to sleep before each retry (default 10).
        max_retries: maximum number of extra attempts after a 429 answer
            (default 1).

    Returns:
        The last response received. It may still carry status code 429 when
        every allowed attempt was rate limited.
    """
    response = session.get(
        url, proxies=proxies, headers=headers, verify=verify)

    attempts_left = max_retries

    while response.status_code == 429 and attempts_left > 0:
        time.sleep(retry_delay)
        response = session.get(
            url, proxies=proxies, headers=headers, verify=verify)
        attempts_left -= 1

    return response
Token is missing, please check that insee_key and " "insee_secret are correct !!!") - else: - try: - username = os.environ['USERNAME'] - except Exception: - username = "username" - - headers = { - "Accept": "application/xml", - "Authorization": "Bearer " + (token or ""), - 'User-Agent': f"python_pynsee_{username}" - } - url_test = "https://api.insee.fr/series/BDM/V1/data/CLIMAT-AFFAIRES" - - session = requests.Session() + username = os.environ.get("USERNAME", "username") + + headers = { + "Accept": "application/xml", + "Authorization": "Bearer " + (token or ""), + 'User-Agent': f"python_pynsee_{username}" + } - request_test = _request_with_429_error_catch(url_test, - session=session, - proxies=proxies, - headers=headers, verify=False) + url_test = "https://api.insee.fr/series/BDM/V1/data/CLIMAT-AFFAIRES" - #request_test = requests.get( - # url_test, proxies=proxies, headers=headers, verify=False) + with requests.Session() as session: + request_test = _request_with_429_error_catch( + url_test, proxies=proxies, headers=headers, verify=False, + session=session) if request_test.status_code != 200: raise ValueError(f"This token is not working: {token}") - queries = [ - "https://api.insee.fr/series/BDM/V1/dataflow/FR1/all", - "https://api.insee.fr/metadonnees/V1/codes/cj/n3/5599", - "https://api.insee.fr/entreprises/sirene/V3/siret?q=activitePrincipaleUniteLegale:86.10*&nombre=1000", - "https://api.insee.fr/donnees-locales/V0.1/donnees/geo-SEXE-DIPL_19@GEO2020RP2017/FE-1.all.all", - ] + queries = [ + "https://api.insee.fr/series/BDM/V1/dataflow/FR1/all", + "https://api.insee.fr/metadonnees/V1/codes/cj/n3/5599", + "https://api.insee.fr/entreprises/sirene/V3/siret?q=activitePrincipaleUniteLegale:86.10*&nombre=1000", + "https://api.insee.fr/donnees-locales/V0.1/donnees/geo-SEXE-DIPL_19@GEO2020RP2017/FE-1.all.all", + ] - apis = ["BDM", "Metadata", "Sirene", "Local Data"] + apis = ["BDM", "Metadata", "Sirene", "Local Data"] - file_format = [ - "application/xml", - 
"application/xml", - "application/json;charset=utf-8", - "application/xml", - ] + file_format = [ + "application/xml", + "application/xml", + "application/json;charset=utf-8", + "application/xml", + ] + + list_requests_status = [] + + for q in range(len(queries)): + headers = { + "Accept": file_format[q], + "Authorization": "Bearer " + token, + 'User-Agent': f"python_pynsee_{username}" + } + + api_url = queries[q] + + results = _request_with_429_error_catch( + api_url, proxies=proxies, headers=headers, verify=False, + session=session) + + + if results.status_code != 200: + logger.critical( + f"Please subscribe to {apis[q]} API on api.insee.fr !" + ) - list_requests_status = [] - - for q in range(len(queries)): - headers = { - "Accept": file_format[q], - "Authorization": "Bearer " + token, - 'User-Agent': f"python_pynsee_{username}" - } - - api_url = queries[q] - - results = _request_with_429_error_catch(api_url, - session=session, - proxies=proxies, - headers=headers, - verify=False) - - #results = requests.get( - # api_url, proxies=proxies, headers=headers, verify=False - #) - - if results.status_code != 200: - logger.critical( - f"Please subscribe to {apis[q]} API on api.insee.fr !" - ) - list_requests_status += [results.status_code] - - # Close the session - session.close() + list_requests_status += [results.status_code] if all([sts == 200 for sts in list_requests_status]): return True