From 718bf5cdad137134ae535d24355d7b869f906326 Mon Sep 17 00:00:00 2001 From: Martin Raspaud Date: Fri, 13 Sep 2024 15:10:59 +0200 Subject: [PATCH 1/2] Allow including directory name in file names --- src/pytroll_watchers/dataspace_watcher.py | 33 ++++++++++++++++++++++- src/pytroll_watchers/publisher.py | 17 +++++++++--- tests/test_publisher.py | 5 +++- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/src/pytroll_watchers/dataspace_watcher.py b/src/pytroll_watchers/dataspace_watcher.py index 03518de..dc091f8 100644 --- a/src/pytroll_watchers/dataspace_watcher.py +++ b/src/pytroll_watchers/dataspace_watcher.py @@ -6,6 +6,37 @@ Note: The OData and S3 services require two different set of credentials. + +Example of configuration file to retrieve SAR data from dataspace: + +.. code-block:: yaml + + + backend: dataspace + fs_config: + filter_string: "contains(Name,'IW_GRDH')" + dataspace_auth: + netrc_host: catalogue.dataspace.copernicus.eu + storage_options: + profile: copernicus + polling_interval: + minutes: 10 + start_from: + hours: 1 + publisher_config: + name: s1_watcher + nameservers: false + port: 3000 + message_config: + unpack: directory + include_dir_in_uid: true + subject: /segment/s1/l1b/ + atype: file + aliases: + sensor: + SAR: SAR-C + + """ import datetime @@ -135,7 +166,7 @@ def generate_download_links(filter_string, dataspace_auth, storage_options): mda["orbit_number"] = int(attributes["orbitNumber"]) for checksum in metadata["Checksum"]: - if checksum["Algorithm"] == "MD5": + if checksum.get("Algorithm") == "MD5": mda["checksum"] = dict(algorithm=checksum["Algorithm"], hash=checksum["Value"]) break mda["size"] = int(metadata["ContentLength"]) diff --git a/src/pytroll_watchers/publisher.py b/src/pytroll_watchers/publisher.py index bf835e3..9abcd5d 100644 --- a/src/pytroll_watchers/publisher.py +++ b/src/pytroll_watchers/publisher.py @@ -28,7 +28,9 @@ def file_publisher_from_generator(generator, publisher_config, message_config): message_config: The information needed to complete the posttroll message generation. Will be amended with the file metadata, and passed directly to posttroll's Message constructor. If it contains "unpack", it is expected to have the archive type (eg "zip"), or "directory", and the - contents of the archive or directory will be published as a "dataset". + contents of the archive or directory will be published as a "dataset". For the case where "directory" is + used, it is also possible to set the boolean "include_dir_in_uid" to true so that the full relative path + of the file is provided. Side effect: Publishes posttroll messages containing the location of the file with the following fields: @@ -47,12 +49,15 @@ def file_publisher_from_generator(generator, publisher_config, message_config): publisher = create_publisher_from_dict_config(publisher_config) publisher.start() unpack = message_config.pop("unpack", None) + include_dir = message_config.pop("include_dir_in_uid", None) with closing(publisher): for file_item, file_metadata in generator: amended_message_config = deepcopy(message_config) amended_message_config.setdefault("data", {}) if unpack == "directory": - dataset = [_build_file_location(unpacked_file) for unpacked_file in unpack_dir(file_item)] + dir_to_include = file_item.name if include_dir else None + dataset = [_build_file_location(unpacked_file, dir_to_include) + for unpacked_file in unpack_dir(file_item)] amended_message_config["data"]["dataset"] = dataset elif unpack: dataset = [_build_file_location(unpacked_file) for unpacked_file in unpack_archive(file_item, unpack)] @@ -91,10 +96,14 @@ def unpack_dir(path): **path.storage_options) -def _build_file_location(file_item): +def _build_file_location(file_item, include_dir=None): file_location = dict() file_location["uri"] = file_item.as_uri() - file_location["uid"] = file_item.name + if include_dir: + uid = include_dir + file_item.path.rsplit(include_dir, 1)[-1] + else: + uid = file_item.name + file_location["uid"] = uid with suppress(AttributeError): file_location["filesystem"] = json.loads(file_item.fs.to_json()) file_location["path"] = file_item.path diff --git a/tests/test_publisher.py b/tests/test_publisher.py index 1eb39b7..3b50e6d 100644 --- a/tests/test_publisher.py +++ b/tests/test_publisher.py @@ -3,6 +3,7 @@ import os from shutil import make_archive +from posttroll.message import Message from posttroll.testing import patched_publisher from upath import UPath @@ -52,7 +53,7 @@ def test_unpacking_directory(tmp_path): publisher_settings = dict(nameservers=False, port=1979) message_settings = dict(subject="/segment/olci/l2/", atype="dataset", data=dict(sensor="olci"), - unpack="directory") + unpack="directory", include_dir_in_uid=True) with patched_publisher() as messages: file_publisher_from_generator([[path, dict()]], @@ -61,3 +62,5 @@ def test_unpacking_directory(tmp_path): assert "my_dir/file1" in messages[0] assert "my_dir/file2" in messages[0] + msg = Message.decode(messages[0]) + assert msg.data["dataset"][0]["uid"].startswith("my_dir") From d9c937ebe49270867e572428007b53fa0cff3088 Mon Sep 17 00:00:00 2001 From: Martin Raspaud Date: Fri, 13 Sep 2024 15:11:39 +0200 Subject: [PATCH 2/2] Fix dhus to use netrc for unpacking --- src/pytroll_watchers/dhus_watcher.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pytroll_watchers/dhus_watcher.py b/src/pytroll_watchers/dhus_watcher.py index a8f26ba..96a33fc 100644 --- a/src/pytroll_watchers/dhus_watcher.py +++ b/src/pytroll_watchers/dhus_watcher.py @@ -21,12 +21,12 @@ name: s1_watcher message_config: subject: /segment/s1/l1b/ - atype: file + atype: dataset + unpack: zip aliases: sensor: SAR: SAR-C - """ import datetime as dt @@ -129,7 +129,7 @@ def generate_download_links(server, filter_params): for entry in entries["d"]["results"]: mda = dict() - path = UPath(entry["__metadata"]["media_src"]) + path = UPath(entry["__metadata"]["media_src"], client_kwargs=dict(trust_env=True)) mda["boundary"] = _extract_boundary_as_geojson(entry) attributes = _construct_attributes_dict(entry) mda["platform_name"] = attributes["Satellite name"].capitalize() + attributes["Satellite number"]