[2/2] [a] Fix: Can't use curl to download a single manifest in one in…

…vocation (#5918) Add a wait parameter option to the manifest endpoint
DataBiosphere · Aug 20, 2024 · 99cbd30 · 99cbd30
1 parent 0b60723
commit 99cbd30
Show file tree

Hide file tree

Showing 4 changed files with 110 additions and 31 deletions.
diff --git a/lambdas/service/app.py b/lambdas/service/app.py
@@ -739,6 +739,13 @@ def validate_json_param(name: str, value: str) -> MutableJSON:
         raise BRE(f'The {name!r} parameter is not valid JSON')
 
 
+def validate_wait(wait: str | None):
+    if wait is None or wait in ['0', '1']:
+        pass
+    else:
+        raise ValueError
+
+
 class Mandatory:
     """
     Validation wrapper signifying that a parameter is mandatory.
@@ -1313,6 +1320,29 @@ def get_summary():
                                              authentication=request.authentication)
 
 
+def wait_parameter_spec(*, default: int | None = None) -> JSON:
+    valid_values = [0, 1]
+    assert default in (None, *valid_values), default
+    return params.query(
+        'wait',
+        schema.optional(
+            schema.enum(*valid_values)
+            if default is None else
+            schema.with_default(default, type_=schema.enum(*valid_values))
+        ),
+        description=fd('''
+            If 0, the client is responsible for honoring the waiting period
+            specified in the `Retry-After` response header. If 1, the server
+            will delay the response in order to consume as much of that waiting
+            period as possible. This parameter should only be set to 1 by
+            clients who can't honor the `Retry-After` header, preventing them
+            from quickly exhausting the maximum number of redirects. If the
+            server cannot wait the full amount, any amount of wait time left
+            will still be returned in the `Retry-After` header of the response.
+        ''')
+    )
+
+
 post_manifest_example_url = (
     f'{app.base_url}/manifest/files'
     f'?catalog={list(config.catalogs.keys())[0]}'
@@ -1347,7 +1377,8 @@ def manifest_route(*, fetch: bool, initiate: bool, curl: bool = False):
             'parameters': [
                 params.path('token', str, description=fd('''
                     An opaque string representing the manifest preparation job
-                '''))
+                ''')),
+                *([] if fetch else [wait_parameter_spec()])
             ]
         },
         method_spec={
@@ -1465,6 +1496,7 @@ def manifest_route(*, fetch: bool, initiate: bool, curl: bool = False):
             '''),
             'parameters': [
                 catalog_param_spec,
+                *([wait_parameter_spec(default=1)] if curl else []),
                 filters_param_spec,
                 params.query(
                     'format',
@@ -1660,24 +1692,32 @@ def _file_manifest(fetch: bool, token_or_key: Optional[str] = None):
         and request.headers.get('content-type') == 'application/x-www-form-urlencoded'
         and request.raw_body != b''
     ):
-        raise BRE('The body must be empty for a POST request of content-type '
-                  '`application/x-www-form-urlencoded` to this endpoint')
+        raise BRE('POST requests to this endpoint must have an empty body if '
+                  'they specify a `Content-Type` header of '
+                  '`application/x-www-form-urlencoded`')
     query_params = request.query_params or {}
     _hoist_parameters(query_params, request)
+    if post:
+        query_params.setdefault('wait', '1')
     if token_or_key is None:
         query_params.setdefault('filters', '{}')
         # We list the `catalog` validator first so that the catalog is validated
         # before any other potentially catalog-dependent validators are invoked
         validate_params(query_params,
                         catalog=validate_catalog,
                         format=validate_manifest_format,
-                        filters=validate_filters)
+                        filters=validate_filters,
+                        **({'wait': validate_wait} if post else {}))
         # Now that the catalog is valid, we can provide the default format that
         # depends on it
         default_format = app.metadata_plugin.manifest_formats[0].value
         query_params.setdefault('format', default_format)
     else:
-        validate_params(query_params)
+        validate_params(query_params,
+                        # If the initial request was a POST to the non-fetch
+                        # endpoint, the 'wait' param will be carried over to
+                        # each subsequent GET request to the non-fetch endpoint.
+                        **({'wait': validate_wait} if not fetch else {}))
     return app.manifest_controller.get_manifest_async(query_params=query_params,
                                                       token_or_key=token_or_key,
                                                       fetch=fetch,
@@ -1724,21 +1764,7 @@ def generate_manifest(event: AnyJSON, _context: LambdaContext):
                 made. If that fails, the UUID of the file will be used instead.
             ''')
         ),
-        params.query(
-            'wait',
-            schema.optional(schema.with_default(0)),
-            description=fd('''
-                If 0, the client is responsible for honoring the waiting period
-                specified in the Retry-After response header. If 1, the server
-                will delay the response in order to consume as much of that
-                waiting period as possible. This parameter should only be set to
-                1 by clients who can't honor the `Retry-After` header,
-                preventing them from quickly exhausting the maximum number of
-                redirects. If the server cannot wait the full amount, any amount
-                of wait time left will still be returned in the Retry-After
-                header of the response.
-            ''')
-        ),
+        wait_parameter_spec(default=0),
         params.query(
             'replica',
             schema.optional(str),
@@ -1881,10 +1907,6 @@ def validate_replica(replica: str) -> None:
         if replica not in ('aws', 'gcp'):
             raise ValueError
 
-    def validate_wait(wait: str) -> None:
-        if wait not in ['0', '1']:
-            raise ValueError
-
     def validate_version(version: str) -> None:
         # This function exists so the repository plugin can be lazily loaded
         # instead of being loaded before `validate_params()` can run. This is

diff --git a/lambdas/service/openapi.json b/lambdas/service/openapi.json
@@ -9631,6 +9631,21 @@
                         },
                         "description": "The name of the catalog to query."
                     },
+                    {
+                        "name": "wait",
+                        "in": "query",
+                        "required": false,
+                        "schema": {
+                            "type": "integer",
+                            "format": "int64",
+                            "enum": [
+                                0,
+                                1
+                            ],
+                            "default": 1
+                        },
+                        "description": "\nIf 0, the client is responsible for honoring the waiting period\nspecified in the `Retry-After` response header. If 1, the server\nwill delay the response in order to consume as much of that waiting\nperiod as possible. This parameter should only be set to 1 by\nclients who can't honor the `Retry-After` header, preventing them\nfrom quickly exhausting the maximum number of redirects. If the\nserver cannot wait the full amount, any amount of wait time left\nwill still be returned in the `Retry-After` header of the response.\n"
+                    },
                     {
                         "name": "filters",
                         "in": "query",
@@ -10988,6 +11003,20 @@
                         "type": "string"
                     },
                     "description": "\nAn opaque string representing the manifest preparation job\n"
+                },
+                {
+                    "name": "wait",
+                    "in": "query",
+                    "required": false,
+                    "schema": {
+                        "type": "integer",
+                        "format": "int64",
+                        "enum": [
+                            0,
+                            1
+                        ]
+                    },
+                    "description": "\nIf 0, the client is responsible for honoring the waiting period\nspecified in the `Retry-After` response header. If 1, the server\nwill delay the response in order to consume as much of that waiting\nperiod as possible. This parameter should only be set to 1 by\nclients who can't honor the `Retry-After` header, preventing them\nfrom quickly exhausting the maximum number of redirects. If the\nserver cannot wait the full amount, any amount of wait time left\nwill still be returned in the `Retry-After` header of the response.\n"
                 }
             ],
             "get": {
@@ -12541,9 +12570,13 @@
                         "schema": {
                             "type": "integer",
                             "format": "int64",
+                            "enum": [
+                                0,
+                                1
+                            ],
                             "default": 0
                         },
-                        "description": "\nIf 0, the client is responsible for honoring the waiting period\nspecified in the Retry-After response header. If 1, the server\nwill delay the response in order to consume as much of that\nwaiting period as possible. This parameter should only be set to\n1 by clients who can't honor the `Retry-After` header,\npreventing them from quickly exhausting the maximum number of\nredirects. If the server cannot wait the full amount, any amount\nof wait time left will still be returned in the Retry-After\nheader of the response.\n"
+                        "description": "\nIf 0, the client is responsible for honoring the waiting period\nspecified in the `Retry-After` response header. If 1, the server\nwill delay the response in order to consume as much of that waiting\nperiod as possible. This parameter should only be set to 1 by\nclients who can't honor the `Retry-After` header, preventing them\nfrom quickly exhausting the maximum number of redirects. If the\nserver cannot wait the full amount, any amount of wait time left\nwill still be returned in the `Retry-After` header of the response.\n"
                     },
                     {
                         "name": "replica",
@@ -12677,9 +12710,13 @@
                         "schema": {
                             "type": "integer",
                             "format": "int64",
+                            "enum": [
+                                0,
+                                1
+                            ],
                             "default": 0
                         },
-                        "description": "\nIf 0, the client is responsible for honoring the waiting period\nspecified in the Retry-After response header. If 1, the server\nwill delay the response in order to consume as much of that\nwaiting period as possible. This parameter should only be set to\n1 by clients who can't honor the `Retry-After` header,\npreventing them from quickly exhausting the maximum number of\nredirects. If the server cannot wait the full amount, any amount\nof wait time left will still be returned in the Retry-After\nheader of the response.\n"
+                        "description": "\nIf 0, the client is responsible for honoring the waiting period\nspecified in the `Retry-After` response header. If 1, the server\nwill delay the response in order to consume as much of that waiting\nperiod as possible. This parameter should only be set to 1 by\nclients who can't honor the `Retry-After` header, preventing them\nfrom quickly exhausting the maximum number of redirects. If the\nserver cannot wait the full amount, any amount of wait time left\nwill still be returned in the `Retry-After` header of the response.\n"
                     },
                     {
                         "name": "replica",

diff --git a/src/azul/service/manifest_controller.py b/src/azul/service/manifest_controller.py
@@ -118,6 +118,7 @@ def get_manifest_async(self,
                            query_params: Mapping[str, str],
                            fetch: bool,
                            authentication: Optional[Authentication]):
+        wait = query_params.get('wait')
         if token_or_key is None:
             token, manifest_key = None, None
         else:
@@ -187,7 +188,9 @@ def get_manifest_async(self,
         body: dict[str, int | str | FlatJSON]
 
         if manifest is None:
-            url = self.manifest_url_func(fetch=fetch, token_or_key=token.encode())
+            url = self.manifest_url_func(fetch=fetch,
+                                         token_or_key=token.encode(),
+                                         **({} if wait is None else {'wait': wait}))
             body = {
                 'Status': 301,
                 'Location': str(url),
@@ -225,6 +228,17 @@ def get_manifest_async(self,
                 'CommandLine': self.service.command_lines(manifest, url, authentication)
             }
 
+        if wait is not None:
+            if wait == '0':
+                pass
+            elif wait == '1':
+                retry_after = body.get('Retry-After')
+                if retry_after is not None:
+                    time_slept = self.server_side_sleep(retry_after)
+                    body['Retry-After'] = round(retry_after - time_slept)
+            else:
+                assert False, wait
+
         # Note: Response objects returned without a 'Content-Type' header will
         # be given one of type 'application/json' as default by Chalice.
         # https://aws.github.io/chalice/tutorials/basicrestapi.html#customizing-the-http-response

diff --git a/test/integration_test.py b/test/integration_test.py
@@ -602,8 +602,9 @@ def _test_other_endpoints(self):
     def _test_manifest(self, catalog: CatalogName):
         supported_formats = self.metadata_plugin(catalog).manifest_formats
         assert supported_formats
-        for curl, format in chain(
-            product([False, True], [None, *supported_formats])
+        for curl, wait, format in chain(
+            product([False], [None], [None, *supported_formats]),
+            product([True], [None, 0, 1], [None, *supported_formats])
         ):
             filters = self._manifest_filters(catalog)
             if curl:
@@ -614,9 +615,10 @@ def _test_manifest(self, catalog: CatalogName):
                 fetch_modes = [first_fetch, not first_fetch]
             for fetch in fetch_modes:
                 with self.subTest('manifest', catalog=catalog, format=format,
-                                  fetch=fetch, curl=curl):
+                                  fetch=fetch, curl=curl, wait=wait):
                     args = dict(catalog=catalog,
-                                filters=json.dumps(filters))
+                                filters=json.dumps(filters),
+                                **({} if wait is None else {'wait': wait}))
                     if format is None:
                         format = first(supported_formats)
                     else:
@@ -892,6 +894,10 @@ def _get_url_content(self, method: str, url: furl) -> bytes:
                 retry_after = response.headers.get('Retry-After')
                 if retry_after is not None:
                     retry_after = float(retry_after)
+                    if url.args.get('wait') == 1:
+                        # The wait should have happened server-side and been
+                        # subtracted from the retry-after that was returned.
+                        self.assertEqual(0.0, retry_after)
                     log.info('Sleeping %.3fs to honor Retry-After header', retry_after)
                     time.sleep(retry_after)
                 url = furl(response.headers['Location'])