From e00bef57682a0bcb8a9ffee012dec09fb5d9e391 Mon Sep 17 00:00:00 2001 From: Wolf Vollprecht Date: Mon, 9 Jan 2023 10:18:36 +0100 Subject: [PATCH 1/3] initial cep for repodata state --- cep-repodata-state.md | 79 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 cep-repodata-state.md diff --git a/cep-repodata-state.md b/cep-repodata-state.md new file mode 100644 index 00000000..3f7ad2a2 --- /dev/null +++ b/cep-repodata-state.md @@ -0,0 +1,79 @@ + + + + + + + + +
Title .state.json files for repodata metadata
Status Draft
Author(s) Wolf Vollprecht <wolf@prefix.dev>
Created Jan 09, 2023
Updated Jan 09, 2023
Discussion https://conda.slack.com/archives/C017F7C0VM3/p1672669131100819
Implementation https://github.com/mamba-org/mamba/pull/2113
+ +## Abstract + +Changing how conda and mamba store metadata about repodata.json downloads. + +### Motivation + +When conda currently downloads `repodata.json` files from the internet, it stores metadata "inside" the file by adding some JSON keys: + +- `_url`: The URL that was requested +- `_etag`: ETag returned from server +- `_mod`: Last-Modified header from server +- `_cache_control`: Cache-Control header from server + +These are stored as three string values. + +This is not an ideal approach as it modifies the `repodata.json` file and corrupts e.g. the hash of the file. Also, the repodata files have gotten increasingly large, and parsing these state values can require parsing a large `json` file. + +Therefore we propose to store the metadata in a secondary `.state.json` file next to the repodata. + +Another motivating factor is that for the `jlap` proposal we need to (repeatedly) compute the hash value of the `repodata.json` file -- that only gives correct results straight away when the metadata is stored outside the `repodata.json` file. + +Both mamba and conda currently use the same cache folder. If both don't implement the same storage strategy but continue to share the same repodata cache, it would lead to frequent cache busting. 
+ +### Specification + +```json +{ + // we ensure that state.json and .json files are in sync by storing the file + // last modified time in the state file, as well as the file size + "file_mtime": { + // seconds and nanoseconds counted from UNIX timestamp (1970-01-01) + "seconds": "", + "nanoseconds": "" + }, + "file_size": "", // file size in bytes + + // The header values as before + "url": "", + "etag: "", + "mod": "", + "cache_control": "", + + // these are alternative encodings of the repodata.json that + // can be used for faster downloading + // both `has_zst` and `has_jlap` keys are optional but should be kept + // even if the other data times out or `file_mtime` does not match + "has_zst": { + // UTC RFC 3339 timestamp of when we last checked whether the file is available or not + // in this case the `repodata.json.zst` file + // Note: same format as conda TUF spec + "last_checked": "2023-01-08T11:45:44Z", + // false = unavailable, true = available + "value": false + }, + "has_jlap": { + // same format as `has_zst` + } +} +``` + +If the `file_mtime` or `file_size` recorded in the `.state.json` file does not match the actual `mtime` or size of the `.json` file, the header values are discarded. However, the `has_zst` and `has_jlap` values are kept as they are independent of the repodata validity on disk. + +### Backward compatibility + +Older clients that try to reuse the existing cache will not be able to make use of the cached repodata as they do not know about the state (since it's not written to the same location). That means they will redownload the repodata. + +## Copyright + +All CEPs are explicitly [CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/). 
\ No newline at end of file From 33180fc69f855eb5188fa2c64f8160475da20a82 Mon Sep 17 00:00:00 2001 From: Wolf Vollprecht Date: Mon, 9 Jan 2023 10:23:12 +0100 Subject: [PATCH 2/3] fix syntax highlighting --- cep-repodata-state.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cep-repodata-state.md b/cep-repodata-state.md index 3f7ad2a2..3db33a02 100644 --- a/cep-repodata-state.md +++ b/cep-repodata-state.md @@ -33,22 +33,22 @@ Both mamba and conda currently use the same cache folder. If both don't implemen ### Specification -```json +```json5 { // we ensure that state.json and .json files are in sync by storing the file // last modified time in the state file, as well as the file size "file_mtime": { // seconds and nanoseconds counted from UNIX timestamp (1970-01-01) - "seconds": "", - "nanoseconds": "" + "seconds": INTEGER, + "nanoseconds": INTEGER }, - "file_size": "", // file size in bytes + "file_size": INTEGER, // file size in bytes // The header values as before - "url": "", - "etag: "", - "mod": "", - "cache_control": "", + "url": STRING, + "etag": STRING, + "mod": STRING, + "cache_control": STRING, // these are alternative encodings of the repodata.json that // can be used for faster downloading @@ -60,7 +60,7 @@ Both mamba and conda currently use the same cache folder. If both don't implemen // Note: same format as conda TUF spec "last_checked": "2023-01-08T11:45:44Z", // false = unavailable, true = available - "value": false + "value": BOOLEAN }, "has_jlap": { // same format as `has_zst` @@ -76,4 +76,4 @@ Older clients that try to reuse the existing cache will not be able to make use ## Copyright -All CEPs are explicitly [CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/). \ No newline at end of file +All CEPs are explicitly [CC0 1.0 Universal](https://creativecommons.org/publicdomain/zero/1.0/). 
From d34ea8de9138616fadff3497119355d1a3612a35 Mon Sep 17 00:00:00 2001 From: Daniel Holth Date: Mon, 9 Jan 2023 14:00:30 -0500 Subject: [PATCH 3/3] proposed edits --- cep-repodata-state.md | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/cep-repodata-state.md b/cep-repodata-state.md index 3db33a02..48d9b796 100644 --- a/cep-repodata-state.md +++ b/cep-repodata-state.md @@ -21,7 +21,7 @@ When conda currently downloads `repodata.json` files from the internet, it store - `_mod`: Last-Modified header from server - `_cache_control`: Cache-Control header from server -These are stored as three string values. +All four are stored as string values. This is not an ideal approach as it modifies the `repodata.json` file and corrupts e.g. the hash of the file. Also, the repodata files have gotten increasingly large, and parsing these state values can require parsing a large `json` file. @@ -37,12 +37,10 @@ Both mamba and conda currently use the same cache folder. If both don't implemen { // we ensure that state.json and .json files are in sync by storing the file // last modified time in the state file, as well as the file size - "file_mtime": { - // seconds and nanoseconds counted from UNIX timestamp (1970-01-01) - "seconds": INTEGER, - "nanoseconds": INTEGER - }, - "file_size": INTEGER, // file size in bytes + + // file modification time in nanoseconds counted from the UNIX epoch (1970-01-01) + "mtime_ns": INTEGER, + "size": INTEGER, // file size in bytes // The header values as before "url": STRING, @@ -50,7 +48,7 @@ Both mamba and conda currently use the same cache folder. 
If both don't implemen "mod": STRING, "cache_control": STRING, - // these are alternative encodings of the repodata.json that + // these are alternative encodings of the repodata.json that // can be used for faster downloading // both `has_zst` and `has_jlap` keys are optional but should be kept // even if the other data times out or `mtime_ns` does not match @@ -58,18 +56,25 @@ Both mamba and conda currently use the same cache folder. If both don't implemen // UTC RFC 3339 timestamp of when we last checked whether the file is available or not // in this case the `repodata.json.zst` file // Note: same format as conda TUF spec + // Python's time.time_ns() would be convenient? "last_checked": "2023-01-08T11:45:44Z", // false = unavailable, true = available "value": BOOLEAN }, "has_jlap": { // same format as `has_zst` - } + }, + + "jlap": { } // unspecified additional state for jlap when available } ``` If the `mtime_ns` or `size` recorded in the `.state.json` file does not match the actual modification time or size of the `.json` file, the header values are discarded. However, the `has_zst` and `has_jlap` values are kept as they are independent of the repodata validity on disk. +If the client is tracking `repodata.json.zst` or `repodata.jlap` instead of +`(current_)?repodata.json`, then `etag`/`mod`/`cache_control` will correspond to +those remote files, instead of `repodata.json`. + ### Backward compatibility Older clients that try to reuse the existing cache will not be able to make use of the cached repodata as they do not know about the state (since it's not written to the same location). That means they will redownload the repodata.