Skip to content

Commit

Permalink
Merge branch 'master' into lambdas-modern-checksums
Browse files Browse the repository at this point in the history
* master:
  Speedup promote in pkgpush (#3884)
  Catalog: adjust checksums-related constants and logic to adhere to the spec (#3887)
  shared: Tweak checksum types (#3888)
  Add new pkgpush types to quilt-shared (#3896)
  Add copy_file_list_fn parameter for Package._build() (#3895)
  Bump cryptography from 42.0.2 to 42.0.4 in /lambdas/indexer (#3891)
  pkgpush: adjust max_pool_connections (#3890)
  • Loading branch information
nl0 committed Feb 23, 2024
2 parents 66464f0 + 57095a7 commit cf160ff
Show file tree
Hide file tree
Showing 23 changed files with 471 additions and 102 deletions.
19 changes: 17 additions & 2 deletions api/python/quilt3/packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import tempfile
import textwrap
import time
import typing as T
import uuid
import warnings
from collections import deque
Expand Down Expand Up @@ -70,6 +71,16 @@
)


class CopyFileListFn(T.Protocol):
def __call__(
self,
file_list: T.List[T.Tuple[PhysicalKey, PhysicalKey, int]],
message: T.Optional[str] = None,
callback: T.Optional[T.Callable] = None,
) -> T.List[PhysicalKey]:
...


def _fix_docstring(**kwargs):
def f(wrapped):
if sys.flags.optimize < 2:
Expand Down Expand Up @@ -1360,12 +1371,16 @@ def push(

def _push(
self, name, registry=None, dest=None, message=None, selector_fn=None, *,
workflow, print_info, force: bool, dedupe: bool
workflow, print_info, force: bool, dedupe: bool,
copy_file_list_fn: T.Optional[CopyFileListFn] = None,
):
if selector_fn is None:
def selector_fn(*args):
return True

if copy_file_list_fn is None:
copy_file_list_fn = copy_file_list

validate_package_name(name)

if registry is None:
Expand Down Expand Up @@ -1490,7 +1505,7 @@ def check_hash_conficts(latest_hash):
entries.append((logical_key, entry))
file_list.append((physical_key, new_physical_key, entry.size))

results = copy_file_list(file_list, message="Copying objects")
results = copy_file_list_fn(file_list, message="Copying objects")

for (logical_key, entry), versioned_key in zip(entries, results):
# Create a new package entry pointing to the new remote key.
Expand Down
4 changes: 2 additions & 2 deletions catalog/app/containers/Bucket/PackageDialog/PackageDialog.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ import PACKAGE_EXISTS_QUERY from './gql/PackageExists.generated'
export const MAX_UPLOAD_SIZE = 20 * 1000 * 1000 * 1000 // 20GB
// XXX: keep in sync w/ the backend
// NOTE: these limits are lower than the actual "hard" limits on the backend
export const MAX_S3_SIZE = cfg.multipartChecksums
? 10 * 10 ** 12 // 10 TB
export const MAX_S3_SIZE = cfg.chunkedChecksums
? 5 * 10 ** 12 // 5 TB
: 50 * 10 ** 9 // 50 GB
export const MAX_FILE_COUNT = 1000

Expand Down
6 changes: 3 additions & 3 deletions catalog/app/model/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ export type Collaborators = ReadonlyArray<
// Note that the actual user-defined meta is in the `user_meta` field
export type EntryMeta = (Types.JsonRecord & { user_meta?: Types.JsonRecord }) | null

export const CHECKSUM_TYPE_SP = 'SHA256' as const
export const CHECKSUM_TYPE_MP = 'QuiltMultipartSHA256' as const
export const CHECKSUM_TYPE_SHA256 = 'SHA256' as const
export const CHECKSUM_TYPE_SHA256_CHUNKED = 'sha2-256-chunked' as const
export interface Checksum {
type: typeof CHECKSUM_TYPE_SP | typeof CHECKSUM_TYPE_MP
type: typeof CHECKSUM_TYPE_SHA256 | typeof CHECKSUM_TYPE_SHA256_CHUNKED
value: string
}

Expand Down
4 changes: 2 additions & 2 deletions catalog/app/utils/Config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ export interface ConfigJson {
ssoAuth: AuthMethodConfig
ssoProviders: string

multipartChecksums?: boolean
chunkedChecksums?: boolean

build_version?: string // not sure where this comes from
}
Expand Down Expand Up @@ -93,7 +93,7 @@ const transformConfig = (cfg: ConfigJson) => ({
noDownload: !!cfg.noDownload,
noOverviewImages: !!cfg.noOverviewImages,
desktop: !!cfg.desktop,
multipartChecksums: !!cfg.multipartChecksums,
chunkedChecksums: !!cfg.chunkedChecksums,
})

export function prepareConfig(input: unknown) {
Expand Down
15 changes: 11 additions & 4 deletions catalog/app/utils/checksums/checksums-legacy.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,20 @@ import computeFileChecksumLimit from './checksums'
jest.mock(
'constants/config',
jest.fn(() => ({
multipartChecksums: false,
chunkedChecksums: false,
})),
)

describe('utils/checksums', () => {
describe('computeFileChecksumLimit, legacy and singlepart hashing', () => {
it('Hashes using singlepart', async () => {
describe('computeFileChecksumLimit, plain (legacy)', () => {
it('produces a correct checksum given an empty file', async () => {
const file = new File([], 'empty')
await expect(computeFileChecksumLimit(file)).resolves.toEqual({
value: 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855',
type: 'SHA256',
})
})
it('produces a correct checksum given a file with the size below the threshold', async () => {
// Package manifest: https://open.quiltdata.com/b/allencell/tree/.quilt/packages/7acdd948d565d1f22c10f0d5ec4ae99742f04a4849c5e1498f252a0ac1ddeb04
// File in that package: https://open.quiltdata.com/b/allencell/packages/aics/wtc11_short_read_genome_sequence/tree/7acdd948d565d1f22c10f0d5ec4ae99742f04a4849c5e1498f252a0ac1ddeb04/README.md
const fileContents = '# This is a test package\n'
Expand All @@ -23,7 +30,7 @@ describe('utils/checksums', () => {
type: 'SHA256',
})
})
it('Hashes >8Mb file using legacy algorithm', async () => {
it('produces a correct checksum given a file with the size above the threshold', async () => {
// Package manifest: https://open.quiltdata.com/b/allencell/tree/.quilt/packages/38886848f1bad99396b96157101dd52520fa6aae0479adb9de4bde2b12997d92
// File in that package: https://open.quiltdata.com/b/allencell/packages/aics/actk/tree/38886848f1bad99396b96157101dd52520fa6aae0479adb9de4bde2b12997d92/master/diagnosticsheets/diagnostic_sheets/ProteinDisplayName_Alpha-actinin-1_1.png
const readFile = util.promisify(fs.readFile)
Expand Down
23 changes: 15 additions & 8 deletions catalog/app/utils/checksums/checksums.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,32 +7,39 @@ import computeFileChecksumLimit from './checksums'
jest.mock(
'constants/config',
jest.fn(() => ({
multipartChecksums: true,
chunkedChecksums: true,
})),
)

describe('utils/checksums', () => {
describe('computeFileChecksumLimit, multipart hashing', () => {
it('Hashes using singlepart', async () => {
describe('computeFileChecksumLimit, chunked', () => {
it('produces a correct checksum given an empty file', async () => {
const file = new File([], 'empty')
await expect(computeFileChecksumLimit(file)).resolves.toEqual({
value: '47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=',
type: 'sha2-256-chunked',
})
})
it('produces a correct checksum given a file with the size below the threshold', async () => {
// Package manifest: https://open.quiltdata.com/b/allencell/tree/.quilt/packages/7acdd948d565d1f22c10f0d5ec4ae99742f04a4849c5e1498f252a0ac1ddeb04
// File in that package: https://open.quiltdata.com/b/allencell/tree/aics/wtc11_short_read_genome_sequence/README.md?version=qt7oZnXdqJ0vokH1MpXksOiwgqOPPHV2
const fileContents = '# This is a test package\n'
const file = new File([Buffer.from(fileContents)], 'junk/s3.js')
await expect(computeFileChecksumLimit(file)).resolves.toEqual({
value: 'e5c0b36103e96037f4892e73e457ad62753c1c5ad50c3bb0610fad268666f1ea',
type: 'SHA256',
value: 'JvRxeMunq4eK7c1I8s4YNg2ajaE9BtNEg/0pdpEGv58=',
type: 'sha2-256-chunked',
})
})
it('Hashes >8Mb file', async () => {
it('produces a correct checksum given a file with the size above the threshold', async () => {
const readFile = util.promisify(fs.readFile)
const contents = await readFile(path.join(__dirname, './checksums-11mb-test.png'))
const file = new File(
[contents],
'master/diagnosticsheets/diagnostic_sheets/ProteinDisplayName_Alpha-actinin-1_1.png',
)
await expect(computeFileChecksumLimit(file)).resolves.toEqual({
value: '7y6ba70KDRATjq9HezYMTKLluvYXP5g+CSJL9EwnjiY=-2',
type: 'QuiltMultipartSHA256',
value: '7y6ba70KDRATjq9HezYMTKLluvYXP5g+CSJL9EwnjiY=',
type: 'sha2-256-chunked',
})
})
})
Expand Down
22 changes: 10 additions & 12 deletions catalog/app/utils/checksums/checksums.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ const MAX_PARTS = 10000 // Maximum number of parts per upload supported by S3

export function getPartSize(fileSize: number): number | null {
// use single-part upload (and plain SHA256 hash)
if (!cfg.multipartChecksums || fileSize < MIN_PART_SIZE) return null
if (fileSize < MIN_PART_SIZE) return null

// NOTE: in the case where fileSize is exactly equal to MIN_PART_SIZE
// boto creates a 1-part multipart upload :shrug:
Expand All @@ -25,14 +25,14 @@ export function getPartSize(fileSize: number): number | null {
return partSize
}

const singlepart = (value: ArrayBuffer): Model.Checksum => ({
const plain = (value: ArrayBuffer): Model.Checksum => ({
value: Buffer.from(value).toString('hex'),
type: Model.CHECKSUM_TYPE_SP,
type: Model.CHECKSUM_TYPE_SHA256,
})

const multipart = (value: ArrayBuffer, partCount: number): Model.Checksum => ({
value: `${Buffer.from(value).toString('base64')}-${partCount}`,
type: Model.CHECKSUM_TYPE_MP,
const chunked = (value: ArrayBuffer): Model.Checksum => ({
value: Buffer.from(value).toString('base64'),
type: Model.CHECKSUM_TYPE_SHA256_CHUNKED,
})

function mergeBuffers(buffers: ArrayBuffer[]) {
Expand Down Expand Up @@ -60,13 +60,11 @@ const hashBlobLimit = (blob: Blob) => blobLimit(hashBlob, blob)
async function computeFileChecksum(f: File): Promise<Model.Checksum> {
if (!crypto?.subtle?.digest) throw new Error('Crypto API unavailable')

const partSize = getPartSize(f.size)
if (!cfg.chunkedChecksums) return plain(await hashBlobLimit(f))

// single part
if (partSize === null) return singlepart(await hashBlobLimit(f))

// multipart
const partSize = getPartSize(f.size) ?? f.size
const parts: Blob[] = []

let offset = 0
while (offset < f.size) {
const end = Math.min(offset + partSize, f.size)
Expand All @@ -76,7 +74,7 @@ async function computeFileChecksum(f: File): Promise<Model.Checksum> {

const checksums = await Promise.all(parts.map(hashBlobLimit))
const value = await crypto.subtle.digest('SHA-256', mergeBuffers(checksums))
return multipart(value, parts.length)
return chunked(value)
}

const computeFileChecksumLimit = (f: File) => fileLimit(computeFileChecksum, f)
Expand Down
4 changes: 2 additions & 2 deletions catalog/config-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@
"desktop": {
"type": "boolean"
},
"multipartChecksums": {
"chunkedChecksums": {
"type": "boolean",
"description": "Whether to use multipart checksums when creating packages via the Catalog UI."
"description": "Whether to use chunked checksums when creating / modifying packages via the Catalog UI."
},
"build_version": {
"type": "string",
Expand Down
2 changes: 1 addition & 1 deletion catalog/config.json.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@
"analyticsBucket": "${ANALYTICS_BUCKET}",
"serviceBucket": "${SERVICE_BUCKET}",
"mode": "${CATALOG_MODE}",
"multipartChecksums": ${MULTIPART_CHECKSUMS}
"chunkedChecksums": ${CHUNKED_CHECKSUMS}
}
2 changes: 1 addition & 1 deletion docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Entries inside each section should be ordered by type:
## CLI

## Catalog, Lambdas
* [Added] Support multipart checksums ([#3403](https://github.com/quiltdata/quilt/pull/3403))
* [Added] Support chunked checksums ([#3403](https://github.com/quiltdata/quilt/pull/3403), [#3887](https://github.com/quiltdata/quilt/pull/3887))
* [Added] Search: Help link to ElasticSearch docs ([#3861](https://github.com/quiltdata/quilt/pull/3861))
* [Fixed] Faceted Search: show helpful message in case of search query syntax errors ([#3821](https://github.com/quiltdata/quilt/pull/3821))
* [Fixed] JsonEditor: fix changing collections items, that have `.additionalProperties` or `.items` JSON Schema ([#3860](https://github.com/quiltdata/quilt/pull/3860))
Expand Down
2 changes: 1 addition & 1 deletion lambdas/indexer/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ certifi==2023.7.22
cffi==1.15.1
chardet==3.0.4
charset-normalizer==2.0.12
cryptography==42.0.2
cryptography==42.0.4
decorator==4.4.0
docutils==0.15.2
# This is the highest version that's less than our pinned ES version of 6.7
Expand Down
1 change: 1 addition & 0 deletions lambdas/pkgpush/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ where verb is one of

## Changes

- [Changed] Speed-up copying of large files during promotion ([#3884](https://github.com/quiltdata/quilt/pull/3884))
- [Changed] Bump quilt3 to set max_pool_connections, this improves performance ([#3870](https://github.com/quiltdata/quilt/pull/3870))
- [Changed] Compute multipart checksums ([#3402](https://github.com/quiltdata/quilt/pull/3402))
- [Added] Bootstrap the change log ([#3402](https://github.com/quiltdata/quilt/pull/3402))
2 changes: 1 addition & 1 deletion lambdas/pkgpush/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ pyyaml==6.0.1
# via quilt3
quilt-shared[boto,pydantic,quilt] @ git+https://github.com/quiltdata/quilt@47055f7c5c0a93ddddfa5030a73b22a5d42b9c10#subdirectory=py-shared
# via t4_lambda_pkgpush (setup.py)
quilt3 @ git+https://github.com/quiltdata/quilt@299b1da851004386ab43423172c4405997fd9c53#subdirectory=api/python
quilt3 @ git+https://github.com/quiltdata/quilt@5c2b79128fe4d5d1e6093ff6a7d11d09d3315843#subdirectory=api/python
# via
# quilt-shared
# t4_lambda_pkgpush (setup.py)
Expand Down
2 changes: 1 addition & 1 deletion lambdas/pkgpush/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"pydantic ~= 1.10",
(
"quilt3 @ git+https://github.com/quiltdata/quilt@"
"299b1da851004386ab43423172c4405997fd9c53"
"5c2b79128fe4d5d1e6093ff6a7d11d09d3315843"
"#subdirectory=api/python"
),
(
Expand Down
Loading

0 comments on commit cf160ff

Please sign in to comment.