Catalog: adjust checksums-related constants and logic to adhere to the spec (#3887)
nl0 authored Feb 23, 2024
1 parent 47055f7 commit 84867b0
Showing 9 changed files with 47 additions and 35 deletions.
4 changes: 2 additions & 2 deletions catalog/app/containers/Bucket/PackageDialog/PackageDialog.tsx
@@ -34,8 +34,8 @@ import PACKAGE_EXISTS_QUERY from './gql/PackageExists.generated'
 export const MAX_UPLOAD_SIZE = 20 * 1000 * 1000 * 1000 // 20GB
 // XXX: keep in sync w/ the backend
 // NOTE: these limits are lower than the actual "hard" limits on the backend
-export const MAX_S3_SIZE = cfg.multipartChecksums
-  ? 10 * 10 ** 12 // 10 TB
+export const MAX_S3_SIZE = cfg.chunkedChecksums
+  ? 5 * 10 ** 12 // 5 TB
   : 50 * 10 ** 9 // 50 GB
 export const MAX_FILE_COUNT = 1000

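For context beyond the diff: S3 caps a single object at 5 TB and a multipart upload at 10,000 parts, so at the new ceiling each part works out to roughly 500 MB. A quick arithmetic sketch (the names below are illustrative, not from the codebase):

// Illustrative arithmetic only
const MAX_S3_OBJECT_SIZE = 5 * 10 ** 12 // 5 TB, S3's hard per-object limit
const MAX_PARTS = 10000 // maximum number of parts per multipart upload supported by S3
const partSizeAtCeiling = MAX_S3_OBJECT_SIZE / MAX_PARTS // 500_000_000 bytes, about 500 MB
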
6 changes: 3 additions & 3 deletions catalog/app/model/index.ts
@@ -48,10 +48,10 @@ export type Collaborators = ReadonlyArray<
 // Note that the actual user-defined meta is in the `user_meta` field
 export type EntryMeta = (Types.JsonRecord & { user_meta?: Types.JsonRecord }) | null
 
-export const CHECKSUM_TYPE_SP = 'SHA256' as const
-export const CHECKSUM_TYPE_MP = 'QuiltMultipartSHA256' as const
+export const CHECKSUM_TYPE_SHA256 = 'SHA256' as const
+export const CHECKSUM_TYPE_SHA256_CHUNKED = 'sha2-256-chunked' as const
 export interface Checksum {
-  type: typeof CHECKSUM_TYPE_SP | typeof CHECKSUM_TYPE_MP
+  type: typeof CHECKSUM_TYPE_SHA256 | typeof CHECKSUM_TYPE_SHA256_CHUNKED
   value: string
 }

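For reference, the two checksum shapes the catalog now produces look like this. The values are taken from the spec fixtures further down; the variable names and the import path are illustrative assumptions.

import * as Model from 'model'

// Plain single-pass SHA-256, hex-encoded (legacy behavior)
const plainChecksum: Model.Checksum = {
  type: Model.CHECKSUM_TYPE_SHA256, // 'SHA256'
  value: 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855', // empty file
}

// Chunked "hash of part hashes", base64-encoded
const chunkedChecksum: Model.Checksum = {
  type: Model.CHECKSUM_TYPE_SHA256_CHUNKED, // 'sha2-256-chunked'
  value: '47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=', // empty file
}
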
4 changes: 2 additions & 2 deletions catalog/app/utils/Config.ts
@@ -45,7 +45,7 @@ export interface ConfigJson
   ssoAuth: AuthMethodConfig
   ssoProviders: string
 
-  multipartChecksums?: boolean
+  chunkedChecksums?: boolean
 
   build_version?: string // not sure where this comes from
 }
@@ -93,7 +93,7 @@ const transformConfig = (cfg: ConfigJson) => ({
   noDownload: !!cfg.noDownload,
   noOverviewImages: !!cfg.noOverviewImages,
   desktop: !!cfg.desktop,
-  multipartChecksums: !!cfg.multipartChecksums,
+  chunkedChecksums: !!cfg.chunkedChecksums,
 })
 
 export function prepareConfig(input: unknown) {
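
One practical consequence of the `!!` coercion above, shown as a minimal hedged sketch rather than the catalog's actual wiring: a config.json that omits the new key resolves to `false`, so older deployments keep the legacy plain-SHA256 behavior.

// Sketch of the coercion; the object literal stands in for a parsed config.json
const legacyConfig: { chunkedChecksums?: boolean } = {} // no "chunkedChecksums" key
const resolved = !!legacyConfig.chunkedChecksums // false, so plain SHA256 hashing is used
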
15 changes: 11 additions & 4 deletions catalog/app/utils/checksums/checksums-legacy.spec.ts
@@ -7,13 +7,20 @@ import computeFileChecksumLimit from './checksums'
 jest.mock(
   'constants/config',
   jest.fn(() => ({
-    multipartChecksums: false,
+    chunkedChecksums: false,
   })),
 )
 
 describe('utils/checksums', () => {
-  describe('computeFileChecksumLimit, legacy and singlepart hashing', () => {
-    it('Hashes using singlepart', async () => {
+  describe('computeFileChecksumLimit, plain (legacy)', () => {
+    it('produces a correct checksum given an empty file', async () => {
+      const file = new File([], 'empty')
+      await expect(computeFileChecksumLimit(file)).resolves.toEqual({
+        value: 'e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855',
+        type: 'SHA256',
+      })
+    })
+    it('produces a correct checksum given a file with the size below the threshold', async () => {
       // Package manifest: https://open.quiltdata.com/b/allencell/tree/.quilt/packages/7acdd948d565d1f22c10f0d5ec4ae99742f04a4849c5e1498f252a0ac1ddeb04
       // File in that package: https://open.quiltdata.com/b/allencell/packages/aics/wtc11_short_read_genome_sequence/tree/7acdd948d565d1f22c10f0d5ec4ae99742f04a4849c5e1498f252a0ac1ddeb04/README.md
       const fileContents = '# This is a test package\n'
@@ -23,7 +30,7 @@ describe('utils/checksums', () => {
         type: 'SHA256',
       })
     })
-    it('Hashes >8Mb file using legacy algorithm', async () => {
+    it('produces a correct checksum given a file with the size above the threshold', async () => {
       // Package manifest: https://open.quiltdata.com/b/allencell/tree/.quilt/packages/38886848f1bad99396b96157101dd52520fa6aae0479adb9de4bde2b12997d92
       // File in that package: https://open.quiltdata.com/b/allencell/packages/aics/actk/tree/38886848f1bad99396b96157101dd52520fa6aae0479adb9de4bde2b12997d92/master/diagnosticsheets/diagnostic_sheets/ProteinDisplayName_Alpha-actinin-1_1.png
       const readFile = util.promisify(fs.readFile)
23 changes: 15 additions & 8 deletions catalog/app/utils/checksums/checksums.spec.ts
@@ -7,32 +7,39 @@ import computeFileChecksumLimit from './checksums'
 jest.mock(
   'constants/config',
   jest.fn(() => ({
-    multipartChecksums: true,
+    chunkedChecksums: true,
   })),
 )
 
 describe('utils/checksums', () => {
-  describe('computeFileChecksumLimit, multipart hashing', () => {
-    it('Hashes using singlepart', async () => {
+  describe('computeFileChecksumLimit, chunked', () => {
+    it('produces a correct checksum given an empty file', async () => {
+      const file = new File([], 'empty')
+      await expect(computeFileChecksumLimit(file)).resolves.toEqual({
+        value: '47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=',
+        type: 'sha2-256-chunked',
+      })
+    })
+    it('produces a correct checksum given a file with the size below the threshold', async () => {
       // Package manifest: https://open.quiltdata.com/b/allencell/tree/.quilt/packages/7acdd948d565d1f22c10f0d5ec4ae99742f04a4849c5e1498f252a0ac1ddeb04
       // File in that package: https://open.quiltdata.com/b/allencell/tree/aics/wtc11_short_read_genome_sequence/README.md?version=qt7oZnXdqJ0vokH1MpXksOiwgqOPPHV2
       const fileContents = '# This is a test package\n'
       const file = new File([Buffer.from(fileContents)], 'junk/s3.js')
       await expect(computeFileChecksumLimit(file)).resolves.toEqual({
-        value: 'e5c0b36103e96037f4892e73e457ad62753c1c5ad50c3bb0610fad268666f1ea',
-        type: 'SHA256',
+        value: 'JvRxeMunq4eK7c1I8s4YNg2ajaE9BtNEg/0pdpEGv58=',
+        type: 'sha2-256-chunked',
       })
     })
-    it('Hashes >8Mb file', async () => {
+    it('produces a correct checksum given a file with the size above the threshold', async () => {
       const readFile = util.promisify(fs.readFile)
       const contents = await readFile(path.join(__dirname, './checksums-11mb-test.png'))
       const file = new File(
         [contents],
         'master/diagnosticsheets/diagnostic_sheets/ProteinDisplayName_Alpha-actinin-1_1.png',
       )
       await expect(computeFileChecksumLimit(file)).resolves.toEqual({
-        value: '7y6ba70KDRATjq9HezYMTKLluvYXP5g+CSJL9EwnjiY=-2',
-        type: 'QuiltMultipartSHA256',
+        value: '7y6ba70KDRATjq9HezYMTKLluvYXP5g+CSJL9EwnjiY=',
+        type: 'sha2-256-chunked',
       })
     })
   })
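
Worth noting about the fixtures above: with chunkedChecksums enabled, a file that fits in a single part still goes through the hash-of-hashes step, which is why the small README fixture is a base64 value rather than the hex SHA-256 seen in the legacy spec. A minimal sketch of that equivalence, using Node's crypto module outside the catalog code:

import { createHash } from 'crypto'

// sha2-256-chunked of a single-part file = SHA-256 of its one part digest, base64-encoded
const partDigest = createHash('sha256').update('# This is a test package\n').digest()
const chunked = createHash('sha256').update(partDigest).digest('base64')
// expected to match the fixture above: 'JvRxeMunq4eK7c1I8s4YNg2ajaE9BtNEg/0pdpEGv58='
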
22 changes: 10 additions & 12 deletions catalog/app/utils/checksums/checksums.ts
@@ -10,7 +10,7 @@ const MAX_PARTS = 10000 // Maximum number of parts per upload supported by S3
 
 export function getPartSize(fileSize: number): number | null {
   // use single-part upload (and plain SHA256 hash)
-  if (!cfg.multipartChecksums || fileSize < MIN_PART_SIZE) return null
+  if (fileSize < MIN_PART_SIZE) return null
 
   // NOTE: in the case where fileSize is exactly equal to MIN_PART_SIZE
   // boto creates a 1-part multipart upload :shrug:
@@ -25,14 +25,14 @@ return partSize
   return partSize
 }
 
-const singlepart = (value: ArrayBuffer): Model.Checksum => ({
+const plain = (value: ArrayBuffer): Model.Checksum => ({
   value: Buffer.from(value).toString('hex'),
-  type: Model.CHECKSUM_TYPE_SP,
+  type: Model.CHECKSUM_TYPE_SHA256,
 })
 
-const multipart = (value: ArrayBuffer, partCount: number): Model.Checksum => ({
-  value: `${Buffer.from(value).toString('base64')}-${partCount}`,
-  type: Model.CHECKSUM_TYPE_MP,
+const chunked = (value: ArrayBuffer): Model.Checksum => ({
+  value: Buffer.from(value).toString('base64'),
+  type: Model.CHECKSUM_TYPE_SHA256_CHUNKED,
 })
 
 function mergeBuffers(buffers: ArrayBuffer[]) {
@@ -60,13 +60,11 @@ const hashBlobLimit = (blob: Blob) => blobLimit(hashBlob, blob)
 async function computeFileChecksum(f: File): Promise<Model.Checksum> {
   if (!crypto?.subtle?.digest) throw new Error('Crypto API unavailable')
 
-  const partSize = getPartSize(f.size)
+  if (!cfg.chunkedChecksums) return plain(await hashBlobLimit(f))
 
-  // single part
-  if (partSize === null) return singlepart(await hashBlobLimit(f))
-
-  // multipart
+  const partSize = getPartSize(f.size) ?? f.size
   const parts: Blob[] = []
+
   let offset = 0
   while (offset < f.size) {
     const end = Math.min(offset + partSize, f.size)
@@ -76,7 +74,7 @@
 
   const checksums = await Promise.all(parts.map(hashBlobLimit))
   const value = await crypto.subtle.digest('SHA-256', mergeBuffers(checksums))
-  return multipart(value, parts.length)
+  return chunked(value)
 }
 
 const computeFileChecksumLimit = (f: File) => fileLimit(computeFileChecksum, f)
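
To summarize what computeFileChecksum now does when chunkedChecksums is on, here is a minimal standalone sketch of the sha2-256-chunked algorithm using Node's crypto module. It uses a fixed part size (assumed 8 MiB, per the ">8Mb" spec fixtures), whereas the real code derives the part size via getPartSize so the part count stays within MAX_PARTS.

import { createHash } from 'crypto'

const MIN_PART_SIZE = 8 * 1024 * 1024 // assumed 8 MiB threshold

function chunkedChecksum(data: Buffer, partSize = MIN_PART_SIZE): string {
  // Hash each fixed-size part individually...
  const partDigests: Buffer[] = []
  for (let offset = 0; offset < data.length; offset += partSize) {
    const part = data.subarray(offset, Math.min(offset + partSize, data.length))
    partDigests.push(createHash('sha256').update(part).digest())
  }
  // ...then hash the concatenated part digests and base64-encode the result.
  // An empty input yields zero parts, so the result is the SHA-256 of an empty buffer.
  return createHash('sha256').update(Buffer.concat(partDigests)).digest('base64')
}

// e.g. chunkedChecksum(Buffer.alloc(0)) should equal
// '47DEQpj8HBSa+/TImW+5JCeuQeRkm5NMpJWZG3hSuFU=' (the empty-file fixture above)
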
4 changes: 2 additions & 2 deletions catalog/config-schema.json
@@ -89,9 +89,9 @@
     "desktop": {
       "type": "boolean"
     },
-    "multipartChecksums": {
+    "chunkedChecksums": {
       "type": "boolean",
-      "description": "Whether to use multipart checksums when creating packages via the Catalog UI."
+      "description": "Whether to use chunked checksums when creating / modifying packages via the Catalog UI."
     },
     "build_version": {
       "type": "string",
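
One detail implied by the schema entry above and the template below: CHUNKED_CHECKSUMS must expand to a bare true/false, since the template does not quote it and a quoted value would be a string rather than a boolean. A hedged illustration of the distinction the schema enforces:

// Illustrative only; the objects stand in for a parsed config.json
const valid = { chunkedChecksums: true } // boolean, satisfies "type": "boolean"
const invalid = { chunkedChecksums: 'true' } // string, rejected by the schema
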
2 changes: 1 addition & 1 deletion catalog/config.json.tmpl
@@ -14,5 +14,5 @@
   "analyticsBucket": "${ANALYTICS_BUCKET}",
   "serviceBucket": "${SERVICE_BUCKET}",
   "mode": "${CATALOG_MODE}",
-  "multipartChecksums": ${MULTIPART_CHECKSUMS}
+  "chunkedChecksums": ${CHUNKED_CHECKSUMS}
 }
2 changes: 1 addition & 1 deletion docs/CHANGELOG.md
@@ -23,7 +23,7 @@ Entries inside each section should be ordered by type:
 ## CLI
 
 ## Catalog, Lambdas
-* [Added] Support multipart checksums ([#3403](https://github.com/quiltdata/quilt/pull/3403))
+* [Added] Support chunked checksums ([#3403](https://github.com/quiltdata/quilt/pull/3403), [#3887](https://github.com/quiltdata/quilt/pull/3887))
 * [Added] Search: Help link to ElasticSearch docs ([#3861](https://github.com/quiltdata/quilt/pull/3861))
 * [Fixed] Faceted Search: show helpful message in case of search query syntax errors ([#3821](https://github.com/quiltdata/quilt/pull/3821))
 * [Fixed] JsonEditor: fix changing collections items, that have `.additionalProperties` or `.items` JSON Schema ([#3860](https://github.com/quiltdata/quilt/pull/3860))
