diff --git a/FileAPI/file/send-file-form-iso-2022-jp.tentative.html b/FileAPI/file/send-file-form-iso-2022-jp.tentative.html new file mode 100644 index 0000000000..421de30129 --- /dev/null +++ b/FileAPI/file/send-file-form-iso-2022-jp.tentative.html @@ -0,0 +1,71 @@ + + +Upload files in ISO-2022-JP form (tentative) + + + + + + + + + + diff --git a/FileAPI/file/send-file-form-utf-8.html b/FileAPI/file/send-file-form-utf-8.html new file mode 100644 index 0000000000..03417ba72e --- /dev/null +++ b/FileAPI/file/send-file-form-utf-8.html @@ -0,0 +1,61 @@ + + +Upload files in UTF-8 form + + + + + + + + diff --git a/FileAPI/file/send-file-form-windows-1252.tentative.html b/FileAPI/file/send-file-form-windows-1252.tentative.html new file mode 100644 index 0000000000..8e9463f83a --- /dev/null +++ b/FileAPI/file/send-file-form-windows-1252.tentative.html @@ -0,0 +1,70 @@ + + +Upload files in Windows-1252 form (tentative) + + + + + + + + + + diff --git a/FileAPI/file/send-file-form-x-user-defined.tentative.html b/FileAPI/file/send-file-form-x-user-defined.tentative.html new file mode 100644 index 0000000000..072e3bb1e0 --- /dev/null +++ b/FileAPI/file/send-file-form-x-user-defined.tentative.html @@ -0,0 +1,70 @@ + + +Upload files in x-user-defined form (tentative) + + + + + + + + + + diff --git a/FileAPI/file/send-file-form.html b/FileAPI/file/send-file-form.html new file mode 100644 index 0000000000..baa8d4286c --- /dev/null +++ b/FileAPI/file/send-file-form.html @@ -0,0 +1,25 @@ + + +Upload ASCII-named file in UTF-8 form + + + + + + + + diff --git a/FileAPI/support/send-file-form-helper.js b/FileAPI/support/send-file-form-helper.js new file mode 100644 index 0000000000..a7522c7b08 --- /dev/null +++ b/FileAPI/support/send-file-form-helper.js @@ -0,0 +1,249 @@ +'use strict'; + +// Rationale for this particular test character sequence, which is +// used in filenames and also in file contents: +// +// - ABC~ ensures the string starts with something we can read to +// ensure it is from the correct source; ~ is used because even +// some 1-byte otherwise-ASCII-like parts of ISO-2022-JP +// interpret it differently. +// - ‾¥ are inside a single-byte range of ISO-2022-JP and help +// diagnose problems due to filesystem encoding or locale +// - ≈ is inside IBM437 and helps diagnose problems due to filesystem +// encoding or locale +// - ¤ is inside Latin-1 and helps diagnose problems due to +// filesystem encoding or locale; it is also the "simplest" case +// needing substitution in ISO-2022-JP +// - ・ is inside a single-byte range of ISO-2022-JP in some variants +// and helps diagnose problems due to filesystem encoding or locale; +// on the web it is distinct when decoding but unified when encoding +// - ・ is inside a double-byte range of ISO-2022-JP and helps +// diagnose problems due to filesystem encoding or locale +// - • is inside Windows-1252 and helps diagnose problems due to +// filesystem encoding or locale and also ensures these aren't +// accidentally turned into e.g. control codes +// - ∙ is inside IBM437 and helps diagnose problems due to filesystem +// encoding or locale +// - · is inside Latin-1 and helps diagnose problems due to +// filesystem encoding or locale and also ensures HTML named +// character references (e.g. ·) are not used +// - ☼ is inside IBM437 shadowing C0 and helps diagnose problems due to +// filesystem encoding or locale and also ensures these aren't +// accidentally turned into e.g. control codes +// - ★ is inside ISO-2022-JP on a non-Kanji page and makes correct +// output easier to spot +// - 星 is inside ISO-2022-JP on a Kanji page and makes correct +// output easier to spot +// - 🌟 is outside the BMP and makes incorrect surrogate pair +// substitution detectable and ensures substitutions work +// correctly immediately after Kanji 2-byte ISO-2022-JP +// - 星 repeated here ensures the correct codec state is used +// after a non-BMP substitution +// - ★ repeated here also makes correct output easier to spot +// - ☼ is inside IBM437 shadowing C0 and helps diagnose problems due to +// filesystem encoding or locale and also ensures these aren't +// accidentally turned into e.g. control codes and also ensures +// substitutions work correctly immediately after non-Kanji +// 2-byte ISO-2022-JP +// - · is inside Latin-1 and helps diagnose problems due to +// filesystem encoding or locale and also ensures HTML named +// character references (e.g. ·) are not used +// - ∙ is inside IBM437 and helps diagnose problems due to filesystem +// encoding or locale +// - • is inside Windows-1252 and again helps diagnose problems +// due to filesystem encoding or locale +// - ・ is inside a double-byte range of ISO-2022-JP and helps +// diagnose problems due to filesystem encoding or locale +// - ・ is inside a single-byte range of ISO-2022-JP in some variants +// and helps diagnose problems due to filesystem encoding or locale; +// on the web it is distinct when decoding but unified when encoding +// - ¤ is inside Latin-1 and helps diagnose problems due to +// filesystem encoding or locale; again it is a "simple" +// substitution case +// - ≈ is inside IBM437 and helps diagnose problems due to filesystem +// encoding or locale +// - ¥‾ are inside a single-byte range of ISO-2022-JP and help +// diagnose problems due to filesystem encoding or locale +// - ~XYZ ensures earlier errors don't lead to misencoding of +// simple ASCII +// +// Overall the near-symmetry makes common I18N mistakes like +// off-by-1-after-non-BMP easier to spot. All the characters +// are also allowed in Windows Unicode filenames. +const kTestChars = 'ABC~‾¥≈¤・・•∙·☼★星🌟星★☼·∙•・・¤≈¥‾~XYZ'; + +// NOTE: The expected interpretation of ISO-2022-JP according to +// https://encoding.spec.whatwg.org/#iso-2022-jp-encoder unifies +// single-byte and double-byte katakana. +const kTestFallbackIso2022jp = + ('ABC~\x1B(J~\\≈¤\x1B$B!&!&\x1B(B•∙·☼\x1B$B!z@1\x1B(B🌟' + + '\x1B$B@1!z\x1B(B☼·∙•\x1B$B!&!&\x1B(B¤≈\x1B(J\\~\x1B(B~XYZ').replace( + /[^\0-\x7F]/gu, + x => `&#${x.codePointAt(0)};`); + +// NOTE: \uFFFD is used here to replace Windows-1252 bytes to match +// how we will see them in the reflected POST bytes in a frame using +// UTF-8 byte interpretation. The bytes will actually be intact, but +// this code cannot tell and does not really care. +const kTestFallbackWindows1252 = + 'ABC~‾\xA5≈\xA4・・\x95∙\xB7☼★星🌟星★☼\xB7∙\x95・・\xA4≈\xA5‾~XYZ'.replace( + /[^\0-\xFF]/gu, + x => `&#${x.codePointAt(0)};`).replace(/[\x80-\xFF]/g, '\uFFFD'); + +const kTestFallbackXUserDefined = + kTestChars.replace(/[^\0-\x7F]/gu, x => `&#${x.codePointAt(0)};`); + +// formPostFileUploadTest - verifies multipart upload structure and +// numeric character reference replacement for filenames, field names, +// and field values. +// +// Uses /fetch/api/resources/echo-content.py to echo the upload +// POST with UTF-8 byte interpretation, leading to the "UTF-8 goggles" +// behavior documented below for expectedEncodedBaseName when non- +// UTF-8-compatible byte sequences appear in the formEncoding-encoded +// uploaded data. +// +// Fields in the parameter object: +// +// - fileNameSource: purely explanatory and gives a clue about which +// character encoding is the source for the non-7-bit-ASCII parts of +// the fileBaseName, or Unicode if no smaller-than-Unicode source +// contains all the characters. Used in the test name. +// - fileBaseName: the not-necessarily-just-7-bit-ASCII file basename +// used for the constructed test file. Used in the test name. +// - formEncoding: the acceptCharset of the form used to submit the +// test file. Used in the test name. +// - expectedEncodedBaseName: the expected formEncoding-encoded +// version of fileBaseName with unencodable characters replaced by +// numeric character references and non-7-bit-ASCII bytes seen +// through UTF-8 goggles; subsequences not interpretable as UTF-8 +// have each byte represented here by \uFFFD REPLACEMENT CHARACTER. +const formPostFileUploadTest = ({ + fileNameSource, + fileBaseName, + formEncoding, + expectedEncodedBaseName, +}) => { + promise_test(async testCase => { + + if (document.readyState !== 'complete') { + await new Promise(resolve => addEventListener('load', resolve)); + } + + const formTargetFrame = Object.assign(document.createElement('iframe'), { + name: 'formtargetframe', + }); + document.body.append(formTargetFrame); + testCase.add_cleanup(() => { + document.body.removeChild(formTargetFrame); + }); + + const form = Object.assign(document.createElement('form'), { + acceptCharset: formEncoding, + action: '/fetch/api/resources/echo-content.py', + method: 'POST', + enctype: 'multipart/form-data', + target: formTargetFrame.name, + }); + document.body.append(form); + testCase.add_cleanup(() => { + document.body.removeChild(form); + }); + + // Used to verify that the browser agrees with the test about + // which form charset is used. + form.append(Object.assign(document.createElement('input'), { + type: 'hidden', + name: '_charset_', + })); + + // Used to verify that the browser agrees with the test about + // field value replacement and encoding independently of file system + // idiosyncracies. + form.append(Object.assign(document.createElement('input'), { + type: 'hidden', + name: 'filename', + value: fileBaseName, + })); + + // Same, but with name and value reversed to ensure field names + // get the same treatment. + form.append(Object.assign(document.createElement('input'), { + type: 'hidden', + name: fileBaseName, + value: 'filename', + })); + + const fileInput = Object.assign(document.createElement('input'), { + type: 'file', + name: 'file', + }); + form.append(fileInput); + + // Removes c:\fakepath\ or other pseudofolder and returns just the + // final component of filePath; allows both / and \ as segment + // delimiters. + const baseNameOfFilePath = filePath => filePath.split(/[\/\\]/).pop(); + await new Promise(resolve => { + const dataTransfer = new DataTransfer; + dataTransfer.items.add( + new File([kTestChars], fileBaseName, {type: 'text/plain'})); + fileInput.files = dataTransfer.files; + // For historical reasons .value will be prefixed with + // c:\fakepath\, but the basename should match the file name + // exposed through the newer .files[0].name API. This check + // verifies that assumption. + assert_equals( + fileInput.files[0].name, + baseNameOfFilePath(fileInput.value), + `The basename of the field's value should match its files[0].name`); + form.submit(); + formTargetFrame.onload = resolve; + }); + + const formDataText = formTargetFrame.contentDocument.body.textContent; + const formDataLines = formDataText.split('\n'); + if (formDataLines.length && !formDataLines[formDataLines.length - 1]) { + --formDataLines.length; + } + assert_greater_than( + formDataLines.length, + 2, + `${fileBaseName}: multipart form data must have at least 3 lines: ${ + JSON.stringify(formDataText) + }`); + const boundary = formDataLines[0]; + assert_equals( + formDataLines[formDataLines.length - 1], + boundary + '--', + `${fileBaseName}: multipart form data must end with ${boundary}--: ${ + JSON.stringify(formDataText) + }`); + const expectedText = [ + boundary, + 'Content-Disposition: form-data; name="_charset_"', + '', + formEncoding, + boundary, + 'Content-Disposition: form-data; name="filename"', + '', + expectedEncodedBaseName, + boundary, + `Content-Disposition: form-data; name="${expectedEncodedBaseName}"`, + '', + 'filename', + boundary, + `Content-Disposition: form-data; name="file"; ` + + `filename="${expectedEncodedBaseName}"`, + 'Content-Type: text/plain', + '', + kTestChars, + boundary + '--', + ].join('\n'); + assert_true( + formDataText.startsWith(expectedText), + `Unexpected multipart-shaped form data received:\n${ + formDataText + }\nExpected:\n${expectedText}`); + }, `Upload ${fileBaseName} (${fileNameSource}) in ${formEncoding} form`); +};