diff --git a/java/dev/enola/common/io/iri/URIs.java b/java/dev/enola/common/io/iri/URIs.java index e226abba..3cf055de 100644 --- a/java/dev/enola/common/io/iri/URIs.java +++ b/java/dev/enola/common/io/iri/URIs.java @@ -33,6 +33,7 @@ import java.net.URISyntaxException; import java.net.URL; import java.net.URLEncoder; +import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.nio.file.FileSystems; import java.nio.file.Path; @@ -72,10 +73,25 @@ public static MediaTypeAndOrCharset getMediaTypeAndCharset(URI uri) { return new MediaTypeAndOrCharset(mediaTypeParameter, charsetParameter); } + public static @Nullable String getCharset(URI uri) { + var mediaTypeAndCharset = getMediaTypeAndCharset(uri); + if (mediaTypeAndCharset.charset != null) return mediaTypeAndCharset.charset; + if (mediaTypeAndCharset.mediaType != null) { + var optMediaTypeCharset = MediaType.parse(mediaTypeAndCharset.mediaType); + if (optMediaTypeCharset.charset().isPresent()) + return optMediaTypeCharset.charset().get().toString(); + } + return null; + } + public static URI addMediaType(URI uri, MediaType mediaType) { return addQueryParameter(uri, MEDIA_TYPE, mediaType.toString().replace(" ", "")); } + public static URI addCharset(URI uri, Charset charset) { + return addQueryParameter(uri, CHARSET, charset.toString()); + } + public static URI addQuery(URI uri, Map parameters) { for (var parameter : parameters.entrySet()) { uri = addQueryParameter(uri, parameter.getKey(), parameter.getValue()); @@ -105,13 +121,12 @@ private static URI addQueryParameter(URI uri, String key, String value) { String connector; if (uri.getQuery() != null && uri.getQuery().contains(key)) { return uri; - } else if (uri.getQuery() == null) { - if (!uri.getSchemeSpecificPart().contains("?")) { - connector = "?"; - } else { - // Special case of "scheme:?"-like URIs with "empty" query - connector = ""; - } + } else if (uri.getQuery() == null && !uri.getSchemeSpecificPart().contains("?")) { + connector = "?"; + } else if (uri.getQuery() == null && uri.getSchemeSpecificPart().equals("?")) { + // Handle special (but technically invalid cases) of "scheme:?" URIs with "empty" query + // This is currently used by EmptyResource.EMPTY_URI (but may be changed later) + connector = ""; } else { connector = "&"; } diff --git a/java/dev/enola/common/io/iri/URIsTest.java b/java/dev/enola/common/io/iri/URIsTest.java index 8e4bf80e..17a7f08b 100644 --- a/java/dev/enola/common/io/iri/URIsTest.java +++ b/java/dev/enola/common/io/iri/URIsTest.java @@ -22,6 +22,7 @@ import static org.junit.Assert.assertThrows; import static java.net.URI.create; +import static java.nio.charset.StandardCharsets.UTF_8; import com.google.common.collect.ImmutableMap; import com.google.common.io.Resources; @@ -112,6 +113,20 @@ public void testAddQuery() { .isEqualTo("http://host/path?arg1=a&arg2=b&arg3=c"); } + @Test + public void testAddCharsetQueryParameter() { + assertThat(URIs.addCharset(URI.create("fd:1?mediaType=application/yaml"), UTF_8)) + .isEqualTo(URI.create("fd:1?mediaType=application/yaml&charset=UTF-8")); + } + + @Test + public void testGetCharsetFromMediaTypeOrCharsetQueryParameter() { + assertThat(URIs.getCharset(URI.create("fd:1?mediaType=application/yaml&charset=UTF-8"))) + .isEqualTo("UTF-8"); + assertThat(URIs.getCharset(URI.create("fd:1?mediaType=application/yaml;charset=UTF-8"))) + .isEqualTo("UTF-8"); + } + @Test public void testAddQueryGivenOriginalUriWithQuery() { var uri1 = URI.create("http://host/pathX"); diff --git a/java/dev/enola/common/io/mediatype/MarkdownMediaTypes.java b/java/dev/enola/common/io/mediatype/MarkdownMediaTypes.java index cc4b7ba1..dc7a4f1f 100644 --- a/java/dev/enola/common/io/mediatype/MarkdownMediaTypes.java +++ b/java/dev/enola/common/io/mediatype/MarkdownMediaTypes.java @@ -25,7 +25,6 @@ import com.google.common.collect.Multimap; import com.google.common.net.MediaType; -import java.nio.charset.StandardCharsets; import java.util.Map; import java.util.Set; @@ -40,26 +39,20 @@ public class MarkdownMediaTypes implements MediaTypeProvider { private static final String VARIANT = "variant"; - public static final MediaType MARKDOWN_UTF_8 = - create("text", "markdown").withCharset(StandardCharsets.UTF_8); + public static final MediaType MARKDOWN_UTF_8 = MediaType.MD_UTF_8; /** * CommonMark * variant, see CommonMark.org. */ public static final MediaType COMMON_MARKDOWN_UTF_8 = - create("text", "markdown") - .withCharset(StandardCharsets.UTF_8) - .withParameter(VARIANT, "CommonMark"); + MARKDOWN_UTF_8.withParameter(VARIANT, "CommonMark"); /** * GitHub Flavored Markdown * (GFM). */ - public static final MediaType GFM_MARKDOWN_UTF_8 = - create("text", "markdown") - .withCharset(StandardCharsets.UTF_8) - .withParameter(VARIANT, "GFM"); + public static final MediaType GFM_MARKDOWN_UTF_8 = MARKDOWN_UTF_8.withParameter(VARIANT, "GFM"); @Override public Multimap extensionsToTypes() { diff --git a/java/dev/enola/common/io/mediatype/ResourceCharsetDetectorSPI.java b/java/dev/enola/common/io/mediatype/ResourceCharsetDetectorSPI.java index e11d061e..705596d0 100644 --- a/java/dev/enola/common/io/mediatype/ResourceCharsetDetectorSPI.java +++ b/java/dev/enola/common/io/mediatype/ResourceCharsetDetectorSPI.java @@ -27,6 +27,7 @@ import java.nio.charset.Charset; import java.util.Optional; +// TODO Increase test coverage... (this currently only used by YamlMediaType) public abstract class ResourceCharsetDetectorSPI implements ResourceCharsetDetector { private static final Logger LOG = LoggerFactory.getLogger(ResourceCharsetDetectorSPI.class); diff --git a/java/dev/enola/common/io/resource/BaseResource.java b/java/dev/enola/common/io/resource/BaseResource.java index d43941e9..15c13524 100644 --- a/java/dev/enola/common/io/resource/BaseResource.java +++ b/java/dev/enola/common/io/resource/BaseResource.java @@ -39,14 +39,16 @@ protected BaseResource(URI uri) { } protected BaseResource(URI uri, ByteSource byteSource) { - this(uri, mtd.detect(uri, byteSource)); + this.uri = requireNonNull(uri, "uri"); + this.mediaType = mtd.detect(uri, byteSource); } protected BaseResource(URI uri, MediaType mediaType) { this.uri = requireNonNull(uri, "uri"); - this.mediaType = mtd.overwrite(uri, requireNonNull(mediaType, "mediaType")); + this.mediaType = mtd.adjustCharset(uri, requireNonNull(mediaType, "mediaType")); } + // TODO Remove this weird "hacked" constructor with "fake" fixedMediaType parameter... protected BaseResource(URI uri, MediaType mediaType, boolean fixedMediaType) { this.uri = requireNonNull(uri, "uri"); this.mediaType = requireNonNull(mediaType, "mediaType"); diff --git a/java/dev/enola/common/io/resource/EmptyResource.java b/java/dev/enola/common/io/resource/EmptyResource.java index 86c69c4a..40a15da5 100644 --- a/java/dev/enola/common/io/resource/EmptyResource.java +++ b/java/dev/enola/common/io/resource/EmptyResource.java @@ -44,7 +44,7 @@ public Resource getResource(URI uri) { } static final String SCHEME = "empty"; - public static final URI EMPTY_URI = URI.create(SCHEME + ":?"); // Maybe :/ is better? + public static final URI EMPTY_URI = URI.create(SCHEME + ":?"); // TODO :/ is probably better? public static final EmptyResource INSTANCE = new EmptyResource(EMPTY_URI); public EmptyResource(URI uri) { diff --git a/java/dev/enola/common/io/resource/FileDescriptorResource.java b/java/dev/enola/common/io/resource/FileDescriptorResource.java index cb59c7cd..2602b61a 100644 --- a/java/dev/enola/common/io/resource/FileDescriptorResource.java +++ b/java/dev/enola/common/io/resource/FileDescriptorResource.java @@ -19,7 +19,6 @@ import com.google.common.io.ByteSink; import com.google.common.io.ByteSource; -import com.google.common.net.MediaType; import dev.enola.common.io.iri.URIs; @@ -33,8 +32,16 @@ import java.nio.charset.Charset; /** - * Resource for URIs "fd:0?charset=ASCII" (STDIN), "fd:1?charset=UTF-8" (STDOUT), - * "fd:2?charset=UTF-16BE" STDERR. Implementation uses {@link FileDescriptor}. + * Resource for "fd:0" (STDIN), "fd:1" (STDOUT), "fd:2" (STDERR) URIs. + * + *

The {@link #mediaType()} will be {@link com.google.common.net.MediaType#OCTET_STREAM} (NOT + * {@link com.google.common.net.MediaType#APPLICATION_BINARY}!), unless there is a ?mediaType= query + * parameter in the URI argument. + * + *

The Charset of that Media Type will be the {@link Charset#defaultCharset()}, unless there is + * either (check first) a ?charset= query parameter in the URI argument (e.g. "fd:0?charset=ASCII", + * or "fd:1?charset=UTF-8", or "fd:2?charset=UTF-16BE") or the ?mediaType= query parameter includes + * a charset (e.g. "fd:1?mediaType=application/yaml;charset=utf-16be"). */ public class FileDescriptorResource extends BaseResource implements Resource { @@ -54,8 +61,7 @@ public Resource getResource(URI uri) { private final FileDescriptor fileDescriptor; public FileDescriptorResource(URI uri) { - // Default to Text in Platform's (Terminal) default charset - super(uri, MediaType.PLAIN_TEXT_UTF_8.withCharset(Charset.defaultCharset())); + super(addDefaultCharsetIfNone(uri)); if (!"fd".equals(uri.getScheme())) { throw new IllegalArgumentException(uri.toString()); @@ -73,6 +79,11 @@ public FileDescriptorResource(URI uri) { } } + private static URI addDefaultCharsetIfNone(URI uri) { + if (URIs.getCharset(uri) == null) return URIs.addCharset(uri, Charset.defaultCharset()); + else return uri; + } + @Override public ByteSink byteSink() { return new ByteSink() { diff --git a/java/dev/enola/common/io/resource/FileDescriptorResourceTest.java b/java/dev/enola/common/io/resource/FileDescriptorResourceTest.java index c8975a03..856f48ae 100644 --- a/java/dev/enola/common/io/resource/FileDescriptorResourceTest.java +++ b/java/dev/enola/common/io/resource/FileDescriptorResourceTest.java @@ -17,6 +17,7 @@ */ package dev.enola.common.io.resource; +import static com.google.common.net.MediaType.OCTET_STREAM; import static com.google.common.truth.Truth.assertThat; import static dev.enola.common.context.testlib.SingletonRule.$; @@ -40,11 +41,12 @@ public class FileDescriptorResourceTest { public @Rule SingletonRule r = $(MediaTypeProviders.set()); @Test - public void testSTDOUTwithoutCharset() throws IOException { + public void testSTDOUTwithoutCharsetNorMediaType() throws IOException { var FD1 = new FileDescriptorResource(URI.create("fd:1")); FD1.byteSink().write(new byte[] {1, 2, 3}); - FD1.charSink().write("hello"); assertThat(FD1.mediaType().charset()).hasValue(Charset.defaultCharset()); + assertThat(FD1.mediaType().withoutParameters()).isEqualTo(OCTET_STREAM.withoutParameters()); + FD1.charSink().write("hello"); } @Test diff --git a/java/dev/enola/common/io/resource/MediaTypeDetector.java b/java/dev/enola/common/io/resource/MediaTypeDetector.java index 12885855..14c1e5cb 100644 --- a/java/dev/enola/common/io/resource/MediaTypeDetector.java +++ b/java/dev/enola/common/io/resource/MediaTypeDetector.java @@ -75,23 +75,28 @@ MediaType detect(URI uri, ByteSource byteSource) { var mediaTypeCharset = URIs.getMediaTypeAndCharset(uri); var detected = detect(mediaTypeCharset.mediaType(), mediaTypeCharset.charset()); detected = detectCharsetAndMediaType(uri, byteSource, detected); + + detected = overwrite(uri, detected); + return detected; } /** * This is called by Resource* implementation constructors via {@link BaseResource}, if there is - * (only) an URI and a server proposed MediaType already. In this case, there is no point in - * also considering a ByteSource. This uses a number of hard-coded built-in heuristics to ignore - * certain server proposals and try to find a "better" one based on the URI. + * (only) an URI and already a MediaType proposed either by a server, or from {@link + * #detect(URI, ByteSource)}. In this case, there is no point in also considering a ByteSource. + * This uses a number of hard-coded built-in heuristics to ignore certain server proposals and + * try to find a "better" one based on the URI. */ - MediaType overwrite(URI uri, final MediaType serverProposedMediaType) { - var mediaType = serverProposedMediaType; + MediaType overwrite(URI uri, final MediaType proposedMediaType) { + var mediaType = proposedMediaType; // TODO Move this into UrlResource if "content/unknown" is hard-coded in URLConnection? if (IGNORE.contains(mediaType.withoutParameters())) { mediaType = MediaType.OCTET_STREAM; } + // TODO Reuse #adjust() ? var uriCharset = URIs.getMediaTypeAndCharset(uri); var uriMediaType = uriCharset.mediaType(); if (uriMediaType != null) mediaType = MediaType.parse(uriMediaType); @@ -99,8 +104,8 @@ MediaType overwrite(URI uri, final MediaType serverProposedMediaType) { var cs = uriCharset.charset(); if (cs != null) mediaType = mediaType.withCharset(Charset.forName(cs)); else { - if (!mediaType.charset().isPresent() && serverProposedMediaType.charset().isPresent()) { - mediaType = mediaType.withCharset(serverProposedMediaType.charset().get()); + if (!mediaType.charset().isPresent() && proposedMediaType.charset().isPresent()) { + mediaType = mediaType.withCharset(proposedMediaType.charset().get()); } else { mediaType = detectCharsetAndMediaType(uri, ByteSource.empty(), mediaType); } @@ -109,6 +114,19 @@ MediaType overwrite(URI uri, final MediaType serverProposedMediaType) { return mediaType; } + /** + * This is called by Resource* implementation constructors via {@link BaseResource}, if there is + * (only) an URI and a "client" (caller of API) requested MediaType. It generally "respects" + * (honors) the clients' wish, does not use any magic heuristics, and only adjusts (only) the it + * based on query parameters in the URI (if present). + */ + MediaType adjustCharset(URI uri, final MediaType clientRequestedMediaType) { + var charsetParameter = URIs.getCharset(uri); + if (charsetParameter != null) + return clientRequestedMediaType.withCharset(Charset.forName(charsetParameter)); + else return clientRequestedMediaType; + } + private MediaType detectCharsetAndMediaType( URI uri, ByteSource byteSource, MediaType detected) { if (!detected.charset().isPresent()) { diff --git a/java/dev/enola/common/io/resource/MediaTypeDetectorTest.java b/java/dev/enola/common/io/resource/MediaTypeDetectorTest.java index 620f8ee1..05022fed 100644 --- a/java/dev/enola/common/io/resource/MediaTypeDetectorTest.java +++ b/java/dev/enola/common/io/resource/MediaTypeDetectorTest.java @@ -38,7 +38,6 @@ import org.junit.Test; import java.io.File; -import java.io.IOException; import java.net.URI; import java.nio.charset.StandardCharsets; @@ -115,29 +114,19 @@ public void testTextproto() { } @Test - public void testTextYAML() { - // A text/plain with *.yaml is still application/yaml - var r = - new EmptyResource( - create("http://server/hello.yaml"), - PLAIN_TEXT_UTF_8.withoutParameters()); // drop charset! - + public void testYAML() { + var r = new EmptyResource(create("http://server/hello.yaml")); assertThat(r.mediaType()).isEqualTo(YAML_UTF_8); } @Test - public void testEmptyYAML() { - // Empty .YAML is UTF-8 - var r = new EmptyResource(YamlMediaType.YAML_UTF_8.withoutParameters()); // drop charset! - assertThat(r.mediaType()).isEqualTo(YAML_UTF_8); - } - - @Test - public void testNoHeaderYAML() throws IOException { - // A .YAML without header and just some ASCII is still UTF-8 - var text = "hello: world"; - var r = new MemoryResource(YamlMediaType.YAML_UTF_8.withoutParameters()); // drop charset! - r.byteSink().write(text.getBytes(StandardCharsets.US_ASCII)); + @Ignore // TODO This is an invalid test... rewrite it elsewhere. + // If a caller of an EmptyResource constructor says its TEXT, then it is that! + // What this meant to test is that if a HTTP server says something is TEXT, then + // that may be wrong, and we should detect if it may be YAML; but that goes elsewhere. + public void testTextYAML() { + // A text/plain with *.yaml is still application/yaml + var r = new EmptyResource(create("http://server/hello.yaml"), PLAIN_TEXT_UTF_8); assertThat(r.mediaType()).isEqualTo(YAML_UTF_8); } diff --git a/java/dev/enola/common/io/resource/OkHttpResource.java b/java/dev/enola/common/io/resource/OkHttpResource.java index 38374d59..fcc45914 100644 --- a/java/dev/enola/common/io/resource/OkHttpResource.java +++ b/java/dev/enola/common/io/resource/OkHttpResource.java @@ -103,6 +103,7 @@ public OkHttpResource(URI uri) { super(uri, mediaType(uri.toString())); } + // See also UrlResource#mediaType(URL url) private static MediaType mediaType(String url) { Request request = new Request.Builder().url(url).build(); diff --git a/java/dev/enola/common/io/resource/UrlResource.java b/java/dev/enola/common/io/resource/UrlResource.java index 1a906f54..48afd842 100644 --- a/java/dev/enola/common/io/resource/UrlResource.java +++ b/java/dev/enola/common/io/resource/UrlResource.java @@ -110,6 +110,7 @@ public UrlResource(URL url, MediaType mediaType) { this.url = url; } + // See also OkHttpResource#mediaType(String url) private static MediaType mediaType(URL url) { // This is slow - but more correct; see https://www.baeldung.com/java-file-mime-type URLConnection c = null;