From 67097474d91e63b6d05f55a5407ca86cd08f79e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20L=C3=A4ubrich?= Date: Sat, 6 Jan 2024 15:26:21 +0100 Subject: [PATCH] Add HTML artifact comparator HTML documents can differ in textual representation but equal semantically. This adds a very basic support for semantic compare of html documents as a first step it supports ignoring script/css changes between different java versions in the javadoc output. --- tycho-artifactcomparator/pom.xml | 5 + .../internal/HtmlComparator.java | 110 ++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 tycho-artifactcomparator/src/main/java/org/eclipse/tycho/zipcomparator/internal/HtmlComparator.java diff --git a/tycho-artifactcomparator/pom.xml b/tycho-artifactcomparator/pom.xml index 22f375988e..077f532989 100644 --- a/tycho-artifactcomparator/pom.xml +++ b/tycho-artifactcomparator/pom.xml @@ -77,6 +77,11 @@ java-diff-utils 4.12 + + ch.digitalfondue.jfiveparse + jfiveparse + 1.0.1 + diff --git a/tycho-artifactcomparator/src/main/java/org/eclipse/tycho/zipcomparator/internal/HtmlComparator.java b/tycho-artifactcomparator/src/main/java/org/eclipse/tycho/zipcomparator/internal/HtmlComparator.java new file mode 100644 index 0000000000..8e5752547a --- /dev/null +++ b/tycho-artifactcomparator/src/main/java/org/eclipse/tycho/zipcomparator/internal/HtmlComparator.java @@ -0,0 +1,110 @@ +/******************************************************************************* + * Copyright (c) 2024 Christoph Läubrich and others. + * This program and the accompanying materials + * are made available under the terms of the Eclipse Public License 2.0 + * which accompanies this distribution, and is available at + * https://www.eclipse.org/legal/epl-2.0/ + * + * SPDX-License-Identifier: EPL-2.0 + * + * Contributors: + * Christoph Läubrich - initial API and implementation + *******************************************************************************/ +package org.eclipse.tycho.zipcomparator.internal; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.codehaus.plexus.component.annotations.Component; +import org.eclipse.tycho.artifactcomparator.ArtifactComparator.ComparisonData; +import org.eclipse.tycho.artifactcomparator.ArtifactDelta; +import org.eclipse.tycho.artifactcomparator.ComparatorInputStream; + +import ch.digitalfondue.jfiveparse.Comment; +import ch.digitalfondue.jfiveparse.Document; +import ch.digitalfondue.jfiveparse.Element; +import ch.digitalfondue.jfiveparse.JFiveParse; +import ch.digitalfondue.jfiveparse.Node; + +/** + * Compares html files for some special cases and fall back to simple textcompare otherwise + */ +@Component(role = ContentsComparator.class, hint = HtmlComparator.HINT) +public class HtmlComparator implements ContentsComparator { + + private static final String META_ELEMENT = "meta"; + private static final String LINK_ELEMENT = "link"; + private static final String SCRIPT_ELEMENT = "script"; + private static final String HEAD_ELEMENT = "head"; + static final String HINT = "html"; + + @Override + public ArtifactDelta getDelta(ComparatorInputStream baseline, ComparatorInputStream reactor, ComparisonData data) + throws IOException { + try { + Document baselineDoc = JFiveParse.parse(baseline.asString(StandardCharsets.UTF_8)); + Document reactorDoc = JFiveParse.parse(reactor.asString(StandardCharsets.UTF_8)); + //TODO we might generally want to remove all comment tags? + if (isJavadocHtml(baselineDoc)) { + cleanJavaDoc(baselineDoc.getDocumentElement()); + cleanJavaDoc(reactorDoc.getDocumentElement()); + String serializeBaseline = serializeAndNormalize(baselineDoc); + String serializeReactor = serializeAndNormalize(reactorDoc); + if (serializeBaseline.equalsIgnoreCase(serializeReactor)) { + return ArtifactDelta.NO_DIFFERENCE; + } + } else { + //TODO compare each element and attributes ... + } + } catch (Exception e) { + //it might not be a valid html5 so don't bother and compare the text directly + } + return TextComparator.compareText(baseline, reactor, data); + } + + private String serializeAndNormalize(Document baselineDoc) { + return JFiveParse.serialize(baselineDoc).replaceAll("\\s+", " ").replace("\r", "").replace("\n", "") + .replace("> <", "><"); + } + + private void cleanJavaDoc(Element root) { + //for javadoc we want to ignore all script and css tags... + for (Element headElement : root.getElementsByTagName(HEAD_ELEMENT)) { + for (Node node : headElement.getChildNodes().toArray(Node[]::new)) { + if (node instanceof Element element) { + String nodeName = element.getNodeName(); + if (SCRIPT_ELEMENT.equalsIgnoreCase(nodeName) || LINK_ELEMENT.equalsIgnoreCase(nodeName) + || META_ELEMENT.equals(nodeName)) { + headElement.removeChild(node); + } + } + if (node instanceof Comment comment) { + headElement.removeChild(node); + } + } + } + //don't bother about lang... + root.removeAttribute("lang"); + } + + private boolean isJavadocHtml(Document doc) { + List elementsByTagName = doc.getDocumentElement().getElementsByTagName(HEAD_ELEMENT); + for (Element element : elementsByTagName) { + for (Node node : element.getChildNodes()) { + if (node instanceof Comment comment) { + String data = comment.getData(); + if (data != null && data.trim().toLowerCase().startsWith("generated by javadoc")) { + return true; + } + } + } + } + return false; + } + + @Override + public boolean matches(String nameOrExtension) { + return HINT.equalsIgnoreCase(nameOrExtension) || "htm".equalsIgnoreCase(nameOrExtension); + } +}