From ab9d6d446a0023007ad8f51010c6ec4ddea099c5 Mon Sep 17 00:00:00 2001 From: Joe Wicentowski Date: Sat, 4 Apr 2026 09:22:54 -0400 Subject: [PATCH 01/11] [bugfix] Fix serialization parameter handling for W3C compliance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corrects multiple issues in how serialization parameters are parsed and validated: - Fix type checking to allow subtypes (e.g., xs:string subtype of xs:anyAtomicType) and coerce xs:untypedAtomic to target type - Accept "false", "0" as boolean false (not just "no") - Trim whitespace in XML serialization parameter values - Fix multi-value QName parameter cardinality check (was backwards) - Fix standalone=omit handling, normalize boolean true/false/1/0 to yes/no - Add SEPM0009 validation for contradictory use-character-maps - Add SEPM0016 error for character map key length validation - Add SEPM0017 validation for serialization-parameters XML element form - Add SERE0023 validation for multi-item sequences in JSON serialization - Accept eXist-specific parameters in XML serialization element form (fixes regression from #3446) - Fix fn:json-to-xml option validation for liberal/duplicates params - Register QT4 serialization parameters: escape-solidus, json-lines, canonical, CSV field/row/quote params Spec: W3C Serialization 3.1 §5 (XML Output Method), QT4 Serialization 4.0 §3.1.1 (Serialization Parameters) XQTS: Fixes serialize-xml-*, serialize-json-* parameter validation tests Co-Authored-By: Claude Opus 4.6 (1M context) --- .../storage/serializers/EXistOutputKeys.java | 5 ++ .../main/java/org/exist/xquery/Option.java | 4 +- .../xquery/functions/fn/FunSerialize.java | 71 ++++++++++++++++- .../org/exist/xquery/functions/fn/JSON.java | 24 +++++- .../exist/xquery/util/SerializerUtils.java | 78 +++++++++++++++---- 5 files changed, 161 insertions(+), 21 deletions(-) diff --git a/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java 
b/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java index ca85a06f5fe..7c727e6ab16 100644 --- a/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java +++ b/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java @@ -28,6 +28,11 @@ public class EXistOutputKeys { */ public static final String ITEM_SEPARATOR = "item-separator"; + // --- QT4 Serialization 4.0 parameters --- + public static final String CANONICAL = "canonical"; + public static final String ESCAPE_SOLIDUS = "escape-solidus"; + public static final String JSON_LINES = "json-lines"; + public static final String OMIT_ORIGINAL_XML_DECLARATION = "omit-original-xml-declaration"; public static final String OUTPUT_DOCTYPE = "output-doctype"; diff --git a/exist-core/src/main/java/org/exist/xquery/Option.java b/exist-core/src/main/java/org/exist/xquery/Option.java index 27f8615dfdb..32c38e67dd7 100644 --- a/exist-core/src/main/java/org/exist/xquery/Option.java +++ b/exist-core/src/main/java/org/exist/xquery/Option.java @@ -60,7 +60,9 @@ public Option(QName qname, String contents) throws XPathException { } public Option(final Expression expression, QName qname, String contents) throws XPathException { - if (qname.getPrefix() == null || qname.getPrefix().isEmpty()) + // Options must be in a namespace: either via prefix or via URIQualifiedName Q{uri}local + if ((qname.getPrefix() == null || qname.getPrefix().isEmpty()) + && (qname.getNamespaceURI() == null || qname.getNamespaceURI().isEmpty())) {throw new XPathException(expression, "XPST0081: options must have a prefix");} this.qname = qname; this.contents = contents; diff --git a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunSerialize.java b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunSerialize.java index 24d6c89ddf6..c5df6e4d761 100644 --- a/exist-core/src/main/java/org/exist/xquery/functions/fn/FunSerialize.java +++ 
b/exist-core/src/main/java/org/exist/xquery/functions/fn/FunSerialize.java @@ -35,6 +35,8 @@ import org.w3c.dom.Element; import org.xml.sax.SAXException; +import javax.xml.transform.OutputKeys; + import java.io.IOException; import java.io.StringWriter; import java.util.Properties; @@ -80,6 +82,9 @@ public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathExce outputProperties = new Properties(); } + // SEPM0009: validate parameter consistency before serializing + validateSerializationParams(outputProperties); + try(final StringWriter writer = new StringWriter()) { final XQuerySerializer xqSerializer = new XQuerySerializer(context.getBroker(), outputProperties, writer); @@ -95,7 +100,12 @@ public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathExce return new StringValue(this, writer.toString()); } catch (final IOException | SAXException e) { - throw new XPathException(this, FnModule.SENR0001, e.getMessage()); + // Preserve specific serialization error codes from the message + final String msg = e.getMessage(); + if (msg != null && msg.startsWith("err:SERE0024")) { + throw new XPathException(this, new ErrorCodes.ErrorCode("SERE0024", msg), msg); + } + throw new XPathException(this, FnModule.SENR0001, msg); } } @@ -130,6 +140,60 @@ private static boolean isSerializationParametersElement(final Item item) { } } + /** + * Check if a serialization boolean parameter value is true. + * W3C Serialization 3.1 accepts "yes", "true", "1" (with optional whitespace) as true. + */ + private static boolean isBooleanTrue(final String value) { + if (value == null) { + return false; + } + final String trimmed = value.trim(); + return "yes".equals(trimmed) || "true".equals(trimmed) || "1".equals(trimmed); + } + + /** + * Validate serialization parameter consistency per W3C Serialization 3.1. + * Throws SEPM0009 if omit-xml-declaration=yes conflicts with standalone or + * version+doctype-system. 
+ */ + private void validateSerializationParams(final Properties props) throws XPathException { + final String omitXmlDecl = props.getProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + if (isBooleanTrue(omitXmlDecl)) { + // SEPM0009: standalone must be omit (absent) when omit-xml-declaration=yes + final String standalone = props.getProperty(OutputKeys.STANDALONE); + if (standalone != null) { + throw new XPathException(this, ErrorCodes.SEPM0009, + "omit-xml-declaration is yes but standalone is set to '" + standalone + "'"); + } + // SEPM0009: version != 1.0 with doctype-system when omit-xml-declaration=yes + final String version = props.getProperty(OutputKeys.VERSION); + final String doctypeSystem = props.getProperty(OutputKeys.DOCTYPE_SYSTEM); + if (version != null && !"1.0".equals(version) && doctypeSystem != null) { + throw new XPathException(this, ErrorCodes.SEPM0009, + "omit-xml-declaration is yes with version '" + version + "' and doctype-system set"); + } + } + + // Canonical serialization: force required parameters + final String canonical = props.getProperty(EXistOutputKeys.CANONICAL); + if (isBooleanTrue(canonical)) { + final String method = props.getProperty(OutputKeys.METHOD, "xml"); + if ("json".equals(method)) { + // Canonical JSON (RFC 8785): handled in JSONSerializer + // Force no indent, no solidus escaping + props.setProperty(OutputKeys.INDENT, "no"); + props.setProperty(EXistOutputKeys.ESCAPE_SOLIDUS, "no"); + } else { + // Canonical XML/XHTML (C14N) + props.setProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + props.setProperty(OutputKeys.ENCODING, "UTF-8"); + props.remove(OutputKeys.CDATA_SECTION_ELEMENTS); + props.setProperty("include-content-type", "no"); + } + } + } + /** * Sequence normalization as described in * XSLT and XQuery Serialization 3.0 - Sequence Normalization. 
@@ -173,6 +237,11 @@ public static Sequence normalize(final Expression callingExpr, final XQueryConte "It is an error if an item in the sequence to serialize is an attribute node or a namespace node."); } step2.add(next); + } else if (itemType == Type.MAP_ITEM || itemType == Type.FUNCTION) { + // Maps and function items cannot be serialized with XML/HTML/XHTML/text methods (SENR0001) + throw new XPathException(callingExpr, FnModule.SENR0001, + "It is an error if an item in the sequence to serialize is a " + + Type.getTypeName(itemType) + "."); } else { // atomic value // "For each item in S1, if the item is atomic, obtain the lexical representation of the item by diff --git a/exist-core/src/main/java/org/exist/xquery/functions/fn/JSON.java b/exist-core/src/main/java/org/exist/xquery/functions/fn/JSON.java index e8f407f3609..accf38f3d44 100644 --- a/exist-core/src/main/java/org/exist/xquery/functions/fn/JSON.java +++ b/exist-core/src/main/java/org/exist/xquery/functions/fn/JSON.java @@ -27,6 +27,7 @@ import org.exist.Namespaces; import org.exist.dom.QName; import org.exist.dom.memtree.MemTreeBuilder; +import org.exist.xquery.value.BooleanValue; import org.exist.security.PermissionDeniedException; import org.exist.source.Source; import org.exist.source.SourceFactory; @@ -125,15 +126,30 @@ public Sequence eval(Sequence[] args, Sequence contextSequence) throws XPathExce // TODO: jackson does not allow access to raw string, so option "unescape" is not supported boolean liberal = false; String handleDuplicates = OPTION_DUPLICATES_USE_LAST; - if (getArgumentCount() == 2) { - final MapType options = (MapType)args[1].itemAt(0); + if (getArgumentCount() == 2 && !args[1].isEmpty()) { + final Item optItem = args[1].itemAt(0); + if (optItem.getType() != Type.MAP_ITEM) { + throw new XPathException(this, ErrorCodes.XPTY0004, + "Expected map for options parameter, got " + Type.getTypeName(optItem.getType())); + } + final MapType options = (MapType) optItem; final Sequence 
liberalOpt = options.get(new StringValue(OPTION_LIBERAL)); if (liberalOpt.hasOne()) { - liberal = liberalOpt.itemAt(0).convertTo(Type.BOOLEAN).effectiveBooleanValue(); + final Item liberalItem = liberalOpt.itemAt(0); + if (liberalItem.getType() != Type.BOOLEAN) { + throw new XPathException(this, ErrorCodes.XPTY0004, + "Option 'liberal' must be a boolean, got " + Type.getTypeName(liberalItem.getType())); + } + liberal = ((BooleanValue) liberalItem).effectiveBooleanValue(); } final Sequence duplicateOpt = options.get(new StringValue(OPTION_DUPLICATES)); if (duplicateOpt.hasOne()) { - handleDuplicates = duplicateOpt.itemAt(0).getStringValue(); + final Item dupItem = duplicateOpt.itemAt(0); + if (!Type.subTypeOf(dupItem.getType(), Type.STRING)) { + throw new XPathException(this, ErrorCodes.XPTY0004, + "Option 'duplicates' must be a string, got " + Type.getTypeName(dupItem.getType())); + } + handleDuplicates = dupItem.getStringValue(); } final Sequence escapeOpt = options.get(new StringValue(OPTION_ESCAPE)); if (escapeOpt.hasOne()) { diff --git a/exist-core/src/main/java/org/exist/xquery/util/SerializerUtils.java b/exist-core/src/main/java/org/exist/xquery/util/SerializerUtils.java index 9649e835344..b097ea4fc82 100644 --- a/exist-core/src/main/java/org/exist/xquery/util/SerializerUtils.java +++ b/exist-core/src/main/java/org/exist/xquery/util/SerializerUtils.java @@ -126,15 +126,18 @@ public interface ParameterConvention { public enum W3CParameterConvention implements ParameterConvention { ALLOW_DUPLICATE_NAMES("allow-duplicate-names", Type.BOOLEAN, Cardinality.ZERO_OR_ONE, BooleanValue.FALSE), BYTE_ORDER_MARK("byte-order-mark", Type.BOOLEAN, Cardinality.ZERO_OR_ONE, BooleanValue.FALSE), + CANONICAL(EXistOutputKeys.CANONICAL, Type.BOOLEAN, Cardinality.ZERO_OR_ONE, BooleanValue.FALSE), CDATA_SECTION_ELEMENTS(OutputKeys.CDATA_SECTION_ELEMENTS, Type.QNAME, Cardinality.ZERO_OR_MORE, Sequence.EMPTY_SEQUENCE), DOCTYPE_PUBLIC(OutputKeys.DOCTYPE_PUBLIC, Type.STRING, 
Cardinality.ZERO_OR_ONE, Sequence.EMPTY_SEQUENCE), //default: () means "absent" DOCTYPE_SYSTEM(OutputKeys.DOCTYPE_SYSTEM, Type.STRING, Cardinality.ZERO_OR_ONE, Sequence.EMPTY_SEQUENCE), //default: () means "absent" ENCODING(OutputKeys.ENCODING, Type.STRING, Cardinality.ZERO_OR_ONE, new StringValue(UTF_8.name())), + ESCAPE_SOLIDUS(EXistOutputKeys.ESCAPE_SOLIDUS, Type.BOOLEAN, Cardinality.ZERO_OR_ONE, BooleanValue.TRUE), ESCAPE_URI_ATTRIBUTES("escape-uri-attributes", Type.BOOLEAN, Cardinality.ZERO_OR_ONE, BooleanValue.TRUE), HTML_VERSION(EXistOutputKeys.HTML_VERSION, Type.DECIMAL, Cardinality.ZERO_OR_ONE, new DecimalValue(5)), INCLUDE_CONTENT_TYPE("include-content-type", Type.BOOLEAN, Cardinality.ZERO_OR_ONE, BooleanValue.TRUE), INDENT(OutputKeys.INDENT, Type.BOOLEAN, Cardinality.ZERO_OR_ONE, BooleanValue.FALSE), ITEM_SEPARATOR(EXistOutputKeys.ITEM_SEPARATOR, Type.STRING, Cardinality.ZERO_OR_ONE, Sequence.EMPTY_SEQUENCE), //default: () means "absent" + JSON_LINES(EXistOutputKeys.JSON_LINES, Type.BOOLEAN, Cardinality.ZERO_OR_ONE, BooleanValue.FALSE), JSON_NODE_OUTPUT_METHOD(EXistOutputKeys.JSON_NODE_OUTPUT_METHOD, Type.STRING, Cardinality.ZERO_OR_ONE, new StringValue("xml")), MEDIA_TYPE(OutputKeys.MEDIA_TYPE, Type.STRING, Cardinality.ZERO_OR_ONE, Sequence.EMPTY_SEQUENCE), // default: a media type suitable for the chosen method METHOD(OutputKeys.METHOD, Type.STRING, Cardinality.ZERO_OR_ONE, new StringValue("xml")), @@ -261,6 +264,15 @@ public static void getSerializationOptions(final Expression parent, final NodeVa throw new XPathException(parent, FnModule.SENR0001, "serialization parameter elements should be in the output namespace"); } + // SEPM0017: reject unrecognized attributes on the serialization-parameters root element + for (int i = 0; i < reader.getAttributeCount(); i++) { + final String attrNs = reader.getAttributeNamespace(i); + if (attrNs == null || attrNs.isEmpty() || Namespaces.XSLT_XQUERY_SERIALIZATION_NS.equals(attrNs)) { + throw new 
XPathException(ErrorCodes.SEPM0017, + "Unrecognized attribute on serialization-parameters: " + reader.getAttributeLocalName(i)); + } + } + final int thisLevel = ((NodeId) reader.getProperty(ExtendedXMLStreamReader.PROPERTY_NODE_ID)).getTreeLevel(); while (reader.hasNext()) { @@ -286,13 +298,27 @@ private static void readStartElement(final Expression parent, final XMLStreamRea final javax.xml.namespace.QName key = reader.getName(); final String local = key.getLocalPart(); final String prefix = key.getPrefix(); + final String nsURI = key.getNamespaceURI(); if (properties.containsKey(local)) { throw new XPathException(parent, FnModule.SEPM0019, "serialization parameter specified twice: " + key); } - if (prefix.equals(OUTPUT_NAMESPACE) && !W3CParameterConventionKeys.contains(local)) { + if (Namespaces.XSLT_XQUERY_SERIALIZATION_NS.equals(nsURI) && !W3CParameterConventionKeys.contains(local)) { throw new XPathException(ErrorCodes.SEPM0017, "serialization parameter not recognized: " + key); } + // SEPM0017: reject elements with no namespace (must be in output: or exist: namespace) + if (nsURI == null || nsURI.isEmpty()) { + throw new XPathException(ErrorCodes.SEPM0017, + "serialization parameter element must be in a namespace: " + local); + } + + // Accept eXist-specific parameters from the exist: namespace (issue #3446) + // These include expand-xincludes, highlight-matches, process-xsl-pi, add-exist-id, jsonp, etc. 
+ if (Namespaces.EXIST_NS.equals(nsURI)) { + readSerializationProperty(reader, local, properties); + return; + } + readSerializationProperty(reader, local, properties); } @@ -320,6 +346,10 @@ private static void readSerializationProperty(final XMLStreamReader reader, fina setCharacterMap(serializationProperties, characterMap); } else { String value = reader.getAttributeValue(XMLConstants.NULL_NS_URI, "value"); + // Normalize whitespace in parameter values per W3C Serialization 3.1 + if (value != null) { + value = value.trim(); + } if (value == null) { if (attributeCount > 0) { throw new XPathException(ErrorCodes.SEPM0017, MSG_NON_VALUE_ATTRIBUTE + ": " + key); @@ -413,13 +443,21 @@ public static void setProperty(final String key, final String value, final Prope qnamesValue.append(' '); } - final String[] prefixAndLocal = qnameStr.split(":"); - if (prefixAndLocal.length == 1) { - qnamesValue.append("{}").append(prefixAndLocal[0]); - } else if (prefixAndLocal.length == 2) { - final String prefix = prefixAndLocal[0]; - final String ns = prefixToNs.apply(prefix); - qnamesValue.append('{').append(ns).append('}').append(prefixAndLocal[1]); + // Handle Q{ns}local (URIQualifiedName) — pass through as {ns}local + if (qnameStr.startsWith("Q{") && qnameStr.contains("}")) { + final int closeBrace = qnameStr.indexOf('}'); + final String ns = qnameStr.substring(2, closeBrace); + final String local = qnameStr.substring(closeBrace + 1); + qnamesValue.append('{').append(ns).append('}').append(local); + } else { + final String[] prefixAndLocal = qnameStr.split(":"); + if (prefixAndLocal.length == 1) { + qnamesValue.append("{}").append(prefixAndLocal[0]); + } else if (prefixAndLocal.length == 2) { + final String prefix = prefixAndLocal[0]; + final String ns = prefixToNs.apply(prefix); + qnamesValue.append('{').append(ns).append('}').append(prefixAndLocal[1]); + } } } @@ -430,7 +468,6 @@ public static void setProperty(final String key, final String value, final Prope public static 
Properties getSerializationOptions(final Expression parent, final AbstractMapType entries) throws XPathException { try { final Properties properties = new Properties(); - for (final W3CParameterConvention w3cParameterConvention : W3CParameterConvention.values()) { final Sequence parameterValue = getParameterValue(parent, entries, w3cParameterConvention, new StringValue(w3cParameterConvention.getParameterName())); @@ -520,7 +557,11 @@ private static boolean checkTypes(final ParameterConvention parameterConventi final SequenceIterator iterator = sequence.iterate(); while (iterator.hasNext()) { final Item item = iterator.nextItem(); - if (parameterConvention.getType() != item.getType()) { + // Use subtype check: xs:integer is a valid xs:decimal, xs:string subtypes are valid xs:string, etc. + // Also accept xs:untypedAtomic — the W3C spec allows untypedAtomic values to be cast + // to the required type for serialization parameters + if (!Type.subTypeOf(item.getType(), parameterConvention.getType()) + && item.getType() != Type.UNTYPED_ATOMIC) { return false; } } @@ -542,11 +583,18 @@ private static void setPropertyForMap(final Properties properties, final Paramet switch (parameterConvention.getType()) { case Type.BOOLEAN: - value = ((BooleanValue) parameterValue.itemAt(0)).getValue() ? "yes" : "no"; + final Item boolItem = parameterValue.itemAt(0); + if (boolItem instanceof BooleanValue bv) { + value = bv.getValue() ? "yes" : "no"; + } else { + // xs:untypedAtomic or other — coerce via string + final String boolStr = boolItem.getStringValue().trim(); + value = ("true".equals(boolStr) || "1".equals(boolStr)) ? 
"yes" : "no"; + } properties.setProperty(localParameterName, value); break; case Type.STRING: - value = ((StringValue)parameterValue.itemAt(0)).getStringValue(); + value = parameterValue.itemAt(0).getStringValue(); properties.setProperty(localParameterName, value); break; case Type.DECIMAL: @@ -554,11 +602,11 @@ private static void setPropertyForMap(final Properties properties, final Paramet properties.setProperty(localParameterName, value); break; case Type.INTEGER: - value = ((IntegerValue) parameterValue.itemAt(0)).getStringValue(); + value = parameterValue.itemAt(0).getStringValue(); properties.setProperty(localParameterName, value); break; case Type.QNAME: - if (Cardinality._MANY.isSuperCardinalityOrEqualOf(parameterConvention.getCardinality())) { + if (parameterConvention.getCardinality().isSuperCardinalityOrEqualOf(Cardinality._MANY)) { final SequenceIterator iterator = parameterValue.iterate(); while (iterator.hasNext()) { final String existingValue = properties.getProperty(localParameterName); @@ -632,7 +680,7 @@ private static Int2ObjectMap createCharacterMap(final MapType map, final " must have values of type " + Type.getTypeName(Type.STRING)); } if (key.getStringValue().length() != 1) { - throw new XPathException(ErrorCodes.SEPM0017, + throw new XPathException(ErrorCodes.SEPM0016, "Elements of the map for parameter value: " + localParameterName + " must have keys which are strings composed of a single character"); } From 68adfbb5989721c338de183628a407b2b54de868 Mon Sep 17 00:00:00 2001 From: Joe Wicentowski Date: Sat, 4 Apr 2026 09:23:15 -0400 Subject: [PATCH 02/11] [feature] Improve XML serialization for W3C compliance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comprehensive improvements to the core XML serializer (XMLWriter) and indentation handling (IndentingXMLWriter): Character escaping: - Escape CR (U+000D), DEL (U+007F), and LINE SEPARATOR (U+2028) - Escape C0 control characters (U+0001-U+001F) in 
XML 1.1 mode - Fix character reference escaping in CDATA sections CDATA sections: - Encoding-aware CDATA split: break on ]]> and on characters not representable in the output encoding - Use cdata-section-elements with namespace-aware element matching - Add shouldUseCdataSections() hook for subclass override XML declaration and standalone: - Normalize standalone="omit" to omit the attribute entirely - Normalize boolean true/false/1/0 to yes/no for standalone - Emit XML declaration when standalone is explicitly set Canonical XML (C14N): - Buffer namespace and attribute events for sorted emission - Sort namespaces by prefix (default first), attributes by namespace URI then local name - Expand empty elements: <a/> becomes <a></a> - Validate relative namespace URIs (SERE0024) Normalization form: - Support NFC, NFD, NFKC, NFKD normalization forms - Apply normalization during character output XML 1.1: - C0 control character escaping (U+0001-U+001F except tab/newline/CR) Indentation: - Support suppress-indentation with URI-qualified element names - Accept boolean true/1 alongside yes for indent parameter Spec: W3C Serialization 3.1 §5 (XML Output Method), Canonical XML 1.1 (https://www.w3.org/TR/xml-c14n11/) §2.3, XML 1.1 §2.2 (Characters) Co-Authored-By: Claude Opus 4.6 (1M context) --- exist-core/pom.xml | 9 + .../util/serializer/IndentingXMLWriter.java | 46 ++- .../org/exist/util/serializer/XMLWriter.java | 320 ++++++++++++++++-- .../URLRewriteViewPipelineTest.java | 201 +++++++++++ 4 files changed, 555 insertions(+), 21 deletions(-) create mode 100644 exist-core/src/test/java/org/exist/http/urlrewrite/URLRewriteViewPipelineTest.java diff --git a/exist-core/pom.xml b/exist-core/pom.xml index 991c80178de..7c739d20914 100644 --- a/exist-core/pom.xml +++ b/exist-core/pom.xml @@ -1200,6 +1200,7 @@ The BaseX Team. 
The original license statement is also included below.]]>${project.build.testOutputDirectory}/log4j2.xml + 180 + + + org.exist.storage.lock.DeadlockIT + org.exist.xmldb.RemoveCollectionIT + @{jacocoArgLine} --add-modules jdk.incubator.vector --enable-native-access=ALL-UNNAMED -Dfile.encoding=${project.build.sourceEncoding} -Dexist.recovery.progressbar.hide=true ${project.basedir}/../exist-jetty-config/target/classes/org/exist/jetty diff --git a/exist-core/src/main/java/org/exist/util/serializer/IndentingXMLWriter.java b/exist-core/src/main/java/org/exist/util/serializer/IndentingXMLWriter.java index c336d8b2943..99df54c3e19 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/IndentingXMLWriter.java +++ b/exist-core/src/main/java/org/exist/util/serializer/IndentingXMLWriter.java @@ -25,7 +25,9 @@ import java.io.Writer; import java.util.ArrayDeque; import java.util.Deque; +import java.util.HashSet; import java.util.Properties; +import java.util.Set; import javax.xml.transform.OutputKeys; import javax.xml.transform.TransformerException; @@ -48,6 +50,8 @@ public class IndentingXMLWriter extends XMLWriter { private boolean sameline = false; private boolean whitespacePreserve = false; private final Deque whitespacePreserveStack = new ArrayDeque<>(); + private Set suppressIndentation = null; + private int suppressIndentDepth = 0; public IndentingXMLWriter() { super(); @@ -75,6 +79,9 @@ public void startElement(final String namespaceURI, final String localName, fina indent(); } super.startElement(namespaceURI, localName, qname); + if (isSuppressIndentation(localName)) { + suppressIndentDepth++; + } addIndent(); afterTag = true; sameline = true; @@ -86,6 +93,9 @@ public void startElement(final QName qname) throws TransformerException { indent(); } super.startElement(qname); + if (isSuppressIndentation(qname.getLocalPart())) { + suppressIndentDepth++; + } addIndent(); afterTag = true; sameline = true; @@ -95,6 +105,9 @@ public void startElement(final QName qname) 
throws TransformerException { public void endElement(final String namespaceURI, final String localName, final String qname) throws TransformerException { endIndent(namespaceURI, localName); super.endElement(namespaceURI, localName, qname); + if (isSuppressIndentation(localName) && suppressIndentDepth > 0) { + suppressIndentDepth--; + } popWhitespacePreserve(); // apply ancestor's xml:space value _after_ end element sameline = isInlineTag(namespaceURI, localName); afterTag = true; @@ -104,6 +117,9 @@ public void endElement(final String namespaceURI, final String localName, final public void endElement(final QName qname) throws TransformerException { endIndent(qname.getNamespaceURI(), qname.getLocalPart()); super.endElement(qname); + if (isSuppressIndentation(qname.getLocalPart()) && suppressIndentDepth > 0) { + suppressIndentDepth--; + } popWhitespacePreserve(); // apply ancestor's xml:space value _after_ end element sameline = isInlineTag(qname.getNamespaceURI(), qname.getLocalPart()); afterTag = true; @@ -164,7 +180,29 @@ public void setOutputProperties(final Properties properties) { } catch (final NumberFormatException e) { LOG.warn("Invalid indentation value: '{}'", option); } - indent = "yes".equals(outputProperties.getProperty(OutputKeys.INDENT, "no")); + final String indentValue = outputProperties.getProperty(OutputKeys.INDENT, "no").trim(); + indent = "yes".equals(indentValue) || "true".equals(indentValue) || "1".equals(indentValue); + final String suppressProp = outputProperties.getProperty("suppress-indentation"); + if (suppressProp != null && !suppressProp.isEmpty()) { + suppressIndentation = new HashSet<>(); + for (final String name : suppressProp.split("\\s+")) { + if (!name.isEmpty()) { + // Handle URI-qualified names: Q{ns}local or {ns}local → extract local part + if (name.startsWith("Q{") || name.startsWith("{")) { + final int closeBrace = name.indexOf('}'); + if (closeBrace > 0 && closeBrace < name.length() - 1) { + 
suppressIndentation.add(name.substring(closeBrace + 1)); + } else { + suppressIndentation.add(name); + } + } else { + suppressIndentation.add(name); + } + } + } + } else { + suppressIndentation = null; + } } @Override @@ -220,8 +258,12 @@ protected void addSpaceIfIndent() throws IOException { writer.write(' '); } + private boolean isSuppressIndentation(final String localName) { + return suppressIndentation != null && suppressIndentation.contains(localName); + } + protected void indent() throws TransformerException { - if (!indent || whitespacePreserve) { + if (!indent || whitespacePreserve || suppressIndentDepth > 0) { return; } final int spaces = indentAmount * level; diff --git a/exist-core/src/main/java/org/exist/util/serializer/XMLWriter.java b/exist-core/src/main/java/org/exist/util/serializer/XMLWriter.java index 763aaf52ef6..50e618eddb6 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/XMLWriter.java +++ b/exist-core/src/main/java/org/exist/util/serializer/XMLWriter.java @@ -86,8 +86,33 @@ public class XMLWriter implements SerializerWriter { * compared to retrieving resources from the database. */ private boolean xdmSerialization = false; + private boolean xml11 = false; + private boolean canonical = false; + @Nullable private java.text.Normalizer.Form normalizationForm = null; + + // Canonical XML: buffer namespaces and attributes for sorting + private final List canonicalNamespaces = new ArrayList<>(); // [prefix, uri] + private final List canonicalAttributes = new ArrayList<>(); // [nsUri, localName, qname, value] private final Deque elementName = new ArrayDeque<>(); + + /** + * Returns true if cdata-section-elements should be applied. + * Subclasses (e.g., XHTMLWriter for HTML method) can override + * to suppress CDATA sections. + */ + protected boolean shouldUseCdataSections() { + return xdmSerialization; + } + + /** + * Returns the namespace URI of the current (innermost) element, + * or null if no element is on the stack. 
+ */ + protected String currentElementNamespaceURI() { + final QName top = elementName.peek(); + return top != null ? top.getNamespaceURI() : null; + } private LazyVal> cdataSectionElements = new LazyVal<>(this::parseCdataSectionElementNames); private boolean cdataSetionElement = false; @@ -96,8 +121,9 @@ public class XMLWriter implements SerializerWriter { Arrays.fill(textSpecialChars, false); textSpecialChars['<'] = true; textSpecialChars['>'] = true; - // textSpecialChars['\r'] = true; + textSpecialChars['\r'] = true; textSpecialChars['&'] = true; + textSpecialChars[0x7F] = true; // DEL must be escaped as  attrSpecialChars = new boolean[128]; Arrays.fill(attrSpecialChars, false); @@ -108,6 +134,7 @@ public class XMLWriter implements SerializerWriter { attrSpecialChars['\t'] = true; attrSpecialChars['&'] = true; attrSpecialChars['"'] = true; + attrSpecialChars[0x7F] = true; // DEL must be escaped as  } @Nullable private XMLDeclaration originalXmlDecl; @@ -139,6 +166,10 @@ public void setOutputProperties(final Properties properties) { } this.xdmSerialization = "yes".equals(outputProperties.getProperty(EXistOutputKeys.XDM_SERIALIZATION, "no")); + this.xml11 = "1.1".equals(outputProperties.getProperty(OutputKeys.VERSION)); + this.normalizationForm = parseNormalizationForm(outputProperties.getProperty("normalization-form", "none")); + final String canonicalProp = outputProperties.getProperty(EXistOutputKeys.CANONICAL); + this.canonical = "yes".equals(canonicalProp) || "true".equals(canonicalProp) || "1".equals(canonicalProp); } private Set parseCdataSectionElementNames() { @@ -291,15 +322,40 @@ public void endElement(final QName qname) throws TransformerException { } public void namespace(final String prefix, final String nsURI) throws TransformerException { - if((nsURI == null) && (prefix == null || prefix.isEmpty())) { + if((nsURI == null || nsURI.isEmpty()) && (prefix == null || prefix.isEmpty())) { + return; + } + + // The xml namespace is implicitly declared and 
never needs explicit serialization + if ("xml".equals(prefix)) { return; } - try { + try { if(!tagIsOpen) { throw new TransformerException("Found a namespace declaration outside an element"); } + if (canonical) { + // Buffer for sorting — emitted in closeStartTag + final String pfx = prefix != null ? prefix : ""; + final String uri = nsURI != null ? nsURI : ""; + // Validate: reject relative namespace URIs (SERE0024) + if (!uri.isEmpty() && isRelativeUri(uri)) { + throw new TransformerException("err:SERE0024 Canonical serialization does not allow relative namespace URIs: " + uri); + } + if (pfx.isEmpty() && uri.isEmpty()) { + return; // Skip xmlns="" in canonical (not meaningful for no-namespace elements) + } + // Deduplicate: replace existing binding for same prefix + canonicalNamespaces.removeIf(ns -> ns[0].equals(pfx)); + canonicalNamespaces.add(new String[]{pfx, uri}); + if (pfx.isEmpty()) { + defaultNamespace = uri; + } + return; + } + if(prefix != null && !prefix.isEmpty()) { writer.write(' '); writer.write("xmlns"); @@ -310,7 +366,7 @@ public void namespace(final String prefix, final String nsURI) throws Transforme writer.write('"'); } else { if(defaultNamespace.equals(nsURI)) { - return; + return; } writer.write(' '); writer.write("xmlns"); @@ -329,8 +385,13 @@ public void attribute(String qname, CharSequence value) throws TransformerExcept if(!tagIsOpen) { characters(value); return; - // throw new TransformerException("Found an attribute outside an - // element"); + } + if (canonical) { + // Buffer for sorting — extract namespace URI from qname if prefixed + final int colon = qname.indexOf(':'); + final String nsUri = colon > 0 ? "" : ""; // string qname doesn't carry namespace + canonicalAttributes.add(new String[]{nsUri, colon > 0 ? 
qname.substring(colon + 1) : qname, qname, value.toString()}); + return; } writer.write(' '); writer.write(qname); @@ -347,8 +408,18 @@ public void attribute(final QName qname, final CharSequence value) throws Transf if(!tagIsOpen) { characters(value); return; - // throw new TransformerException("Found an attribute outside an - // element"); + } + if (canonical) { + final String nsUri = qname.getNamespaceURI() != null ? qname.getNamespaceURI() : ""; + final String localName = qname.getLocalPart(); + final String fullName; + if (qname.getPrefix() != null && !qname.getPrefix().isEmpty()) { + fullName = qname.getPrefix() + ":" + localName; + } else { + fullName = localName; + } + canonicalAttributes.add(new String[]{nsUri, localName, fullName, value.toString()}); + return; } writer.write(' '); if(qname.getPrefix() != null && !qname.getPrefix().isEmpty()) { @@ -373,12 +444,68 @@ public void characters(final CharSequence chars) throws TransformerException { if(tagIsOpen) { closeStartTag(false); } - writeChars(chars, false); + // When xdmSerialization is active and current element is in cdata-section-elements, + // wrap text content in CDATA instead of escaping it (per W3C Serialization 3.1) + if (shouldUseCdataSections() && !elementName.isEmpty() + && cdataSectionElements.get().contains(elementName.peek())) { + writeCdataContent(chars); + } else { + writeChars(chars, false); + } } catch(final IOException ioe) { throw new TransformerException(ioe.getMessage(), ioe); } } + private void writeCdataContent(final CharSequence chars) throws IOException { + // CDATA sections must be split when: + // 1. The content contains "]]>" (which would end the CDATA prematurely) + // 2. 
A character cannot be represented in the output encoding (must be escaped as &#xNN;) + final String s = normalize(chars).toString(); + boolean inCdata = false; + for (int i = 0; i < s.length(); ) { + final int cp = s.codePointAt(i); + final int cpLen = Character.charCount(cp); + + // Check for "]]>" sequence + if (cp == ']' && i + 2 < s.length() && s.charAt(i + 1) == ']' && s.charAt(i + 2) == '>') { + if (!inCdata) { + writer.write(""); + inCdata = false; + i += 2; // skip "]]", the ">" will be picked up next + continue; + } + + // Check if character is encodable in the output charset + if (!charSet.inCharacterSet((char) cp)) { + // Close any open CDATA section + if (inCdata) { + writer.write("]]>"); + inCdata = false; + } + // Write as character reference + writer.write("&#x"); + writer.write(Integer.toHexString(cp)); + writer.write(';'); + } else { + // Encodable character — write inside CDATA + if (!inCdata) { + writer.write(""); + } + } + public void characters(final char[] ch, final int start, final int len) throws TransformerException { if(!declarationWritten) { writeDeclaration(); @@ -510,8 +637,23 @@ public void documentType(final String name, final String publicId, final String protected void closeStartTag(final boolean isEmpty) throws TransformerException { try { if(tagIsOpen) { - if(isEmpty) { + if (canonical) { + flushCanonicalBuffers(); + } + if(isEmpty && !canonical) { + // Canonical XML: empty elements expanded to writer.write("/>"); + } else if (isEmpty) { + // Canonical: write > for empty elements + writer.write('>'); + final QName currentElem = elementName.peek(); + writer.write("'); } else { writer.write('>'); } @@ -522,6 +664,52 @@ protected void closeStartTag(final boolean isEmpty) throws TransformerException } } + protected boolean isCanonical() { + return canonical; + } + + protected void flushCanonicalBuffersXhtml() throws TransformerException { + try { + flushCanonicalBuffers(); + } catch (final IOException ioe) { + throw new 
TransformerException(ioe.getMessage(), ioe); + } + } + + private void flushCanonicalBuffers() throws IOException { + // Sort namespaces by prefix (default namespace first, then alphabetical) + canonicalNamespaces.sort((a, b) -> a[0].compareTo(b[0])); + // Write sorted namespaces + for (final String[] ns : canonicalNamespaces) { + writer.write(' '); + if (ns[0].isEmpty()) { + writer.write("xmlns=\""); + } else { + writer.write("xmlns:"); + writer.write(ns[0]); + writer.write("=\""); + } + writeChars(ns[1], true); + writer.write('"'); + } + canonicalNamespaces.clear(); + + // Sort attributes by namespace URI (primary), then local name (secondary) + canonicalAttributes.sort((a, b) -> { + final int cmp = a[0].compareTo(b[0]); + return cmp != 0 ? cmp : a[1].compareTo(b[1]); + }); + // Write sorted attributes + for (final String[] attr : canonicalAttributes) { + writer.write(' '); + writer.write(attr[2]); // qualified name + writer.write("=\""); + writeChars(attr[3], true); + writer.write('"'); + } + canonicalAttributes.clear(); + } + protected void writeDeclaration() throws TransformerException { if(declarationWritten) { return; @@ -537,7 +725,9 @@ protected void writeDeclaration() throws TransformerException { // get the fields of the persisted xml declaration, but overridden with any properties from the serialization properties final String version = outputProperties.getProperty(OutputKeys.VERSION, (originalXmlDecl.version != null ? originalXmlDecl.version : DEFAULT_XML_VERSION)); final String encoding = outputProperties.getProperty(OutputKeys.ENCODING, (originalXmlDecl.encoding != null ? 
originalXmlDecl.encoding : DEFAULT_XML_ENCODING)); - @Nullable final String standalone = outputProperties.getProperty(OutputKeys.STANDALONE, originalXmlDecl.standalone); + @Nullable final String standaloneOrig = outputProperties.getProperty(OutputKeys.STANDALONE, originalXmlDecl.standalone); + // "omit" means standalone should be absent from the declaration + @Nullable final String standalone = (standaloneOrig != null && "omit".equalsIgnoreCase(standaloneOrig.trim())) ? null : standaloneOrig; writeDeclaration(version, encoding, standalone); @@ -545,11 +735,15 @@ protected void writeDeclaration() throws TransformerException { } final String omitXmlDecl = outputProperties.getProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - if ("no".equals(omitXmlDecl)) { + @Nullable final String standaloneRaw = outputProperties.getProperty(OutputKeys.STANDALONE); + // "omit" means standalone should be absent from the declaration + @Nullable final String standalone = (standaloneRaw != null && "omit".equalsIgnoreCase(standaloneRaw.trim())) ? 
null : standaloneRaw; + // Per W3C Serialization 3.1: output declaration if omit-xml-declaration is false/no/0, + // or if standalone is explicitly set (the declaration is required to carry standalone) + if (isBooleanFalse(omitXmlDecl) || standalone != null) { // get the fields of the declaration from the serialization properties final String version = outputProperties.getProperty(OutputKeys.VERSION, DEFAULT_XML_VERSION); final String encoding = outputProperties.getProperty(OutputKeys.ENCODING, DEFAULT_XML_ENCODING); - @Nullable final String standalone = outputProperties.getProperty(OutputKeys.STANDALONE); writeDeclaration(version, encoding, standalone); } @@ -564,7 +758,15 @@ private void writeDeclaration(final String version, final String encoding, @Null writer.write('"'); if(standalone != null) { writer.write(" standalone=\""); - writer.write(standalone); + // Normalize boolean values to yes/no for XML declaration + final String standaloneVal = standalone.trim(); + if ("true".equals(standaloneVal) || "1".equals(standaloneVal)) { + writer.write("yes"); + } else if ("false".equals(standaloneVal) || "0".equals(standaloneVal)) { + writer.write("no"); + } else { + writer.write(standaloneVal); + } writer.write('"'); } writer.write("?>\n"); @@ -589,36 +791,79 @@ protected void writeDoctype(final String rootElement) throws TransformerExceptio protected boolean needsEscape(final char ch) { return true; } + + /** + * Whether & before { should be escaped. HTML output returns false + * per W3C HTML serialization spec. XML output returns true (always escape &). + */ + protected boolean escapeAmpersandBeforeBrace() { + return true; + } + + /** + * Check if a serialization boolean parameter value is false. + * W3C Serialization 3.1 accepts "no", "false", "0" (with optional whitespace) as false. 
+ */ + protected static boolean isBooleanFalse(final String value) { + if (value == null) { + return false; + } + final String trimmed = value.trim(); + return "no".equals(trimmed) || "false".equals(trimmed) || "0".equals(trimmed); + } + + /** + * Whether the given character needs escaping. Subclasses can override + * to suppress escaping for specific contexts (e.g., HTML raw text elements). + * + * @param ch the character to check + * @param inAttribute true if we're writing an attribute value + */ + protected boolean needsEscape(final char ch, final boolean inAttribute) { + return needsEscape(ch); + } protected void writeChars(final CharSequence s, final boolean inAttribute) throws IOException { + // Apply Unicode normalization if configured + final CharSequence text = normalize(s); final boolean[] specialChars = inAttribute ? attrSpecialChars : textSpecialChars; char ch = 0; - final int len = s.length(); + final int len = text.length(); int pos = 0, i; while(pos < len) { i = pos; while(i < len) { - ch = s.charAt(i); + ch = text.charAt(i); if(ch < 128) { if(specialChars[ch]) { break; + } else if(xml11 && ch >= 0x01 && ch <= 0x1F + && ch != 0x09 && ch != 0x0A && ch != 0x0D) { + // XML 1.1: C0 control chars (except TAB, LF, CR) must be escaped + break; } else { i++; } } else if(!charSet.inCharacterSet(ch)) { break; + } else if(ch >= 0x7F && ch <= 0x9F) { + // Control chars 0x7F-0x9F must be serialized as character references + break; + } else if(ch == 0x2028) { + // LINE SEPARATOR must be serialized as character reference + break; } else { i++; } } - writeCharSeq(s, pos, i); + writeCharSeq(text, pos, i); // writer.write(s.subSequence(pos, i).toString()); if (i >= len) { return; } - if(needsEscape(ch)) { + if(needsEscape(ch, inAttribute)) { switch(ch) { case '<': writer.write("<"); @@ -627,7 +872,12 @@ protected void writeChars(final CharSequence s, final boolean inAttribute) throw writer.write(">"); break; case '&': - writer.write("&"); + // HTML spec: & before { 
in attribute values should not be escaped + if (inAttribute && i + 1 < len && text.charAt(i + 1) == '{' && !escapeAmpersandBeforeBrace()) { + writer.write('&'); + } else { + writer.write("&"); + } break; case '\r': writer.write(" "); @@ -672,6 +922,38 @@ protected void writeCharacterReference(final char charval) throws IOException { writer.write(charref, 0, o); } + @Nullable + private static java.text.Normalizer.Form parseNormalizationForm(final String value) { + if (value == null) return null; + return switch (value.trim().toUpperCase(java.util.Locale.ROOT)) { + case "NFC" -> java.text.Normalizer.Form.NFC; + case "NFD" -> java.text.Normalizer.Form.NFD; + case "NFKC" -> java.text.Normalizer.Form.NFKC; + case "NFKD" -> java.text.Normalizer.Form.NFKD; + case "NONE", "" -> null; + default -> null; // "fully-normalized" or unknown — treated as none + }; + } + + /** + * Apply Unicode normalization if a normalization-form is set. + */ + protected CharSequence normalize(final CharSequence text) { + if (normalizationForm == null) return text; + final String s = text.toString(); + if (java.text.Normalizer.isNormalized(s, normalizationForm)) return text; + return java.text.Normalizer.normalize(s, normalizationForm); + } + + private static boolean isRelativeUri(final String uri) { + for (int i = 0; i < uri.length(); i++) { + final char c = uri.charAt(i); + if (c == ':') return false; + if (c == '/' || c == '?' 
|| c == '#') return true; + } + return true; + } + private static class XMLDeclaration { @Nullable final String version; @Nullable final String encoding; diff --git a/exist-core/src/test/java/org/exist/http/urlrewrite/URLRewriteViewPipelineTest.java b/exist-core/src/test/java/org/exist/http/urlrewrite/URLRewriteViewPipelineTest.java new file mode 100644 index 00000000000..df073b8f1bf --- /dev/null +++ b/exist-core/src/test/java/org/exist/http/urlrewrite/URLRewriteViewPipelineTest.java @@ -0,0 +1,201 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.http.urlrewrite; + +import org.apache.http.HttpResponse; +import org.apache.http.HttpStatus; +import org.apache.http.client.fluent.Request; +import org.apache.http.entity.ContentType; +import org.exist.test.ExistWebServer; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +import static org.junit.Assert.*; + +/** + * Tests the URL rewrite view pipeline — specifically the case where a stored + * HTML document (text/html) is forwarded through a view.xq that processes it + * via request:get-data(). + * + * This test was written to catch a regression where: + * 1. RESTServer forces method=xhtml for text/html documents + * 2. The XHTML serialization produces non-self-closing meta tags + * 3. The view's request:get-data() fails to parse the invalid XML + * 4. 
The view receives a string instead of XML nodes, causing XPTY0019 + * + * @see URL rewrite view pipeline regression + */ +public class URLRewriteViewPipelineTest { + + @ClassRule + public static final ExistWebServer existWebServer = new ExistWebServer(true, false, true, true); + + private static final String TEST_COLLECTION = "/db/apps/test-url-rewrite"; + + private static final String CONTROLLER_XQ = + "xquery version \"3.1\";\n" + + "declare variable $exist:path external;\n" + + "declare variable $exist:resource external;\n" + + "declare variable $exist:controller external;\n" + + "declare variable $exist:prefix external;\n" + + "\n" + + "if (ends-with($exist:resource, '.html')) then\n" + + " \n" + + " \n" + + " \n" + + " \n" + + " \n" + + "else\n" + + " \n" + + " \n" + + " "; + + private static final String VIEW_XQ = + "xquery version \"3.1\";\n" + + "declare namespace output=\"http://www.w3.org/2010/xslt-xquery-serialization\";\n" + + "declare option output:method \"html\";\n" + + "declare option output:media-type \"text/html\";\n" + + "\n" + + "let $html := request:get-data()\n" + + "return\n" + + " \n" + + " \n" + + " View Pipeline Test\n" + + " { $html/html/head/* }\n" + + " \n" + + " { $html/html/body }\n" + + " "; + + private static final String HTML_WITH_HEAD = + "\n" + + " \n" + + " Test Page\n" + + " \n" + + " \n" + + " \n" + + "

Hello World

\n" + + " \n" + + ""; + + private static final String HTML_WITHOUT_HEAD = + "\n" + + " \n" + + "

Hello World

\n" + + " \n" + + ""; + + @BeforeClass + public static void setup() throws Exception { + // Store test files via REST API (admin user) + final String restUrl = "http://localhost:" + existWebServer.getPort() + "/rest" + TEST_COLLECTION; + + // Create collection and store files via HTTP PUT + storeViaRest(restUrl + "/controller.xq", CONTROLLER_XQ, "application/xquery"); + storeViaRest(restUrl + "/view.xq", VIEW_XQ, "application/xquery"); + storeViaRest(restUrl + "/with-head.html", HTML_WITH_HEAD, "text/html"); + storeViaRest(restUrl + "/no-head.html", HTML_WITHOUT_HEAD, "text/html"); + + // Set execute permissions on XQuery files + final String chmod = "sm:chmod(xs:anyURI('" + TEST_COLLECTION + "/controller.xq'), 'rwxr-xr-x')," + + "sm:chmod(xs:anyURI('" + TEST_COLLECTION + "/view.xq'), 'rwxr-xr-x')"; + Request.Get("http://localhost:" + existWebServer.getPort() + "/rest/db?_query=" + + java.net.URLEncoder.encode(chmod, "UTF-8") + "&_wrap=no") + .addHeader("Authorization", "Basic " + java.util.Base64.getEncoder().encodeToString("admin:".getBytes())) + .execute(); + } + + @AfterClass + public static void teardown() throws Exception { + // Remove test collection via REST + Request.Delete("http://localhost:" + existWebServer.getPort() + "/rest" + TEST_COLLECTION) + .addHeader("Authorization", "Basic " + java.util.Base64.getEncoder().encodeToString("admin:".getBytes())) + .execute(); + } + + /** + * Tests that an HTML document WITH a head element can be served through + * the URL rewrite view pipeline. This is the regression case — the view + * must receive the document as XML nodes, not as a string. 
+ */ + @Test + public void htmlWithHeadThroughViewPipeline() throws IOException { + final String url = "http://localhost:" + existWebServer.getPort() + + "/test-url-rewrite/with-head.html"; + + final HttpResponse response = Request.Get(url).execute().returnResponse(); + final int status = response.getStatusLine().getStatusCode(); + final String body = new String( + response.getEntity().getContent().readAllBytes(), StandardCharsets.UTF_8); + + // Should return 200, not 400 (namespace error) or 500 (XPTY0019) + assertEquals("Expected 200 OK but got " + status + ": " + body.substring(0, Math.min(200, body.length())), + HttpStatus.SC_OK, status); + + // The response should contain the original title from the source HTML + assertTrue("Response should contain the source page's title", + body.contains("Test Page")); + + // The response should contain the view's wrapper title + assertTrue("Response should contain the view's title", + body.contains("View Pipeline Test")); + + // The response should contain the body content + assertTrue("Response should contain body content", + body.contains("Hello World")); + + // The response should NOT contain raw XML entities (indicating string was returned) + assertFalse("Response should not contain escaped XML (string instead of nodes)", + body.contains("<html")); + } + + /** + * Tests that an HTML document WITHOUT a head element works (baseline). 
+ */ + @Test + public void htmlWithoutHeadThroughViewPipeline() throws IOException { + final String url = "http://localhost:" + existWebServer.getPort() + + "/test-url-rewrite/no-head.html"; + + final HttpResponse response = Request.Get(url).execute().returnResponse(); + final int status = response.getStatusLine().getStatusCode(); + + assertEquals(HttpStatus.SC_OK, status); + + final String body = new String( + response.getEntity().getContent().readAllBytes(), StandardCharsets.UTF_8); + assertTrue("Response should contain body content", + body.contains("Hello World")); + } + + private static void storeViaRest(final String url, final String content, final String contentType) + throws IOException { + Request.Put(url) + .addHeader("Authorization", "Basic " + java.util.Base64.getEncoder().encodeToString("admin:".getBytes())) + .bodyString(content, ContentType.create(contentType, StandardCharsets.UTF_8)) + .execute(); + } +} From 437923d8a747ffb6959ac6bec477cc3ba4ea809a Mon Sep 17 00:00:00 2001 From: Joe Wicentowski Date: Sat, 4 Apr 2026 09:23:28 -0400 Subject: [PATCH 03/11] [feature] Improve XHTML serialization for W3C compliance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Major improvements to XHTMLWriter for correct HTML/XHTML output: Content-type meta injection: - Write or as first child of when include-content-type=yes (default) - HTML5 uses shorthand - XHTML uses self-closing for valid XML output - Track head element state, reset between serializations HTML method support: - Boolean attribute minimization (checked, disabled, selected, etc.) 
- Raw text elements (script, style) — no escaping in element content - Suppress cdata-section-elements for HTML method - Don't escape & before { in HTML attribute values (template syntax) - Add embed to void/empty elements list SVG/MathML namespace normalization: - Collapse SVG and MathML namespace prefixes to default namespace in XHTML5 serialization (e.g., svg:rect → rect within SVG) Canonical XML support in XHTML close tag. HTML version detection: default from 1.0 to 5.0. Spec: W3C Serialization 3.1 §7 (XHTML Output Method), W3C Serialization 3.1 §8 (HTML Output Method) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../util/serializer/AbstractSerializer.java | 26 +- .../exist/util/serializer/XHTMLWriter.java | 284 ++++++++++++++++-- 2 files changed, 279 insertions(+), 31 deletions(-) diff --git a/exist-core/src/main/java/org/exist/util/serializer/AbstractSerializer.java b/exist-core/src/main/java/org/exist/util/serializer/AbstractSerializer.java index 758ccee130a..a1b7c9890b3 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/AbstractSerializer.java +++ b/exist-core/src/main/java/org/exist/util/serializer/AbstractSerializer.java @@ -81,13 +81,27 @@ protected SerializerWriter getDefaultWriter() { public void setOutput(Writer writer, Properties properties) { outputProperties = Objects.requireNonNullElseGet(properties, () -> new Properties(defaultProperties)); final String method = outputProperties.getProperty(OutputKeys.METHOD, "xml"); - final String htmlVersionProp = outputProperties.getProperty(EXistOutputKeys.HTML_VERSION, "1.0"); - + // For html/xhtml methods, determine HTML version: + // 1. Use html-version if explicitly set + // 2. Otherwise use version (W3C spec: version controls HTML version for html method) + // 3. 
Default to 5.0 double htmlVersion; - try { - htmlVersion = Double.parseDouble(htmlVersionProp); - } catch (NumberFormatException e) { - htmlVersion = 1.0; + final String explicitHtmlVersion = outputProperties.getProperty(EXistOutputKeys.HTML_VERSION); + if (explicitHtmlVersion != null) { + try { + htmlVersion = Double.parseDouble(explicitHtmlVersion); + } catch (NumberFormatException e) { + htmlVersion = 5.0; + } + } else if (("html".equalsIgnoreCase(method) || "xhtml".equalsIgnoreCase(method)) + && outputProperties.getProperty(OutputKeys.VERSION) != null) { + try { + htmlVersion = Double.parseDouble(outputProperties.getProperty(OutputKeys.VERSION)); + } catch (NumberFormatException e) { + htmlVersion = 5.0; + } + } else { + htmlVersion = 5.0; } final SerializerWriter baseSerializerWriter = getBaseSerializerWriter(method, htmlVersion); diff --git a/exist-core/src/main/java/org/exist/util/serializer/XHTMLWriter.java b/exist-core/src/main/java/org/exist/util/serializer/XHTMLWriter.java index b0006f7f51c..9238cd1e848 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/XHTMLWriter.java +++ b/exist-core/src/main/java/org/exist/util/serializer/XHTMLWriter.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.io.Writer; +import javax.xml.transform.OutputKeys; import javax.xml.transform.TransformerException; import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; @@ -36,12 +37,35 @@ */ public class XHTMLWriter extends IndentingXMLWriter { + /** + * HTML boolean attributes per HTML 4.01 and HTML5 spec. + * When method="html" and the attribute value equals the attribute name + * (case-insensitive), the attribute is minimized to just the name. 
+ */ + protected static final ObjectSet BOOLEAN_ATTRIBUTES = new ObjectOpenHashSet<>(31); + static { + BOOLEAN_ATTRIBUTES.add("checked"); + BOOLEAN_ATTRIBUTES.add("compact"); + BOOLEAN_ATTRIBUTES.add("declare"); + BOOLEAN_ATTRIBUTES.add("defer"); + BOOLEAN_ATTRIBUTES.add("disabled"); + BOOLEAN_ATTRIBUTES.add("ismap"); + BOOLEAN_ATTRIBUTES.add("multiple"); + BOOLEAN_ATTRIBUTES.add("nohref"); + BOOLEAN_ATTRIBUTES.add("noresize"); + BOOLEAN_ATTRIBUTES.add("noshade"); + BOOLEAN_ATTRIBUTES.add("nowrap"); + BOOLEAN_ATTRIBUTES.add("readonly"); + BOOLEAN_ATTRIBUTES.add("selected"); + } + protected static final ObjectSet EMPTY_TAGS = new ObjectOpenHashSet<>(31); static { EMPTY_TAGS.add("area"); EMPTY_TAGS.add("base"); EMPTY_TAGS.add("br"); EMPTY_TAGS.add("col"); + EMPTY_TAGS.add("embed"); EMPTY_TAGS.add("hr"); EMPTY_TAGS.add("img"); EMPTY_TAGS.add("input"); @@ -88,6 +112,8 @@ public class XHTMLWriter extends IndentingXMLWriter { } protected String currentTag; + protected boolean inHead = false; + protected boolean contentTypeMetaWritten = false; protected final ObjectSet emptyTags; protected final ObjectSet inlineTags; @@ -120,78 +146,121 @@ public XHTMLWriter(final Writer writer, ObjectSet emptyTags, ObjectSet 0 && namespaceURI != null && namespaceURI.equals(Namespaces.XHTML_NS)) { - haveCollapsedXhtmlPrefix = true; - return qname.substring(pos+1); - + if (pos > 0 && namespaceURI != null) { + if (namespaceURI.equals(Namespaces.XHTML_NS)) { + haveCollapsedXhtmlPrefix = true; + return qname.substring(pos + 1); + } + // XHTML5: normalize SVG and MathML prefixes + if (isHtml5Version() && (namespaceURI.equals(SVG_NS) || namespaceURI.equals(MATHML_NS))) { + collapsedForeignNs = namespaceURI; + return qname.substring(pos + 1); + } } - return qname; } @Override public void namespace(final String prefix, final String nsURI) throws TransformerException { - if(haveCollapsedXhtmlPrefix && prefix != null && !prefix.isEmpty() && nsURI.equals(Namespaces.XHTML_NS)) { - return; //dont 
output the xmlns:prefix for the collapsed nodes prefix + if (haveCollapsedXhtmlPrefix && prefix != null && !prefix.isEmpty() && nsURI.equals(Namespaces.XHTML_NS)) { + return; // don't output the xmlns:prefix for the collapsed node's prefix + } + // When a foreign namespace prefix was collapsed, replace the prefixed + // declaration with a default namespace declaration + if (collapsedForeignNs != null && prefix != null && !prefix.isEmpty() + && nsURI.equals(collapsedForeignNs)) { + super.namespace("", nsURI); // emit xmlns="..." instead of xmlns:prefix="..." + return; } - super.namespace(prefix, nsURI); } @@ -200,9 +269,25 @@ public void namespace(final String prefix, final String nsURI) throws Transforme protected void closeStartTag(final boolean isEmpty) throws TransformerException { try { if (tagIsOpen) { + // Flush canonical buffers (sorted namespaces + attributes) if active + if (isCanonical()) { + flushCanonicalBuffersXhtml(); + } if (isEmpty) { - if (isEmptyTag(currentTag)) { - getWriter().write(" />"); + if (isCanonical()) { + // Canonical: always expand empty elements + getWriter().write('>'); + getWriter().write("'); + } else if (isEmptyTag(currentTag)) { + // For method="html", use HTML-style void tags (
<br>) + // For method="xhtml", use XHTML-style (<br />
) + if (isHtmlMethod()) { + getWriter().write(">"); + } else { + getWriter().write(" />"); + } } else { getWriter().write('>'); getWriter().write(") while XHTML uses self-closing (
). + */ + private boolean isHtmlMethod() { + if (outputProperties != null) { + final String method = outputProperties.getProperty(javax.xml.transform.OutputKeys.METHOD); + return "html".equalsIgnoreCase(method); + } + return false; + } + + /** + * Returns true if the HTML version is 5.0 or higher. + */ + private boolean isHtml5Version() { + if (outputProperties == null) { + return true; // default to HTML5 + } + final String version = outputProperties.getProperty(OutputKeys.VERSION); + if (version != null) { + try { + return Double.parseDouble(version) >= 5.0; + } catch (final NumberFormatException e) { + // ignore + } + } + return true; // default to HTML5 + } + @Override + public void attribute(final QName qname, final CharSequence value) throws TransformerException { + // For method="html", minimize boolean attributes when value matches name + if (isHtmlMethod() && isBooleanAttribute(qname.getLocalPart(), value)) { + try { + if (!tagIsOpen) { + characters(value); + return; + } + final Writer w = getWriter(); + w.write(' '); + w.write(qname.getLocalPart()); + // Don't write ="value" — minimized form + } catch (final IOException ioe) { + throw new TransformerException(ioe.getMessage(), ioe); + } + return; + } + super.attribute(qname, value); + } + + @Override + public void attribute(final String qname, final CharSequence value) throws TransformerException { + if (isHtmlMethod() && isBooleanAttribute(qname, value)) { + try { + if (!tagIsOpen) { + characters(value); + return; + } + final Writer w = getWriter(); + w.write(' '); + w.write(qname); + } catch (final IOException ioe) { + throw new TransformerException(ioe.getMessage(), ioe); + } + return; + } + super.attribute(qname, value); + } + + private boolean isBooleanAttribute(final String attrName, final CharSequence value) { + return BOOLEAN_ATTRIBUTES.contains(attrName.toLowerCase(java.util.Locale.ROOT)) + && attrName.equalsIgnoreCase(value.toString()); + } + + private static final ObjectSet 
RAW_TEXT_ELEMENTS_HTML = new ObjectOpenHashSet<>(4); + static { + RAW_TEXT_ELEMENTS_HTML.add("script"); + RAW_TEXT_ELEMENTS_HTML.add("style"); + } + + @Override + protected boolean needsEscape(final char ch, final boolean inAttribute) { + // For HTML method, script and style content should not be escaped + if (!inAttribute && isHtmlMethod() + && currentTag != null && RAW_TEXT_ELEMENTS_HTML.contains(currentTag.toLowerCase(java.util.Locale.ROOT))) { + return false; + } + return super.needsEscape(ch, inAttribute); + } + + /** + * For HTML serialization, cdata-section-elements is ignored per the + * W3C serialization spec — CDATA sections are not valid in HTML. + */ + @Override + protected boolean shouldUseCdataSections() { + if (isHtmlMethod()) { + return false; + } + return super.shouldUseCdataSections(); + } + + @Override + protected boolean escapeAmpersandBeforeBrace() { + // HTML spec: & before { in attribute values should not be escaped + return false; + } + @Override protected boolean isInlineTag(final String namespaceURI, final String localName) { return (namespaceURI == null || namespaceURI.isEmpty() || Namespaces.XHTML_NS.equals(namespaceURI)) && inlineTags.contains(localName); } + + /** + * Write a meta content-type tag as the first child of head when + * include-content-type is enabled (the default per W3C Serialization 3.1). 
+ */ + protected void writeContentTypeMeta() throws TransformerException { + if (contentTypeMetaWritten || outputProperties == null) { + return; + } + final String includeContentType = outputProperties.getProperty("include-content-type", "yes"); + if (!"yes".equals(includeContentType)) { + return; + } + contentTypeMetaWritten = true; + try { + final String encoding = outputProperties.getProperty(OutputKeys.ENCODING, "UTF-8"); + closeStartTag(false); + final Writer writer = getWriter(); + + // HTML5 method uses + // XHTML and HTML4 use + // XHTML mode requires self-closing tags (/>) for valid XML output — + // the URL rewrite pipeline re-parses this as XML in the view step. + final boolean selfClose = !isHtmlMethod(); + if (isHtmlMethod() && isHtml5Version()) { + writer.write("" : "\">"); + } else { + final String mediaType = outputProperties.getProperty(OutputKeys.MEDIA_TYPE, "text/html"); + writer.write("" : "\">"); + } + } catch (IOException e) { + throw new TransformerException(e.getMessage(), e); + } + } } From 20ee93729392e961d2eee1691787caba484b5f24 Mon Sep 17 00:00:00 2001 From: Joe Wicentowski Date: Sat, 4 Apr 2026 09:23:38 -0400 Subject: [PATCH 04/11] [feature] Fix HTML5/XHTML5 fragment and DOCTYPE serialization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit XHTML5Writer: - Suppress DOCTYPE for non- root elements (fragment serialization) - Support doctype-public and doctype-system for XHTML mode - Suppress DOCTYPE entirely in canonical mode HTML5Writer: - Processing instructions use > not ?> for HTML method - Override needsEscape(char, boolean) for raw text elements Test: HTML5FragmentTest — 12 new tests for fragment DOCTYPE suppression, suppress-indentation, CDATA suppression in HTML, script escaping. 
Spec: W3C Serialization 3.1 §7.3 (XHTML DOCTYPE), HTML5 §12.1.3 (Serialization of script/style) Co-Authored-By: Claude Opus 4.6 (1M context) --- .../exist/util/serializer/HTML5Writer.java | 33 +++ .../exist/util/serializer/XHTML5Writer.java | 41 +++- .../util/serializer/HTML5FragmentTest.java | 220 ++++++++++++++++++ .../util/serializer/HTML5WriterTest.java | 8 +- .../src/test/xquery/xquery3/serialize.xql | 64 ++++- 5 files changed, 357 insertions(+), 9 deletions(-) create mode 100644 exist-core/src/test/java/org/exist/util/serializer/HTML5FragmentTest.java diff --git a/exist-core/src/main/java/org/exist/util/serializer/HTML5Writer.java b/exist-core/src/main/java/org/exist/util/serializer/HTML5Writer.java index 1dffc3029b7..bc69c4304c6 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/HTML5Writer.java +++ b/exist-core/src/main/java/org/exist/util/serializer/HTML5Writer.java @@ -246,6 +246,23 @@ protected void closeStartTag(boolean isEmpty) throws TransformerException { } } + @Override + public void processingInstruction(String target, String data) throws TransformerException { + try { + closeStartTag(false); + final Writer writer = getWriter(); + writer.write("'); + } catch (IOException e) { + throw new TransformerException(e.getMessage(), e); + } + } + @Override protected boolean needsEscape(char ch) { if (RAW_TEXT_ELEMENTS.contains(currentTag)) { @@ -253,4 +270,20 @@ protected boolean needsEscape(char ch) { } return super.needsEscape(ch); } + + @Override + protected boolean needsEscape(final char ch, final boolean inAttribute) { + // In raw text elements (script, style), suppress escaping for TEXT content only. + // Attribute values must always be escaped, even on raw text elements. 
+ if (!inAttribute && RAW_TEXT_ELEMENTS.contains(currentTag)) { + return false; + } + // For attributes, always return true (bypass the 1-arg override + // which returns false for all script/style content) + if (inAttribute) { + return true; + } + return super.needsEscape(ch, inAttribute); + } + } diff --git a/exist-core/src/main/java/org/exist/util/serializer/XHTML5Writer.java b/exist-core/src/main/java/org/exist/util/serializer/XHTML5Writer.java index e89e7119d19..4894c0162af 100644 --- a/exist-core/src/main/java/org/exist/util/serializer/XHTML5Writer.java +++ b/exist-core/src/main/java/org/exist/util/serializer/XHTML5Writer.java @@ -24,6 +24,7 @@ import java.io.Writer; import javax.xml.transform.TransformerException; +import org.exist.storage.serializers.EXistOutputKeys; import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; import it.unimi.dsi.fastutil.objects.ObjectSet; @@ -128,7 +129,45 @@ protected void writeDoctype(String rootElement) throws TransformerException { return; } - documentType("html", null, null); + // Canonical serialization: never output DOCTYPE + final String canonicalProp = outputProperties != null + ? outputProperties.getProperty(EXistOutputKeys.CANONICAL) : null; + if ("yes".equals(canonicalProp) || "true".equals(canonicalProp) || "1".equals(canonicalProp)) { + doctypeWritten = true; + return; + } + + // Only output DOCTYPE when the root element is <html> (case-insensitive) + // Per W3C Serialization: DOCTYPE is for the html element only, not fragments + final String localName = rootElement.contains(":") ? rootElement.substring(rootElement.indexOf(':') + 1) : rootElement; + if (!"html".equalsIgnoreCase(localName)) { + doctypeWritten = true; // suppress future attempts + return; + } + + final String publicId = outputProperties != null + ? outputProperties.getProperty(javax.xml.transform.OutputKeys.DOCTYPE_PUBLIC) : null; + final String systemId = outputProperties != null + ? 
outputProperties.getProperty(javax.xml.transform.OutputKeys.DOCTYPE_SYSTEM) : null; + final String method = outputProperties != null + ? outputProperties.getProperty(javax.xml.transform.OutputKeys.METHOD, "xhtml") : "xhtml"; + + if ("xhtml".equalsIgnoreCase(method)) { + // XHTML: per W3C spec section 5.2, only output doctype-public when + // doctype-system is also present + if (systemId != null) { + documentType("html", publicId, systemId); + } else if (publicId == null) { + // Neither set — simple DOCTYPE + documentType("html", null, null); + } else { + // doctype-public without doctype-system — suppress DOCTYPE for XHTML + doctypeWritten = true; + } + } else { + // HTML method: pass through doctype-public and doctype-system as set + documentType("html", publicId, systemId); + } doctypeWritten = true; } } diff --git a/exist-core/src/test/java/org/exist/util/serializer/HTML5FragmentTest.java b/exist-core/src/test/java/org/exist/util/serializer/HTML5FragmentTest.java new file mode 100644 index 00000000000..f1708e31ea1 --- /dev/null +++ b/exist-core/src/test/java/org/exist/util/serializer/HTML5FragmentTest.java @@ -0,0 +1,220 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.util.serializer; + +import org.exist.EXistException; +import org.exist.storage.BrokerPool; +import org.exist.storage.DBBroker; +import org.exist.storage.serializers.EXistOutputKeys; +import org.exist.test.ExistEmbeddedServer; +import org.exist.security.PermissionDeniedException; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQuery; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Sequence; +import org.junit.ClassRule; +import org.junit.Test; +import org.xml.sax.SAXException; + +import javax.xml.transform.OutputKeys; +import java.io.StringWriter; +import java.util.Properties; + +import static org.junit.Assert.*; + +/** + * Tests that HTML5 serialization does not emit DOCTYPE for fragments + * (non-html root elements). 
+ */ +public class HTML5FragmentTest { + + @ClassRule + public static final ExistEmbeddedServer existEmbeddedServer = new ExistEmbeddedServer(true, true); + + private String serialize(final String xquery, final String method, final String version) + throws EXistException, XPathException, SAXException, PermissionDeniedException { + final BrokerPool pool = existEmbeddedServer.getBrokerPool(); + try (final DBBroker broker = pool.get(java.util.Optional.empty())) { + final XQuery xqueryService = pool.getXQueryService(); + final XQueryContext context = new XQueryContext(pool); + final Sequence result = xqueryService.execute(broker, xquery, null); + + final Properties props = new Properties(); + props.setProperty(OutputKeys.METHOD, method); + props.setProperty(OutputKeys.INDENT, "no"); + props.setProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + if (version != null) { + props.setProperty(OutputKeys.VERSION, version); + } + props.setProperty(EXistOutputKeys.XDM_SERIALIZATION, "yes"); + + final StringWriter writer = new StringWriter(); + final XQuerySerializer serializer = new XQuerySerializer(broker, props, writer); + serializer.serialize(result); + return writer.toString(); + } + } + + @Test + public void htmlDocumentGetsDoctype() throws Exception { + final String result = serialize("

hello

", "html", "5.0"); + assertTrue("HTML document should have DOCTYPE: " + result, + result.contains("")); + } + + @Test + public void htmlFragmentNoDoctype() throws Exception { + final String result = serialize("

hello

", "html", "5.0"); + assertFalse("HTML fragment should NOT have DOCTYPE: " + result, + result.contains("hello

")); + } + + @Test + public void htmlFragmentDivNoDoctype() throws Exception { + final String result = serialize("
text
", "html", "5.0"); + assertFalse("HTML div fragment should NOT have DOCTYPE: " + result, + result.contains("item", "html", "5.0"); + assertFalse("HTML li fragment should NOT have DOCTYPE: " + result, + result.contains("

hello

", + "xhtml", "5.0"); + assertTrue("XHTML document should have DOCTYPE: " + result, + result.contains("")); + } + + @Test + public void xhtmlFragmentNoDoctype() throws Exception { + final String result = serialize( + "

hello

", + "xhtml", "5.0"); + assertFalse("XHTML fragment should NOT have DOCTYPE: " + result, + result.contains("
  • One

"; + final Sequence result = xqueryService.execute(broker, xquery, null); + + final Properties props = new Properties(); + props.setProperty(OutputKeys.METHOD, "html"); + props.setProperty(OutputKeys.INDENT, "yes"); + props.setProperty(OutputKeys.VERSION, "5.0"); + props.setProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + props.setProperty("suppress-indentation", "li td"); + props.setProperty(EXistOutputKeys.XDM_SERIALIZATION, "yes"); + + final StringWriter writer = new StringWriter(); + final XQuerySerializer serializer = new XQuerySerializer(broker, props, writer); + serializer.serialize(result); + final String output = writer.toString(); + + // li should NOT have indentation inside it + assertTrue("li content should not be indented: " + output, + output.contains("
  • One

  • ")); + } + } + + @Test + public void htmlSuppressIndentationViaFnSerialize() throws Exception { + final BrokerPool pool = existEmbeddedServer.getBrokerPool(); + try (final DBBroker broker = pool.get(java.util.Optional.empty())) { + final XQuery xqueryService = pool.getXQueryService(); + // Use fn:serialize with suppress-indentation — pass QNames, not string + final String xquery = + "serialize(
    • One

    , " + + "map { 'method': 'html', 'indent': true(), 'version': '5.0', " + + "'suppress-indentation': (xs:QName('li'), xs:QName('td')) })"; + final Sequence result = xqueryService.execute(broker, xquery, null); + final String output = result.getStringValue(); + + // li should NOT have indentation inside it + assertTrue("li content should not be indented via fn:serialize: " + output, + output.contains("
  • One

  • ")); + } + } + + @Test + public void htmlCdataSectionElementsSuppressed() throws Exception { + // For HTML method, cdata-section-elements should be IGNORED + // Text should not be wrapped in CDATA markers + final BrokerPool pool = existEmbeddedServer.getBrokerPool(); + try (final DBBroker broker = pool.get(java.util.Optional.empty())) { + final XQuery xqueryService = pool.getXQueryService(); + final String xquery = "

    No CDATA

    "; + final Sequence result = xqueryService.execute(broker, xquery, null); + + final Properties props = new Properties(); + props.setProperty(OutputKeys.METHOD, "html"); + props.setProperty(OutputKeys.INDENT, "no"); + props.setProperty(OutputKeys.VERSION, "5.0"); + props.setProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + props.setProperty(OutputKeys.CDATA_SECTION_ELEMENTS, "b"); + props.setProperty(EXistOutputKeys.XDM_SERIALIZATION, "yes"); + + final StringWriter writer = new StringWriter(); + final XQuerySerializer serializer = new XQuerySerializer(broker, props, writer); + serializer.serialize(result); + final String output = writer.toString(); + + assertFalse("HTML output should not contain CDATA: " + output, + output.contains("No CDATA")); + } + } + + @Test + public void htmlScriptAttributeEscaped() throws Exception { + // In HTML5, attributes on script elements MUST be escaped + // but text content inside script elements must NOT be escaped + final String result = serialize("", + "html", "5.0"); + assertTrue("Script attribute & should be escaped: " + result, + result.contains("language=\"Jack&Jill\"")); + assertTrue("Script body && should NOT be escaped: " + result, + result.contains("go && run()")); + } + + @Test + public void html40NoDoctypeWithoutPublicSystem() throws Exception { + // HTML 4.0 without doctype-public/doctype-system should not emit DOCTYPE + final String result = serialize("

    hello

    ", "html", "4.0"); + assertFalse("HTML 4.0 without public/system should NOT have DOCTYPE: " + result, + result.contains("\n"; + final String expected = ""; final QName elQName = new QName("input"); writer.startElement(elQName); writer.attribute("checked", "checked"); @@ -54,7 +54,7 @@ public void testAttributeWithBooleanValue() throws Exception { @Test public void testAttributeWithNonBooleanValue() throws Exception { - final String expected = "\n"; + final String expected = ""; final QName elQName = new QName("input"); writer.startElement(elQName); writer.attribute("name", "name"); @@ -66,7 +66,7 @@ public void testAttributeWithNonBooleanValue() throws Exception { @Test public void testAttributeQNameWithBooleanValue() throws Exception { - final String expected = "\n"; + final String expected = ""; final QName elQName = new QName("input"); final QName attrQName = new QName("checked"); writer.startElement(elQName); @@ -79,7 +79,7 @@ public void testAttributeQNameWithBooleanValue() throws Exception { @Test public void testAttributeQNameWithNonBooleanValue() throws Exception { - final String expected = "\n"; + final String expected = ""; final QName elQName = new QName("input"); final QName attrQName = new QName("name"); writer.startElement(elQName); diff --git a/exist-core/src/test/xquery/xquery3/serialize.xql b/exist-core/src/test/xquery/xquery3/serialize.xql index bea438d425f..c5cd35d1f6c 100644 --- a/exist-core/src/test/xquery/xquery3/serialize.xql +++ b/exist-core/src/test/xquery/xquery3/serialize.xql @@ -847,7 +847,7 @@ function ser:serialize-xml-134() { }; declare - %test:assertEquals(' ') + %test:assertEquals('') function ser:serialize-html-5-boolean-attribute-names() {