diff --git a/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java b/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java
index ca85a06f5fe..fb11391f4b0 100644
--- a/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java
+++ b/exist-core/src/main/java/org/exist/storage/serializers/EXistOutputKeys.java
@@ -118,4 +118,11 @@ public class EXistOutputKeys {
     public static final String INSERT_FINAL_NEWLINE = "insert-final-newline";
 
     public static final String USE_CHARACTER_MAPS = "use-character-maps";
+
+    // --- CSV serialization parameters ---
+    public static final String CSV_FIELD_DELIMITER = "csv.field-delimiter";
+    public static final String CSV_ROW_DELIMITER = "csv.row-delimiter";
+    public static final String CSV_QUOTE_CHARACTER = "csv.quote-character";
+    public static final String CSV_HEADER = "csv.header";
+    public static final String CSV_QUOTES = "csv.quotes";
 }
diff --git a/exist-core/src/main/java/org/exist/util/serializer/CSVSerializer.java b/exist-core/src/main/java/org/exist/util/serializer/CSVSerializer.java
new file mode 100644
index 00000000000..98c599fc582
--- /dev/null
+++ b/exist-core/src/main/java/org/exist/util/serializer/CSVSerializer.java
@@ -0,0 +1,386 @@
+/*
+ * eXist-db Open Source Native XML Database
+ * Copyright (C) 2001 The eXist-db Authors
+ *
+ * info@exist-db.org
+ * http://www.exist-db.org
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+package org.exist.util.serializer;
+
+import io.lacuna.bifurcan.IEntry;
+import org.exist.storage.serializers.EXistOutputKeys;
+import org.exist.xquery.XPathException;
+import org.exist.xquery.functions.array.ArrayType;
+import org.exist.xquery.functions.map.AbstractMapType;
+import org.exist.xquery.value.*;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.*;
+
+/**
+ * Serializes XDM sequences as CSV text in the style of RFC 4180.
+ *
+ * <p>The layout is chosen from the type of the first item of the sequence:</p>
+ * <ul>
+ *   <li>a single array: each member is written as one row (array of arrays);</li>
+ *   <li>a sequence of arrays: each array is written as one row;</li>
+ *   <li>a sequence of maps: each map is written as one row, the columns being the
+ *       keys of all maps ordered by their string value;</li>
+ *   <li>element or document nodes: an XML table such as
+ *       {@code <table><row><col>value</col></row></table>};</li>
+ *   <li>anything else: all items are written as the fields of a single row.</li>
+ * </ul>
+ *
+ * <p>The output is configured through the {@code csv.*} keys of {@link EXistOutputKeys}.</p>
+ */
+public class CSVSerializer {
+
+    private static final String DEFAULT_FIELD_DELIMITER = ",";
+    private static final String DEFAULT_ROW_DELIMITER = "\n";
+    private static final char DEFAULT_QUOTE_CHARACTER = '"';
+
+    private final String fieldDelimiter;
+    private final String rowDelimiter;
+    private final char quoteChar;
+    private final boolean alwaysQuote;
+    private final boolean includeHeader;
+
+    /**
+     * Creates a serializer configured from the given output properties.
+     *
+     * <p>Missing or empty delimiters and quote characters fall back to the defaults (comma,
+     * line feed, double quote). Boolean options accept {@code yes}, {@code true} and {@code 1}
+     * as well as {@code no}, {@code false} and {@code 0}; any other value selects the default.</p>
+     *
+     * @param outputProperties the serialization parameters; only the {@code csv.*} keys are read
+     */
+    public CSVSerializer(final Properties outputProperties) {
+        this.fieldDelimiter = nonEmpty(outputProperties.getProperty(EXistOutputKeys.CSV_FIELD_DELIMITER), DEFAULT_FIELD_DELIMITER);
+        this.rowDelimiter = nonEmpty(outputProperties.getProperty(EXistOutputKeys.CSV_ROW_DELIMITER), DEFAULT_ROW_DELIMITER);
+        // a quote "character" is a single char, so anything after the first one is ignored
+        final String quote = nonEmpty(outputProperties.getProperty(EXistOutputKeys.CSV_QUOTE_CHARACTER), String.valueOf(DEFAULT_QUOTE_CHARACTER));
+        this.quoteChar = quote.charAt(0);
+        this.alwaysQuote = parseBoolean(outputProperties.getProperty(EXistOutputKeys.CSV_QUOTES), true);
+        this.includeHeader = parseBoolean(outputProperties.getProperty(EXistOutputKeys.CSV_HEADER), false);
+    }
+
+    /**
+     * Writes the sequence as CSV to the writer.
+     *
+     * @param sequence the items to serialize; an empty sequence produces no output
+     * @param writer the destination; it is neither flushed nor closed by this method
+     * @throws SAXException if writing fails or the string value of an item cannot be obtained
+     */
+    public void serialize(final Sequence sequence, final Writer writer) throws SAXException {
+        try {
+            if (sequence.isEmpty()) {
+                return;
+            }
+
+            final Item first = sequence.itemAt(0);
+
+            if (first.getType() == Type.ARRAY_ITEM) {
+                if (sequence.hasOne()) {
+                    // Single array: treat as array-of-arrays
+                    serializeArrayOfArrays((ArrayType) first, writer);
+                } else {
+                    // Sequence of arrays: each array is a row
+                    serializeSequenceOfArrays(sequence, writer);
+                }
+            } else if (first.getType() == Type.MAP_ITEM) {
+                serializeSequenceOfMaps(sequence, writer);
+            } else if (Type.subTypeOf(first.getType(), Type.NODE)) {
+                serializeXmlTable(sequence, writer);
+            } else {
+                // Single atomic or sequence of atomics: one row
+                serializeAtomicSequence(sequence, writer);
+            }
+        } catch (final IOException | XPathException e) {
+            throw new SAXException(e.getMessage(), e);
+        }
+    }
+
+    /** Writes every member of the array as one row; a member that is itself an array supplies the fields. */
+    private void serializeArrayOfArrays(final ArrayType outerArray, final Writer writer) throws IOException, XPathException {
+        for (int i = 0; i < outerArray.getSize(); i++) {
+            final Sequence member = outerArray.get(i);
+            if (member.getItemCount() == 1 && member.itemAt(0).getType() == Type.ARRAY_ITEM) {
+                writeRow((ArrayType) member.itemAt(0), writer);
+            } else {
+                writeSequenceRow(member, writer);
+            }
+            writer.write(rowDelimiter);
+        }
+    }
+
+    /** Writes every item as one row; an item that is not an array becomes a row with a single field. */
+    private void serializeSequenceOfArrays(final Sequence sequence, final Writer writer) throws IOException, XPathException {
+        for (final SequenceIterator i = sequence.iterate(); i.hasNext(); ) {
+            final Item item = i.nextItem();
+            if (item.getType() == Type.ARRAY_ITEM) {
+                writeRow((ArrayType) item, writer);
+            } else {
+                writer.write(quoteField(item.getStringValue()));
+            }
+            writer.write(rowDelimiter);
+        }
+    }
+
+    /**
+     * Writes every map as one row, preceded by a header row if requested.
+     *
+     * <p>The columns are the keys of all maps in the sequence, ordered by their string value,
+     * so that no entry is lost when later maps have keys the first map lacks. A key that is
+     * absent from a map yields an empty field, an item that is not a map yields an empty row.</p>
+     */
+    private void serializeSequenceOfMaps(final Sequence sequence, final Writer writer) throws IOException, XPathException {
+        // The typed key is kept next to its string value: looking entries up with a string
+        // would miss keys of other types, e.g. xs:integer.
+        final SortedMap<String, AtomicValue> columns = new TreeMap<>();
+        for (final SequenceIterator i = sequence.iterate(); i.hasNext(); ) {
+            final Item item = i.nextItem();
+            if (item.getType() == Type.MAP_ITEM) {
+                for (final IEntry<AtomicValue, Sequence> entry : (AbstractMapType) item) {
+                    columns.putIfAbsent(entry.key().getStringValue(), entry.key());
+                }
+            }
+        }
+
+        if (includeHeader) {
+            writeFields(columns.keySet(), writer);
+            writer.write(rowDelimiter);
+        }
+
+        for (final SequenceIterator i = sequence.iterate(); i.hasNext(); ) {
+            final Item item = i.nextItem();
+            if (item.getType() == Type.MAP_ITEM) {
+                final AbstractMapType map = (AbstractMapType) item;
+                boolean first = true;
+                for (final AtomicValue key : columns.values()) {
+                    if (!first) {
+                        writer.write(fieldDelimiter);
+                    }
+                    final Sequence value = map.get(key);
+                    writer.write(quoteField(value.isEmpty() ? "" : value.getStringValue()));
+                    first = false;
+                }
+            }
+            writer.write(rowDelimiter);
+        }
+    }
+
+    /**
+     * Writes element and document nodes as XML tables; a document is represented by its
+     * document element. Items of any other kind are skipped.
+     */
+    private void serializeXmlTable(final Sequence sequence, final Writer writer) throws IOException, XPathException {
+        for (final SequenceIterator i = sequence.iterate(); i.hasNext(); ) {
+            final Item item = i.nextItem();
+            if (!Type.subTypeOf(item.getType(), Type.NODE)) {
+                continue;
+            }
+            final Node node = ((NodeValue) item).getNode();
+            if (node instanceof Element) {
+                serializeXmlElement((Element) node, writer);
+            } else if (node instanceof Document) {
+                final Element root = ((Document) node).getDocumentElement();
+                if (root != null) {
+                    serializeXmlElement(root, writer);
+                }
+            }
+        }
+    }
+
+    /**
+     * Writes one element of an XML table. Every row is terminated by the row delimiter.
+     *
+     * <ul>
+     *   <li>An element without child elements is a row with a single field.</li>
+     *   <li>A child element that has child elements of its own is a record: a row whose
+     *       fields are those grandchildren.</li>
+     *   <li>Consecutive child elements without child elements are the fields of one row.</li>
+     * </ul>
+     */
+    private void serializeXmlElement(final Element element, final Writer writer) throws IOException {
+        if (!hasChildElements(element)) {
+            writer.write(quoteField(element.getTextContent()));
+            writer.write(rowDelimiter);
+            return;
+        }
+
+        // true while a row made of direct field elements still lacks its row delimiter
+        boolean rowOpen = false;
+        final NodeList children = element.getChildNodes();
+        for (int i = 0; i < children.getLength(); i++) {
+            final Node child = children.item(i);
+            if (child.getNodeType() != Node.ELEMENT_NODE) {
+                continue;
+            }
+            if (hasChildElements(child)) {
+                if (rowOpen) {
+                    // finish the row of direct fields before the record starts its own row
+                    writer.write(rowDelimiter);
+                    rowOpen = false;
+                }
+                writeChildElementsAsRow(child, writer);
+            } else {
+                if (rowOpen) {
+                    writer.write(fieldDelimiter);
+                }
+                writer.write(quoteField(child.getTextContent()));
+                rowOpen = true;
+            }
+        }
+        if (rowOpen) {
+            writer.write(rowDelimiter);
+        }
+    }
+
+    /** Tells whether at least one child of the node is an element. */
+    private static boolean hasChildElements(final Node node) {
+        final NodeList children = node.getChildNodes();
+        for (int i = 0; i < children.getLength(); i++) {
+            if (children.item(i).getNodeType() == Node.ELEMENT_NODE) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /** Writes the text content of every child element as one field and terminates the row. */
+    private void writeChildElementsAsRow(final Node parent, final Writer writer) throws IOException {
+        final NodeList children = parent.getChildNodes();
+        boolean first = true;
+        for (int i = 0; i < children.getLength(); i++) {
+            final Node child = children.item(i);
+            if (child.getNodeType() == Node.ELEMENT_NODE) {
+                if (!first) {
+                    writer.write(fieldDelimiter);
+                }
+                writer.write(quoteField(child.getTextContent()));
+                first = false;
+            }
+        }
+        writer.write(rowDelimiter);
+    }
+
+    /** Writes all items of the sequence as the fields of a single row. */
+    private void serializeAtomicSequence(final Sequence sequence, final Writer writer) throws IOException, XPathException {
+        writeSequenceRow(sequence, writer);
+        writer.write(rowDelimiter);
+    }
+
+    /** Writes the members of the array as fields, without a row delimiter; an empty member is an empty field. */
+    private void writeRow(final ArrayType array, final Writer writer) throws IOException, XPathException {
+        for (int i = 0; i < array.getSize(); i++) {
+            if (i > 0) {
+                writer.write(fieldDelimiter);
+            }
+            final Sequence member = array.get(i);
+            writer.write(quoteField(member.isEmpty() ? "" : member.getStringValue()));
+        }
+    }
+
+    /** Writes the items of the sequence as fields, without a row delimiter. */
+    private void writeSequenceRow(final Sequence sequence, final Writer writer) throws IOException, XPathException {
+        boolean first = true;
+        for (final SequenceIterator i = sequence.iterate(); i.hasNext(); ) {
+            if (!first) {
+                writer.write(fieldDelimiter);
+            }
+            writer.write(quoteField(i.nextItem().getStringValue()));
+            first = false;
+        }
+    }
+
+    /** Writes the strings as fields, without a row delimiter. */
+    private void writeFields(final Iterable<String> fields, final Writer writer) throws IOException {
+        boolean first = true;
+        for (final String field : fields) {
+            if (!first) {
+                writer.write(fieldDelimiter);
+            }
+            writer.write(quoteField(field));
+            first = false;
+        }
+    }
+
+    /**
+     * Quotes a field value per RFC 4180.
+     *
+     * <p>If quoting is always on, every field is quoted. Otherwise only fields containing the
+     * field delimiter, the row delimiter, the quote character, a line feed or a carriage
+     * return are quoted. Quote characters within the value are escaped by doubling them.</p>
+     *
+     * @param value the raw field value; {@code null} is treated as the empty string
+     * @return the value as it has to appear in the output
+     */
+    private String quoteField(final String value) {
+        final String text = value == null ? "" : value;
+        final boolean needsQuoting = alwaysQuote
+                || text.contains(fieldDelimiter)
+                || text.contains(rowDelimiter)
+                || text.indexOf(quoteChar) >= 0
+                || text.indexOf('\n') >= 0
+                || text.indexOf('\r') >= 0;
+
+        if (!needsQuoting) {
+            return text;
+        }
+
+        final StringBuilder sb = new StringBuilder(text.length() + 2);
+        sb.append(quoteChar);
+        for (int i = 0; i < text.length(); i++) {
+            final char c = text.charAt(i);
+            if (c == quoteChar) {
+                sb.append(quoteChar); // escape by doubling
+            }
+            sb.append(c);
+        }
+        sb.append(quoteChar);
+        return sb.toString();
+    }
+
+    /** Returns the value, or the fallback if the value is {@code null} or empty. */
+    private static String nonEmpty(final String value, final String fallback) {
+        return value == null || value.isEmpty() ? fallback : value;
+    }
+
+    /**
+     * Reads a boolean serialization parameter.
+     *
+     * @param value the raw property value, may be {@code null}
+     * @param defaultValue the result for {@code null} and for unrecognised values
+     * @return {@code true} for yes/true/1, {@code false} for no/false/0, else the default
+     */
+    private static boolean parseBoolean(final String value, final boolean defaultValue) {
+        if (value == null) {
+            return defaultValue;
+        }
+        return switch (value.trim()) {
+            case "yes", "true", "1" -> true;
+            case "no", "false", "0" -> false;
+            default -> defaultValue;
+        };
+    }
+}
diff --git a/exist-core/src/main/java/org/exist/util/serializer/XQuerySerializer.java b/exist-core/src/main/java/org/exist/util/serializer/XQuerySerializer.java
index 366e3866cbc..6dd737d8b23 100644
--- a/exist-core/src/main/java/org/exist/util/serializer/XQuerySerializer.java
+++ b/exist-core/src/main/java/org/exist/util/serializer/XQuerySerializer.java
@@ -70,6 +70,9 @@ public void serialize(final Sequence sequence, final int start, final int howman
             case "json":
                 serializeJSON(sequence, compilationTime, executionTime);
                 break;
+            case "csv":
+                serializeCSV(sequence);
+                break;
             case "xml":
             default:
                 serializeXML(sequence, start, howmany, wrap, typed, compilationTime, executionTime);
@@ -79,7 +82,7 @@ public void serialize(final Sequence sequence, final int start, final int howman
 
     public boolean normalize() {
         final String method = outputProperties.getProperty(OutputKeys.METHOD, "xml");
-        return !("json".equals(method) || "adaptive".equals(method));
+        return !("json".equals(method) || "adaptive".equals(method) || "csv".equals(method));
     }
 
     private void serializeXML(final Sequence sequence, final int start, final int howmany, final boolean wrap, final boolean typed, final long compilationTime, final long executionTime) throws SAXException, XPathException {
@@ -118,4 +121,9 @@ private void serializeAdaptive(final Sequence sequence) throws SAXException, XPa
         serializer.setOutput(writer, outputProperties);
         serializer.serialize(sequence);
     }
+
+    private void serializeCSV(final Sequence sequence) throws SAXException {
+        final CSVSerializer serializer = new CSVSerializer(outputProperties);
+        serializer.serialize(sequence, writer);
+    }
 }
diff --git a/exist-core/src/main/java/org/exist/xquery/util/SerializerUtils.java b/exist-core/src/main/java/org/exist/xquery/util/SerializerUtils.java
index 9649e835344..236a327508d 100644
--- a/exist-core/src/main/java/org/exist/xquery/util/SerializerUtils.java
+++ b/exist-core/src/main/java/org/exist/xquery/util/SerializerUtils.java
@@ -136,6 +136,12 @@ public enum W3CParameterConvention implements ParameterConvention {
         INDENT(OutputKeys.INDENT, Type.BOOLEAN, Cardinality.ZERO_OR_ONE, BooleanValue.FALSE),
         ITEM_SEPARATOR(EXistOutputKeys.ITEM_SEPARATOR, Type.STRING, Cardinality.ZERO_OR_ONE, Sequence.EMPTY_SEQUENCE), //default: () means "absent"
         JSON_NODE_OUTPUT_METHOD(EXistOutputKeys.JSON_NODE_OUTPUT_METHOD, Type.STRING, Cardinality.ZERO_OR_ONE, new StringValue("xml")),
+        // CSV serialization parameters (vendor extension, modeled on BaseX)
+        CSV_FIELD_DELIMITER(EXistOutputKeys.CSV_FIELD_DELIMITER, Type.STRING, Cardinality.ZERO_OR_ONE, new StringValue(",")),
+        CSV_ROW_DELIMITER(EXistOutputKeys.CSV_ROW_DELIMITER, Type.STRING, Cardinality.ZERO_OR_ONE, new StringValue("\n")),
+        CSV_QUOTE_CHARACTER(EXistOutputKeys.CSV_QUOTE_CHARACTER, Type.STRING, Cardinality.ZERO_OR_ONE, new StringValue("\"")),
+        CSV_HEADER(EXistOutputKeys.CSV_HEADER, Type.BOOLEAN, Cardinality.ZERO_OR_ONE, BooleanValue.FALSE),
+        CSV_QUOTES(EXistOutputKeys.CSV_QUOTES, Type.BOOLEAN, Cardinality.ZERO_OR_ONE, BooleanValue.TRUE),
         MEDIA_TYPE(OutputKeys.MEDIA_TYPE, Type.STRING, Cardinality.ZERO_OR_ONE, Sequence.EMPTY_SEQUENCE), // default: a media type suitable for the chosen method
         METHOD(OutputKeys.METHOD, Type.STRING, Cardinality.ZERO_OR_ONE, new StringValue("xml")),
         NORMALIZATION_FORM("normalization-form", Type.STRING, Cardinality.ZERO_OR_ONE, new StringValue("none")),
@@ -502,6 +508,7 @@ private static Sequence getDefaultMediaType(final Sequence providedMethod) throw
             case "jsonp" -> new StringValue("application/javascript");
             case "html" -> new StringValue("text/html");
             case "adaptive", "text" -> new StringValue("text/plain");
+            case "csv" -> new StringValue("text/csv");
             case "binary" -> new StringValue("application/octet-stream");
             default -> throw new UnsupportedOperationException("Unrecognised serialization method: " + method);
         };
diff --git a/exist-core/src/test/xquery/xquery3/csvSerializer.xql b/exist-core/src/test/xquery/xquery3/csvSerializer.xql
new file mode 100644
index 00000000000..a17aee559b7
--- /dev/null
+++ b/exist-core/src/test/xquery/xquery3/csvSerializer.xql
@@ -0,0 +1,165 @@
+(:
+ : eXist-db Open Source Native XML Database
+ : Copyright (C) 2001 The eXist-db Authors
+ :
+ : info@exist-db.org
+ : http://www.exist-db.org
+ :
+ : This library is free software; you can redistribute it and/or
+ : modify it under the terms of the GNU Lesser General Public
+ : License as published by the Free Software Foundation; either
+ : version 2.1 of the License, or (at your option) any later version.
+ :
+ : This library is distributed in the hope that it will be useful,
+ : but WITHOUT ANY WARRANTY; without even the implied warranty of
+ : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ : Lesser General Public License for more details.
+ :
+ : You should have received a copy of the GNU Lesser General Public
+ : License along with this library; if not, write to the Free Software
+ : Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ :)
+xquery version "3.1";
+
+(:~
+ : Tests for method="csv" serialization.
+ :)
+module namespace csv = "http://exist-db.org/xquery/test/csv-serializer";
+
+declare namespace test = "http://exist-db.org/xquery/xqsuite";
+
+(: === Array-of-arrays === :)
+
+declare
+    %test:assertEquals('"a","b"
+"c","d"
+')
+function csv:array-of-arrays() {
+    serialize([["a","b"],["c","d"]], map { "method": "csv" })
+};
+
+declare
+    %test:assertEquals('"1","2","3"
+')
+function csv:single-row-array() {
+    serialize([["1","2","3"]], map { "method": "csv" })
+};
+
+(: === Empty input === :)
+
+declare
+    %test:assertEquals("")
+function csv:empty-sequence() {
+    serialize((), map { "method": "csv" })
+};
+
+declare
+    %test:assertEquals("")
+function csv:empty-array() {
+    serialize([], map { "method": "csv" })
+};
+
+(: === Custom delimiters === :)
+
+declare
+    %test:assertEquals('a;b
+c;d
+')
+function csv:custom-field-delimiter() {
+    serialize([["a","b"],["c","d"]], map {
+        "method": "csv",
+        "csv.field-delimiter": ";",
+        "csv.quotes": false()
+    })
+};
+
+(: === Quoting === :)
+
+declare
+    %test:assertEquals('a,b
+c,d
+')
+function csv:no-quotes() {
+    serialize([["a","b"],["c","d"]], map {
+        "method": "csv",
+        "csv.quotes": false()
+    })
+};
+
+declare
+    %test:assertTrue
+function csv:value-with-comma-gets-quoted() {
+    let $result := serialize([["hello, world","test"]], map {
+        "method": "csv",
+        "csv.quotes": false()
+    })
+    return contains($result, '"hello, world"')
+};
+
+declare
+    %test:assertTrue
+function csv:value-with-quote-gets-escaped() {
+    let $result := serialize([["say ""hello""","test"]], map {
+        "method": "csv"
+    })
+    return contains($result, '""hello""')
+};
+
+(: === Sequence of maps === :)
+
+declare
+    %test:assertTrue
+function csv:maps-produces-output() {
+    let $result := serialize((
+        map { "name": "Alice", "age": "30" },
+        map { "name": "Bob", "age": "25" }
+    ), map { "method": "csv" })
+    return string-length($result) > 0
+};
+
+declare
+    %test:assertTrue
+function csv:maps-with-header() {
+    let $result := serialize((
+        map { "name": "Alice", "age": "30" },
+        map { "name": "Bob", "age": "25" }
+    ), map { "method": "csv", "csv.header": true() })
+    (: Header row should contain the key names :)
+    return contains($result, "age") and contains($result, "name")
+};
+
+(: === XML table === :)
+
+declare
+    %test:assertTrue
+function csv:xml-table() {
+    let $result := serialize(
+        <table>
+            <row><col>a</col><col>b</col></row>
+            <row><col>c</col><col>d</col></row>
+        </table>,
+        map { "method": "csv" }
+    )
+    return contains($result, "a") and contains($result, "d")
+};
+
+(: === Single atomic item === :)
+
+declare
+    %test:assertTrue
+function csv:single-string() {
+    let $result := serialize("hello", map { "method": "csv" })
+    return contains($result, "hello")
+};
+
+(: === Value with newline gets quoted === :)
+
+declare
+    %test:assertTrue
+function csv:value-with-newline-gets-quoted() {
+    let $result := serialize([["line1&#10;line2", "test"]], map {
+        "method": "csv",
+        "csv.quotes": false()
+    })
+    return contains($result, '"line1')
+};