diff --git a/exist-core/src/main/antlr/org/exist/xquery/parser/XQuery.g b/exist-core/src/main/antlr/org/exist/xquery/parser/XQuery.g index d852d700444..744a9f2f2ea 100644 --- a/exist-core/src/main/antlr/org/exist/xquery/parser/XQuery.g +++ b/exist-core/src/main/antlr/org/exist/xquery/parser/XQuery.g @@ -192,6 +192,40 @@ imaginaryTokenDefinitions PREVIOUS_ITEM NEXT_ITEM WINDOW_VARS + // Full Text (W3C XQuery and XPath Full Text 3.0) + FT_CONTAINS + FT_SELECTION + FT_OR + FT_AND + FT_MILD_NOT + FT_UNARY_NOT + FT_PRIMARY_WITH_OPTIONS + FT_WORDS + FT_ANYALL_OPTION + FT_TIMES + FT_RANGE + FT_ORDER + FT_WINDOW + FT_DISTANCE + FT_SCOPE + FT_CONTENT + FT_MATCH_OPTION + FT_CASE_OPTION + FT_DIACRITICS_OPTION + FT_STEM_OPTION + FT_THESAURUS_OPTION + FT_THESAURUS_ID + FT_STOP_WORD_OPTION + FT_STOP_WORDS + FT_STOP_WORDS_EXCEPT + FT_LANGUAGE_OPTION + FT_WILDCARD_OPTION + FT_EXTENSION_OPTION + FT_EXTENSION_SELECTION + FT_IGNORE_OPTION + FT_WEIGHT + FT_SCORE_VAR + FT_OPTION_DECL ; // === XPointer === @@ -262,6 +296,16 @@ prolog throws XPathException if(!inSetters) throw new XPathException(#s, "Default declarations have to come first"); } + | + ( "declare" "ft-option" ) + => fto:ftOptionDecl + { + // XQFT 3.0 §2.6: FTOptionDecl is in the first section of the prolog + // (same level as setters and imports), not the second section. 
+ if (!inSetters) + throw new XPathException(#fto, ErrorCodes.XPST0003, + "'declare ft-option' must appear before variable and function declarations"); + } | ( "declare" "option" ) => optionDecl { inSetters = false; } @@ -702,7 +746,7 @@ expr throws XPathException exprSingle throws XPathException : - ( ( "for" | "let" ) ("tumbling" | "sliding" | DOLLAR ) ) => flworExpr + ( ( "for" | "let" ) ("tumbling" | "sliding" | "score" | DOLLAR ) ) => flworExpr | ( "try" LCURLY ) => tryCatchExpr | ( ( "some" | "every" ) DOLLAR ) => quantifiedExpr | ( "if" LPAREN ) => ifExpr @@ -838,7 +882,8 @@ forClause throws XPathException letClause throws XPathException : - "let"^ letVarBinding ( COMMA! letVarBinding )* + "let"^ ( ( "score" ) => ftScoreVarBinding | letVarBinding ) + ( COMMA! ( ( "score" ) => ftScoreVarBinding | letVarBinding ) )* ; windowClause throws XPathException @@ -851,6 +896,7 @@ inVarBinding throws XPathException : DOLLAR! varName=v:varName! ( typeDeclaration )? ( allowingEmpty )? ( positionalVar )? + ( ftScoreVar )? "in"! exprSingle { #inVarBinding= #(#[VARIABLE_BINDING, varName], #inVarBinding); @@ -912,6 +958,25 @@ letVarBinding throws XPathException } ; +// XQFT 3.0: FTScoreVar in for binding - "score" "$" VarName +ftScoreVar +{ String varName; } +: + "score" DOLLAR! varName=varName + { #ftScoreVar= #[FT_SCORE_VAR, varName]; } + ; + +// XQFT 3.0: FTScoreVar as let clause - "score" "$" VarName ":=" ExprSingle +ftScoreVarBinding throws XPathException +{ String varName; } +: + "score"! DOLLAR! varName=v:varName! COLON! EQ! exprSingle + { + #ftScoreVarBinding= #(#[VARIABLE_BINDING, varName], #[FT_SCORE_VAR, "score"], #ftScoreVarBinding); + #ftScoreVarBinding.copyLexInfo(#v); + } + ; + orderByClause throws XPathException : ( "order"! "by"! | "stable"! "order"! "by"! 
) orderSpecList @@ -1066,15 +1131,33 @@ castExpr throws XPathException comparisonExpr throws XPathException : - r1:stringConcatExpr ( - ( BEFORE ) => BEFORE^ stringConcatExpr + r1:ftContainsExpr ( + ( BEFORE ) => BEFORE^ ftContainsExpr | - ( AFTER ) => AFTER^ stringConcatExpr - | ( ( "eq"^ | "ne"^ | "lt"^ | "le"^ | "gt"^ | "ge"^ ) stringConcatExpr ) - | ( GT EQ ) => GT^ EQ^ r2:rangeExpr + ( AFTER ) => AFTER^ ftContainsExpr + | ( ( "eq"^ | "ne"^ | "lt"^ | "le"^ | "gt"^ | "ge"^ ) ftContainsExpr ) + | ( GT EQ ) => GT^ EQ^ r2:ftContainsExpr { #comparisonExpr = #(#[GTEQ, ">="], #r1, #r2); } - | ( ( EQ^ | NEQ^ | GT^ | LT^ | LTEQ^ ) stringConcatExpr ) - | ( ( "is"^ | "isnot"^ ) stringConcatExpr ) + | ( ( EQ^ | NEQ^ | GT^ | LT^ | LTEQ^ ) ftContainsExpr ) + | ( ( "is"^ | "isnot"^ ) ftContainsExpr ) + )? + ; + +// XQFT 3.0: FTContainsExpr sits between StringConcatExpr and ComparisonExpr +ftContainsExpr throws XPathException +: + r1:stringConcatExpr ( + ( "contains" "text" ) => "contains"! "text"! ft:ftSelection ( ( "without" ) => fti:ftIgnoreOption )? + { + // Break auto-tree sibling links to prevent circular refs in ASTFactory.make() + #r1.setNextSibling(null); + #ft.setNextSibling(null); + if (#fti != null) { + #ftContainsExpr = #(#[FT_CONTAINS, "contains text"], #r1, #ft, #fti); + } else { + #ftContainsExpr = #(#[FT_CONTAINS, "contains text"], #r1, #ft); + } + } )? ; @@ -2062,6 +2145,397 @@ attributeEnclosedExpr throws XPathException } ; +// === Full Text (W3C XQuery and XPath Full Text 3.0) === +// Spec: https://www.w3.org/TR/xpath-full-text-30/ + +ftSelection throws XPathException +: + ftOr + ( + ( "ordered" | "window" | "distance" | "same" | "different" | "entire" | "at" ( "start" | "end" ) ) => + ftPosFilter + )* + { #ftSelection = #(#[FT_SELECTION, "FTSelection"], #ftSelection); } + ; + +ftOr throws XPathException +{ boolean hasOr = false; } +: + ftAnd ( "ftor"! 
ftAnd { hasOr = true; } )* + { + if (hasOr) + #ftOr = #(#[FT_OR, "ftor"], #ftOr); + } + ; + +ftAnd throws XPathException +{ boolean hasAnd = false; } +: + ftMildNot ( "ftand"! ftMildNot { hasAnd = true; } )* + { + if (hasAnd) + #ftAnd = #(#[FT_AND, "ftand"], #ftAnd); + } + ; + +ftMildNot throws XPathException +{ boolean hasMildNot = false; } +: + ftUnaryNot ( ( "not" "in" ) => "not"! "in"! ftUnaryNot { hasMildNot = true; } )* + { + if (hasMildNot) + #ftMildNot = #(#[FT_MILD_NOT, "not in"], #ftMildNot); + } + ; + +ftUnaryNot throws XPathException +{ boolean negated = false; } +: + ( "ftnot"! { negated = true; } )? ftPrimaryWithOptions + { + if (negated) + #ftUnaryNot = #(#[FT_UNARY_NOT, "ftnot"], #ftUnaryNot); + } + ; + +ftPrimaryWithOptions throws XPathException +{ boolean hasOptions = false; } +: + ftPrimary + ( ( "using" ) => ftMatchOptions { hasOptions = true; } )? + ( ( "weight" LCURLY ) => ftWeight { hasOptions = true; } )? + { + if (hasOptions) + #ftPrimaryWithOptions = #(#[FT_PRIMARY_WITH_OPTIONS, "FTPrimaryWithOptions"], #ftPrimaryWithOptions); + } + ; + +ftPrimary throws XPathException +: + ftWords + | + LPAREN! ftSelection RPAREN! + | + ftExtensionSelection + ; + +// XQFT 3.0 §3.4.8: FTExtensionSelection ::= Pragma+ "{" FTSelection? "}" +// Pragmas are parsed but ignored (no FT-specific pragma support). +// If all pragmas are unrecognized and the body is empty, XQST0079 applies. +ftExtensionSelection throws XPathException +{ boolean hasBody = false; } +: + ( pragma )+ LCURLY! ( ftSelection { hasBody = true; } )? RCURLY! + { + #ftExtensionSelection = #(#[FT_EXTENSION_SELECTION, "FTExtensionSelection"], #ftExtensionSelection); + } + ; + +ftWords throws XPathException +: + ftWordsValue ( ftAnyallOption )? ( ( "occurs" ) => ftTimes )? + { #ftWords = #(#[FT_WORDS, "FTWords"], #ftWords); } + ; + +ftWordsValue throws XPathException +: + STRING_LITERAL + | + LCURLY! expr RCURLY! + ; + +ftAnyallOption +: + ( "any" "word" ) => "any"! "word"! 
+ { #ftAnyallOption = #[FT_ANYALL_OPTION, "any word"]; } + | + "any"! + { #ftAnyallOption = #[FT_ANYALL_OPTION, "any"]; } + | + ( "all" "words" ) => "all"! "words"! + { #ftAnyallOption = #[FT_ANYALL_OPTION, "all words"]; } + | + "all"! + { #ftAnyallOption = #[FT_ANYALL_OPTION, "all"]; } + | + "phrase"! + { #ftAnyallOption = #[FT_ANYALL_OPTION, "phrase"]; } + ; + +ftTimes throws XPathException +: + "occurs"! ftRange "times"! + { #ftTimes = #(#[FT_TIMES, "FTTimes"], #ftTimes); } + ; + +ftRange throws XPathException +: + ( "exactly" ) => "exactly"! additiveExpr + { #ftRange = #(#[FT_RANGE, "exactly"], #ftRange); } + | + ( "at" "least" ) => "at"! "least"! additiveExpr + { #ftRange = #(#[FT_RANGE, "at least"], #ftRange); } + | + ( "at" "most" ) => "at"! "most"! additiveExpr + { #ftRange = #(#[FT_RANGE, "at most"], #ftRange); } + | + "from"! additiveExpr "to"! additiveExpr + { #ftRange = #(#[FT_RANGE, "from"], #ftRange); } + ; + +ftPosFilter throws XPathException +: + ( "ordered" ) => ftOrder + | + ( "window" ) => ftWindow + | + ( "distance" ) => ftDistance + | + ( "same" ) => ftScope + | + ( "different" ) => ftScope + | + ( "at" "start" ) => ftContent + | + ( "at" "end" ) => ftContent + | + ( "entire" ) => ftContent + ; + +ftOrder +: + "ordered"! + { #ftOrder = #[FT_ORDER, "ordered"]; } + ; + +ftWindow throws XPathException +: + "window"! additiveExpr ftUnit + { #ftWindow = #(#[FT_WINDOW, "window"], #ftWindow); } + ; + +ftDistance throws XPathException +: + "distance"! ftRange ftUnit + { #ftDistance = #(#[FT_DISTANCE, "distance"], #ftDistance); } + ; + +ftScope +: + ( "same" "sentence" ) => "same"! "sentence"! + { #ftScope = #[FT_SCOPE, "same sentence"]; } + | + ( "same" "paragraph" ) => "same"! "paragraph"! + { #ftScope = #[FT_SCOPE, "same paragraph"]; } + | + ( "different" "sentence" ) => "different"! "sentence"! + { #ftScope = #[FT_SCOPE, "different sentence"]; } + | + "different"! "paragraph"! 
+ { #ftScope = #[FT_SCOPE, "different paragraph"]; } + ; + +ftContent +: + ( "at" "start" ) => "at"! "start"! + { #ftContent = #[FT_CONTENT, "at start"]; } + | + ( "at" "end" ) => "at"! "end"! + { #ftContent = #[FT_CONTENT, "at end"]; } + | + "entire"! "content"! + { #ftContent = #[FT_CONTENT, "entire content"]; } + ; + +ftUnit +: + "words" | "sentences" | "paragraphs" + ; + +// === Full Text Option Declaration (prolog) === +// XQFT 3.0 §5.2: declare ft-option using + +ftOptionDecl throws XPathException +: + "declare"! "ft-option"! ftMatchOptions + { #ftOptionDecl = #(#[FT_OPTION_DECL, "ft-option"], #ftOptionDecl); } + ; + +// === Full Text Match Options === + +ftMatchOptions throws XPathException +: + ( "using"! ftMatchOption )+ + ; + +ftMatchOption throws XPathException +: + ( "case" ) => ftCaseOption + | + ( "lowercase" ) => ftCaseOption + | + ( "uppercase" ) => ftCaseOption + | + ( "diacritics" ) => ftDiacriticsOption + | + ( "stemming" ) => ftStemOption + | + ( "no" "stemming" ) => ftStemOption + | + ( "thesaurus" ) => ftThesaurusOption + | + ( "no" "thesaurus" ) => ftThesaurusOption + | + ( "stop" ) => ftStopWordOption + | + ( "no" "stop" ) => ftStopWordOption + | + ( "language" ) => ftLanguageOption + | + ( "wildcards" ) => ftWildCardOption + | + ( "no" "wildcards" ) => ftWildCardOption + | + ftExtensionOption + ; + +ftCaseOption +: + ( "case" "insensitive" ) => "case"! "insensitive"! + { #ftCaseOption = #[FT_CASE_OPTION, "insensitive"]; } + | + ( "case" "sensitive" ) => "case"! "sensitive"! + { #ftCaseOption = #[FT_CASE_OPTION, "sensitive"]; } + | + "lowercase"! + { #ftCaseOption = #[FT_CASE_OPTION, "lowercase"]; } + | + "uppercase"! + { #ftCaseOption = #[FT_CASE_OPTION, "uppercase"]; } + ; + +ftDiacriticsOption +: + ( "diacritics" "insensitive" ) => "diacritics"! "insensitive"! + { #ftDiacriticsOption = #[FT_DIACRITICS_OPTION, "insensitive"]; } + | + "diacritics"! "sensitive"! 
+ { #ftDiacriticsOption = #[FT_DIACRITICS_OPTION, "sensitive"]; } + ; + +ftStemOption +: + "stemming"! + { #ftStemOption = #[FT_STEM_OPTION, "stemming"]; } + | + "no"! "stemming"! + { #ftStemOption = #[FT_STEM_OPTION, "no stemming"]; } + ; + +ftThesaurusOption throws XPathException +: + ( "no" "thesaurus" ) => "no"! "thesaurus"! + { #ftThesaurusOption = #[FT_THESAURUS_OPTION, "no thesaurus"]; } + | + ( "thesaurus" LPAREN ) => "thesaurus"! LPAREN! ftThesaurusIDOrDefault ( COMMA! ftThesaurusID )* RPAREN! + { #ftThesaurusOption = #(#[FT_THESAURUS_OPTION, "thesaurus list"], #ftThesaurusOption); } + | + "thesaurus"! ftThesaurusIDOrDefault + { #ftThesaurusOption = #(#[FT_THESAURUS_OPTION, "thesaurus"], #ftThesaurusOption); } + ; + +ftThesaurusIDOrDefault throws XPathException +: + ( "default" ) => "default"! + { #ftThesaurusIDOrDefault = #[FT_THESAURUS_ID, "default"]; } + | + ftThesaurusID + ; + +ftThesaurusID throws XPathException +: + "at"! STRING_LITERAL ( "relationship"! STRING_LITERAL )? /* look ahead before committing to a levels range: a following "at" may instead begin an FTContent filter ("at start" / "at end") */ ( ( "exactly" | "from" | "at" ( "least" | "most" ) ) => ftLiteralRange "levels"! )? + { #ftThesaurusID = #(#[FT_THESAURUS_ID, "at"], #ftThesaurusID); } + ; + +ftLiteralRange +: + ( "exactly" ) => "exactly"! INTEGER_LITERAL + { #ftLiteralRange = #(#[FT_RANGE, "exactly"], #ftLiteralRange); } + | + ( "at" "least" ) => "at"! "least"! INTEGER_LITERAL + { #ftLiteralRange = #(#[FT_RANGE, "at least"], #ftLiteralRange); } + | + ( "at" "most" ) => "at"! "most"! INTEGER_LITERAL + { #ftLiteralRange = #(#[FT_RANGE, "at most"], #ftLiteralRange); } + | + "from"! INTEGER_LITERAL "to"! INTEGER_LITERAL + { #ftLiteralRange = #(#[FT_RANGE, "from"], #ftLiteralRange); } + ; + +ftStopWordOption throws XPathException +: + ( "no" "stop" ) => "no"! "stop"! "words"! + { #ftStopWordOption = #[FT_STOP_WORD_OPTION, "no stop words"]; } + | + ( "stop" "words" "default" ) => "stop"! "words"! "default"! ( ftStopWordsInclExcl )* + { #ftStopWordOption = #(#[FT_STOP_WORD_OPTION, "stop words default"], #ftStopWordOption); } + | + "stop"! "words"! 
ftStopWords ( ftStopWordsInclExcl )* + { #ftStopWordOption = #(#[FT_STOP_WORD_OPTION, "stop words"], #ftStopWordOption); } + ; + +ftStopWords +: + ( "at" ) => "at"! STRING_LITERAL + { #ftStopWords = #(#[FT_STOP_WORDS, "at"], #ftStopWords); } + | + LPAREN! STRING_LITERAL ( COMMA! STRING_LITERAL )* RPAREN! + { #ftStopWords = #(#[FT_STOP_WORDS, "list"], #ftStopWords); } + ; + +ftStopWordsInclExcl +: + "union"! ftStopWords + | + "except"! ftStopWords + { #ftStopWordsInclExcl = #(#[FT_STOP_WORDS_EXCEPT, "except"], #ftStopWordsInclExcl); } + ; + +ftLanguageOption +: + "language"! STRING_LITERAL + { #ftLanguageOption = #(#[FT_LANGUAGE_OPTION, "language"], #ftLanguageOption); } + ; + +ftWildCardOption +: + "wildcards"! + { #ftWildCardOption = #[FT_WILDCARD_OPTION, "wildcards"]; } + | + "no"! "wildcards"! + { #ftWildCardOption = #[FT_WILDCARD_OPTION, "no wildcards"]; } + ; + +ftExtensionOption throws XPathException +{ String name; } +: + "option"! name=eqName STRING_LITERAL + { #ftExtensionOption = #(#[FT_EXTENSION_OPTION, name], #ftExtensionOption); } + ; + +ftWeight throws XPathException +: + "weight"! LCURLY! expr RCURLY! + { #ftWeight = #(#[FT_WEIGHT, "weight"], #ftWeight); } + ; + +ftIgnoreOption throws XPathException +: + "without"! "content"! unionExpr + { #ftIgnoreOption = #(#[FT_IGNORE_OPTION, "without content"], #ftIgnoreOption); } + ; + /* All of the literals used in this grammar can also be * part of a valid QName. We thus have to test for each * of them below. 
@@ -2304,6 +2778,94 @@ reservedKeywords returns [String name] "next" { name = "next"; } | "when" { name = "when"; } + | + // Full Text keywords + "contains" { name = "contains"; } + | + "score" { name = "score"; } + | + "content" { name = "content"; } + | + "ftor" { name = "ftor"; } + | + "ftand" { name = "ftand"; } + | + "ftnot" { name = "ftnot"; } + | + "stemming" { name = "stemming"; } + | + "thesaurus" { name = "thesaurus"; } + | + "diacritics" { name = "diacritics"; } + | + "sensitive" { name = "sensitive"; } + | + "insensitive" { name = "insensitive"; } + | + "language" { name = "language"; } + | + "wildcards" { name = "wildcards"; } + | + "lowercase" { name = "lowercase"; } + | + "uppercase" { name = "uppercase"; } + | + "distance" { name = "distance"; } + | + "entire" { name = "entire"; } + | + "words" { name = "words"; } + | + "sentences" { name = "sentences"; } + | + "paragraphs" { name = "paragraphs"; } + | + "sentence" { name = "sentence"; } + | + "paragraph" { name = "paragraph"; } + | + "occurs" { name = "occurs"; } + | + "times" { name = "times"; } + | + "weight" { name = "weight"; } + | + "without" { name = "without"; } + | + "same" { name = "same"; } + | + "different" { name = "different"; } + | + "relationship" { name = "relationship"; } + | + "levels" { name = "levels"; } + | + "stop" { name = "stop"; } + | + "least" { name = "least"; } + | + "most" { name = "most"; } + | + "exactly" { name = "exactly"; } + | + "no" { name = "no"; } + | + "not" { name = "not"; } + | + "all" { name = "all"; } + | + "any" { name = "any"; } + | + "word" { name = "word"; } + | + "phrase" { name = "phrase"; } + | + "using" { name = "using"; } + | + "from" { name = "from"; } + | + // "ft-option" (FTOptionDecl) is a grammar literal as well, so it must remain usable as a QName + "ft-option" { name = "ft-option"; } ; diff --git a/exist-core/src/main/antlr/org/exist/xquery/parser/XQueryTree.g b/exist-core/src/main/antlr/org/exist/xquery/parser/XQueryTree.g index 20308296806..33020331b60 100644 --- a/exist-core/src/main/antlr/org/exist/xquery/parser/XQueryTree.g +++ 
b/exist-core/src/main/antlr/org/exist/xquery/parser/XQueryTree.g @@ -56,6 +56,7 @@ header { import org.exist.storage.ElementValue; import org.exist.xquery.functions.map.MapExpr; import org.exist.xquery.functions.array.ArrayConstructor; + import org.exist.xquery.ft.*; import static org.apache.commons.lang3.ArrayUtils.isNotEmpty; } @@ -131,6 +132,8 @@ options { QName varName; SequenceType sequenceType= null; QName posVar = null; + QName scoreVar = null; + boolean isScoreBinding = false; Expression inputSequence; Expression action; FLWORClause.ClauseType type = FLWORClause.ClauseType.FOR; @@ -632,6 +635,22 @@ throws PermissionDeniedException, EXistException, XPathException ) ) | + // XQFT 3.0 §5.2: declare ft-option using + #( + FT_OPTION_DECL + { + FTMatchOptions ftDefaultOpts = new FTMatchOptions(); + } + ftDefaultOpts=ftMatchOptionsExpr + { + if (ftDefaultOpts.hasConflict()) { + throw new XPathException(ErrorCodes.FTST0019, + ftDefaultOpts.getConflictDescription()); + } + context.setDefaultFTMatchOptions(ftDefaultOpts); + } + ) + | functionDecl [path] | importDecl [path] @@ -1616,6 +1635,16 @@ throws PermissionDeniedException, EXistException, XPathException } } )? + ( + scoreVar:FT_SCORE_VAR + { + try { + clause.scoreVar = distinctVariableNames.check(ErrorCodes.XQST0089, scoreVar, QName.parse(staticContext, scoreVar.getText(), null)); + } catch (final IllegalQNameException iqe) { + throw new XPathException(scoreVar.getLine(), scoreVar.getColumn(), ErrorCodes.XPST0081, "No namespace defined for prefix " + scoreVar.getText()); + } + } + )? step=expr [inputSequence] { try { @@ -1642,6 +1671,12 @@ throws PermissionDeniedException, EXistException, XPathException PathExpr inputSequence= new PathExpr(context); inputSequence.setASTNode(expr_AST_in); } + ( + letScoreVar:FT_SCORE_VAR + { + clause.isScoreBinding = true; + } + )? 
( #( "as" @@ -2048,7 +2083,13 @@ throws PermissionDeniedException, EXistException, XPathException bind.setInputSequence(clause.inputSequence); if (clause.type == FLWORClause.ClauseType.FOR) { ((ForExpr) bind).setPositionalVariable(clause.posVar); + if (clause.scoreVar != null) { + ((ForExpr) bind).setScoreVariable(clause.scoreVar); + } } + if (clause.type == FLWORClause.ClauseType.LET && clause.isScoreBinding) { + ((LetExpr) bind).setScoreBinding(true); + } } else if (clause.type == FLWORClause.ClauseType.GROUPBY) { if (clause.groupSpecs != null) { GroupSpec specs[] = new GroupSpec[clause.groupSpecs.size()]; @@ -2401,6 +2442,8 @@ throws PermissionDeniedException, EXistException, XPathException | step=nodeComp [path] | + step=ftContainsExpr [path] + | step=primaryExpr [path] | step=pathExpr [path] @@ -3513,6 +3556,585 @@ throws PermissionDeniedException, EXistException, XPathException ) ; +// === Full Text (W3C XQuery and XPath Full Text 3.0) === + +ftContainsExpr [PathExpr path] +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + PathExpr source = new PathExpr(context); + source.setASTNode(ftContainsExpr_AST_in); + FTSelection ftSel = null; + Expression ignoreExpr = null; +} +: + #( + ft:FT_CONTAINS + step=expr [source] + ftSel=ftSelectionExpr + ( ignoreExpr=ftIgnoreExpr )? 
+ { + FTContainsExpr ftContains = new FTContainsExpr(context); + ftContains.setASTNode(ft); + ftContains.setSearchSource(source); + ftContains.setFTSelection(ftSel); + ftContains.setIgnoreExpr(ignoreExpr); + path.add(ftContains); + step = ftContains; + } + ) + ; + +ftSelectionExpr +returns [FTSelection ftSel] +throws PermissionDeniedException, EXistException, XPathException +{ + ftSel = new FTSelection(context); + ftSel.setASTNode(ftSelectionExpr_AST_in); + Expression ftOr = null; + Expression posFilter = null; +} +: + #( + FT_SELECTION + ftOr=ftOrExpr + { ftSel.setFTOr(ftOr); } + ( posFilter=ftPosFilterExpr { ftSel.addPosFilter(posFilter); } )* + ) + ; + +ftOrExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + Expression operand = null; + FTOr ftOr = null; +} +: + #( + FT_OR + { + ftOr = new FTOr(context); + ftOr.setASTNode(ftOrExpr_AST_in); + } + ( operand=ftAndExpr { ftOr.addOperand(operand); } )+ + { step = ftOr; } + ) + | + step=ftAndExpr + ; + +ftAndExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + Expression operand = null; + FTAnd ftAnd = null; +} +: + #( + FT_AND + { + ftAnd = new FTAnd(context); + ftAnd.setASTNode(ftAndExpr_AST_in); + } + ( operand=ftMildNotExpr { ftAnd.addOperand(operand); } )+ + { step = ftAnd; } + ) + | + step=ftMildNotExpr + ; + +ftMildNotExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + Expression operand = null; + FTMildNot ftMildNot = null; +} +: + #( + FT_MILD_NOT + { + ftMildNot = new FTMildNot(context); + ftMildNot.setASTNode(ftMildNotExpr_AST_in); + } + ( operand=ftUnaryNotExpr { ftMildNot.addOperand(operand); } )+ + { step = ftMildNot; } + ) + | + step=ftUnaryNotExpr + ; + +ftUnaryNotExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + Expression operand = null; +} 
+: + #( + FT_UNARY_NOT + operand=ftPrimaryWithOptionsExpr + { + FTUnaryNot ftNot = new FTUnaryNot(context); + ftNot.setASTNode(ftUnaryNotExpr_AST_in); + ftNot.setOperand(operand); + step = ftNot; + } + ) + | + step=ftPrimaryWithOptionsExpr + ; + +ftPrimaryWithOptionsExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + Expression primary = null; + FTMatchOptions matchOpts = null; + Expression weightExpr = null; +} +: + #( + FT_PRIMARY_WITH_OPTIONS + primary=ftPrimaryExpr + ( matchOpts=ftMatchOptionsExpr )? + ( weightExpr=ftWeightExpr )? + { + FTPrimaryWithOptions pwo = new FTPrimaryWithOptions(context); + pwo.setASTNode(ftPrimaryWithOptionsExpr_AST_in); + pwo.setPrimary(primary); + pwo.setMatchOptions(matchOpts); + pwo.setWeight(weightExpr); + step = pwo; + } + ) + | + step=ftPrimaryExpr + ; + +ftPrimaryExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; +} +: + step=ftWordsExpr + | + step=ftSelectionExpr + | + step=ftExtensionSelectionExpr + ; + +ftWordsExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + PathExpr wordsValue = new PathExpr(context); + FTWords.AnyallMode mode = FTWords.AnyallMode.ANY; + FTTimes ftTimes = null; +} +: + #( + FT_WORDS + step=expr [wordsValue] + ( aa:FT_ANYALL_OPTION { mode = FTWords.AnyallMode.fromString(aa.getText()); } )? + ( ftTimes=ftTimesExpr )? + { + FTWords ftWords = new FTWords(context); + ftWords.setASTNode(ftWordsExpr_AST_in); + ftWords.setWordsValue(wordsValue); + ftWords.setMode(mode); + ftWords.setFTTimes(ftTimes); + step = ftWords; + } + ) + ; + +// XQFT 3.0 3.4.8: FTExtensionSelection -- pragmas wrapping an optional FTSelection. +// Pragmas are parsed but ignored (no FT-specific pragmas are recognized). +// If the body is empty, XQST0079 is raised. 
If the body is present, +// the pragmas are discarded and the inner FTSelection is returned. +// Namespace prefix validation is performed via context.getPragma(). +ftExtensionSelectionExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + FTSelection innerSel = null; +} +: + #( + FT_EXTENSION_SELECTION + // Validate pragma namespace prefixes (raises XPST0081 for undeclared prefixes). + // We don't recognize any FT-specific pragmas, so the result is always null. + ( + #( p:PRAGMA ( c:PRAGMA_END )? ) + { + // Validates namespace prefix; throws XPST0081 if prefix is undeclared + context.getPragma(p.getText(), c != null ? c.getText() : ""); + } + )* + ( innerSel=ftSelectionExpr )? + { + if (innerSel == null) { + // XQST0079: all pragmas unrecognized and no fallback body + throw new XPathException(ftExtensionSelectionExpr_AST_in, + ErrorCodes.XQST0079, + "No recognized pragmas in FTExtensionSelection and no fallback expression"); + } + step = innerSel; + } + ) + ; + +ftTimesExpr +returns [FTTimes step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + FTRange range = null; +} +: + #( + FT_TIMES + range=ftRangeExpr + { + step = new FTTimes(context); + step.setASTNode(ftTimesExpr_AST_in); + step.setRange(range); + } + ) + ; + +ftRangeExpr +returns [FTRange step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = new FTRange(context); + PathExpr e1 = new PathExpr(context); + PathExpr e2 = new PathExpr(context); + Expression tmp = null; +} +: + #( + r:FT_RANGE + { + String rangeMode = r.getText(); + switch (rangeMode) { + case "exactly": step.setMode(FTRange.RangeMode.EXACTLY); break; + case "at least": step.setMode(FTRange.RangeMode.AT_LEAST); break; + case "at most": step.setMode(FTRange.RangeMode.AT_MOST); break; + case "from": step.setMode(FTRange.RangeMode.FROM_TO); break; + } + } + tmp=expr [e1] { step.setExpr1(e1); } + ( tmp=expr [e2] { 
step.setExpr2(e2); } )? + ) + ; + +ftPosFilterExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; +} +: + o:FT_ORDER + { + FTOrder order = new FTOrder(context); + order.setASTNode(o); + step = order; + } + | + step=ftWindowExpr + | + step=ftDistanceExpr + | + s:FT_SCOPE + { + FTScope scope = new FTScope(context); + scope.setASTNode(s); + String scopeText = s.getText(); + if (scopeText.startsWith("same")) { + scope.setScopeType(FTScope.ScopeType.SAME); + } else { + scope.setScopeType(FTScope.ScopeType.DIFFERENT); + } + if (scopeText.endsWith("sentence")) { + scope.setBigUnit(FTScope.BigUnit.SENTENCE); + } else { + scope.setBigUnit(FTScope.BigUnit.PARAGRAPH); + } + step = scope; + } + | + c:FT_CONTENT + { + FTContent content = new FTContent(context); + content.setASTNode(c); + switch (c.getText()) { + case "at start": content.setContentType(FTContent.ContentType.AT_START); break; + case "at end": content.setContentType(FTContent.ContentType.AT_END); break; + case "entire content": content.setContentType(FTContent.ContentType.ENTIRE_CONTENT); break; + } + step = content; + } + ; + +ftWindowExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + PathExpr winExpr = new PathExpr(context); + Expression tmp = null; +} +: + #( + w:FT_WINDOW + tmp=expr [winExpr] + u1:. // ftUnit token (words|sentences|paragraphs) + { + FTWindow win = new FTWindow(context); + win.setASTNode(w); + win.setWindowExpr(winExpr); + win.setUnit(FTUnit.fromString(u1.getText())); + step = win; + } + ) + ; + +ftDistanceExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + FTRange range = null; +} +: + #( + d:FT_DISTANCE + range=ftRangeExpr + u2:. 
// ftUnit token (words|sentences|paragraphs) + { + FTDistance dist = new FTDistance(context); + dist.setASTNode(d); + dist.setRange(range); + dist.setUnit(FTUnit.fromString(u2.getText())); + step = dist; + } + ) + ; + +ftMatchOptionsExpr +returns [FTMatchOptions opts] +throws PermissionDeniedException, EXistException, XPathException +{ + opts = new FTMatchOptions(); +} +: + ( + co:FT_CASE_OPTION + { + switch (co.getText()) { + case "sensitive": opts.setCaseMode(FTMatchOptions.CaseMode.SENSITIVE); break; + case "insensitive": opts.setCaseMode(FTMatchOptions.CaseMode.INSENSITIVE); break; + case "lowercase": opts.setCaseMode(FTMatchOptions.CaseMode.LOWERCASE); break; + case "uppercase": opts.setCaseMode(FTMatchOptions.CaseMode.UPPERCASE); break; + } + } + | + di:FT_DIACRITICS_OPTION + { + switch (di.getText()) { + case "sensitive": opts.setDiacriticsMode(FTMatchOptions.DiacriticsMode.SENSITIVE); break; + case "insensitive": opts.setDiacriticsMode(FTMatchOptions.DiacriticsMode.INSENSITIVE); break; + } + } + | + st:FT_STEM_OPTION + { opts.setStemming("stemming".equals(st.getText())); } + | + #( FT_LANGUAGE_OPTION lang:STRING_LITERAL { opts.setLanguage(lang.getText()); } ) + | + wc:FT_WILDCARD_OPTION + { opts.setWildcards("wildcards".equals(wc.getText())); } + | + #( thesOpt:FT_THESAURUS_OPTION + { + final String thesText = thesOpt.getText(); + if ("no thesaurus".equals(thesText)) { + opts.setNoThesaurus(true); + } else { + opts.setNoThesaurus(false); + AST thesChild = thesOpt.getFirstChild(); + while (thesChild != null) { + if (thesChild.getType() == FT_THESAURUS_ID) { + final String idText = thesChild.getText(); + if ("default".equals(idText)) { + opts.getThesaurusIDs().add( + new FTMatchOptions.ThesaurusID(null, null, 0, Integer.MAX_VALUE)); + } else { + // "at" -- children: STRING_LITERAL (uri), optional STRING_LITERAL (rel), optional FT_RANGE + String uri = null; + String relationship = null; + int minLevels = 0; + int maxLevels = Integer.MAX_VALUE; + AST idChild = 
thesChild.getFirstChild(); + if (idChild != null && idChild.getType() == STRING_LITERAL) { + uri = idChild.getText(); + idChild = idChild.getNextSibling(); + } + if (idChild != null && idChild.getType() == STRING_LITERAL) { + relationship = idChild.getText(); + idChild = idChild.getNextSibling(); + } + if (idChild != null && idChild.getType() == FT_RANGE) { + final String rangeType = idChild.getText(); + AST rangeChild = idChild.getFirstChild(); + if (rangeChild != null) { + // The level bounds are read from the AST node text; a value that does not fit an int + // must surface as an XQuery static error, not escape the tree walker as a raw NumberFormatException. + final int val1; + try { + val1 = Integer.parseInt(rangeChild.getText()); + } catch (final NumberFormatException nfe) { + throw new XPathException(thesOpt.getLine(), thesOpt.getColumn(), + ErrorCodes.XPST0003, + String.format("Invalid thesaurus level '%s': expected an integer literal", rangeChild.getText())); + } + switch (rangeType) { + case "exactly": + minLevels = val1; + maxLevels = val1; + break; + case "at least": + minLevels = val1; + break; + case "at most": + maxLevels = val1; + break; + case "from": + minLevels = val1; + AST rangeChild2 = rangeChild.getNextSibling(); + if (rangeChild2 != null) { + // Upper bound of "from N to M": same guarded parse as the lower bound + try { + maxLevels = Integer.parseInt(rangeChild2.getText()); + } catch (final NumberFormatException nfe) { + throw new XPathException(thesOpt.getLine(), thesOpt.getColumn(), + ErrorCodes.XPST0003, + String.format("Invalid thesaurus level '%s': expected an integer literal", rangeChild2.getText())); + } + } + break; + } + } + } + if (uri != null) { + opts.getThesaurusIDs().add( + new FTMatchOptions.ThesaurusID(uri, relationship, minLevels, maxLevels)); + opts.getThesaurusURIs().add(uri); + } + } + } + thesChild = thesChild.getNextSibling(); + } + } + } + ) + | + #( sw:FT_STOP_WORD_OPTION + { + final String swText = sw.getText(); + if ("no stop words".equals(swText)) { + opts.setNoStopWords(true); + } else { + if ("stop words default".equals(swText)) { + opts.setUseDefaultStopWords(true); + } + // Walk children to extract stop words (union and except) + AST swChild = sw.getFirstChild(); + while (swChild != null) { + if (swChild.getType() == FT_STOP_WORDS_EXCEPT) { + // Except wrapper -- inner child is FT_STOP_WORDS + AST exceptInner = swChild.getFirstChild(); + while (exceptInner != null) { + if (exceptInner.getType() == FT_STOP_WORDS) { + final String swMode = exceptInner.getText(); + AST swWordNode = exceptInner.getFirstChild(); + while (swWordNode != null) { + if ("at".equals(swMode)) { + opts.getExceptStopWordURIs().add(swWordNode.getText()); + } else { +
opts.getExceptInlineStopWords().add(swWordNode.getText()); + } + swWordNode = swWordNode.getNextSibling(); + } + } + exceptInner = exceptInner.getNextSibling(); + } + } else if (swChild.getType() == FT_STOP_WORDS) { + // Union stop words (primary or union-added) + final String swMode = swChild.getText(); + AST swWordNode = swChild.getFirstChild(); + while (swWordNode != null) { + if ("at".equals(swMode)) { + opts.getStopWordURIs().add(swWordNode.getText()); + } else { + opts.getInlineStopWords().add(swWordNode.getText()); + } + swWordNode = swWordNode.getNextSibling(); + } + } + swChild = swChild.getNextSibling(); + } + } + } + ( . )* + ) + | + #( eo:FT_EXTENSION_OPTION ( . )* + { + // XQFT 3.0 §4.10: validate namespace prefix for extension option. + // Raises XPST0081 if the prefix is not declared. + final String extOptName = eo.getText(); + try { + QName.parse(staticContext, extOptName); + } catch (final QName.IllegalQNameException e) { + throw new XPathException(eo.getLine(), eo.getColumn(), + ErrorCodes.XPST0081, + "No namespace defined for prefix in extension option: " + extOptName); + } + } + ) + )+ + ; + +ftWeightExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + PathExpr weightPath = new PathExpr(context); +} +: + #( + FT_WEIGHT + step=expr [weightPath] + { step = weightPath; } + ) + ; + +ftIgnoreExpr +returns [Expression step] +throws PermissionDeniedException, EXistException, XPathException +{ + step = null; + PathExpr ignorePath = new PathExpr(context); +} +: + #( + FT_IGNORE_OPTION + step=expr [ignorePath] + { step = ignorePath; } + ) + ; + constructor [PathExpr path] returns [Expression step] throws PermissionDeniedException, EXistException, XPathException diff --git a/exist-core/src/main/java/org/exist/xquery/ErrorCodes.java b/exist-core/src/main/java/org/exist/xquery/ErrorCodes.java index 23226a155f2..71b72327bcf 100644 --- a/exist-core/src/main/java/org/exist/xquery/ErrorCodes.java 
+++ b/exist-core/src/main/java/org/exist/xquery/ErrorCodes.java @@ -243,6 +243,19 @@ public class ErrorCodes { public static final ErrorCode XTSE0165 = new W3CErrorCode("XTSE0165","It is a static error if the processor is not able to retrieve the resource identified by the URI reference [ in the href attribute of xsl:include or xsl:import] , or if the resource that is retrieved does not contain a stylesheet module conforming to this specification."); + // W3C XQuery and XPath Full Text 3.0 error codes + public static final ErrorCode FTST0001 = new W3CErrorCode("FTST0001", "It is a static error if an operand of mild not (not in) contains ftnot or occurs."); + public static final ErrorCode FTST0003 = new W3CErrorCode("FTST0003", "It is a static error if a tokenizer for the language specified by the language option is not available."); + public static final ErrorCode FTST0004 = new W3CErrorCode("FTST0004", "It is a static error if sentence/paragraph boundaries are required but not supported by the tokenizer."); + public static final ErrorCode FTST0006 = new W3CErrorCode("FTST0006", "It is a static error if a stop word list cannot be found."); + public static final ErrorCode FTST0008 = new W3CErrorCode("FTST0008", "It is a static error if a stop word list is not in the correct format."); + public static final ErrorCode FTST0009 = new W3CErrorCode("FTST0009", "It is a static error if the specified language is not supported."); + public static final ErrorCode FTDY0016 = new W3CErrorCode("FTDY0016", "It is a dynamic error if a weight value is not within the required range."); + public static final ErrorCode FTDY0017 = new W3CErrorCode("FTDY0017", "It is a dynamic error if the right-hand match of mild not has any include-matches matching tokens not matched by include-matches of the left-hand match."); + public static final ErrorCode FTST0013 = new W3CErrorCode("FTST0013", "It is a static error if, in an implementation which does not support the Stop Word Languages feature, 
a stop word option includes a language specification."); + public static final ErrorCode FTST0018 = new W3CErrorCode("FTST0018", "It is a static error if a thesaurus is not available."); + public static final ErrorCode FTST0019 = new W3CErrorCode("FTST0019", "It is a static error if match options in a single contains text expression conflict with each other."); + /* eXist specific XQuery and XPath errors * * Codes have the format [EX][XQ|XP][DY|SE|ST][nnnn] diff --git a/exist-core/src/main/java/org/exist/xquery/ForExpr.java b/exist-core/src/main/java/org/exist/xquery/ForExpr.java index 1a5eab2f4dd..096ad48b241 100644 --- a/exist-core/src/main/java/org/exist/xquery/ForExpr.java +++ b/exist-core/src/main/java/org/exist/xquery/ForExpr.java @@ -37,6 +37,7 @@ public class ForExpr extends BindingExpression { private QName positionalVariable = null; + private QName scoreVariable = null; private boolean allowEmpty = false; private boolean isOuterFor = true; @@ -60,6 +61,17 @@ public void setPositionalVariable(final QName variable) { positionalVariable = variable; } + /** + * XQFT 3.0 §2.3: A "for" expression may have an optional score variable + * whose QName can be set via this method. The score variable is bound to + * an xs:double value representing the relevance score for each item. 
+ * + * @param variable the name of the score variable + */ + public void setScoreVariable(final QName variable) { + scoreVariable = variable; + } + /* (non-Javadoc) * @see org.exist.xquery.Expression#analyze(org.exist.xquery.Expression) */ @@ -83,6 +95,13 @@ public void analyze(AnalyzeContextInfo contextInfo) throws XPathException { posVar.setStaticType(Type.INTEGER); context.declareVariableBinding(posVar); } + // Declare score variable (XQFT 3.0 §2.3) + if (scoreVariable != null) { + final LocalVariable scoreVar = new LocalVariable(scoreVariable); + scoreVar.setSequenceType(new SequenceType(Type.DOUBLE, Cardinality.EXACTLY_ONE)); + scoreVar.setStaticType(Type.DOUBLE); + context.declareVariableBinding(scoreVar); + } final AnalyzeContextInfo newContextInfo = new AnalyzeContextInfo(contextInfo); newContextInfo.addFlag(SINGLE_STEP_EXECUTION); @@ -135,6 +154,15 @@ public Sequence eval(Sequence contextSequence, Item contextItem) at.setSequenceType(POSITIONAL_VAR_TYPE); context.declareVariableBinding(at); } + // Declare score variable (XQFT 3.0 §2.3) + LocalVariable score = null; + if (scoreVariable != null) { + score = new LocalVariable(scoreVariable); + score.setSequenceType(new SequenceType(Type.DOUBLE, Cardinality.EXACTLY_ONE)); + context.declareVariableBinding(score); + // Naive implementation: always bind score to 1.0 + score.setValue(new DoubleValue(this, 1.0)); + } // Assign the whole input sequence to the bound variable. // This is required if we process the "where" or "order by" clause // in one step. 
@@ -238,6 +266,8 @@ private boolean callPostEval() { case ORDERBY: case GROUPBY: return true; + default: + break; } prev = prev.getPreviousClause(); } @@ -264,6 +294,8 @@ public void dump(ExpressionDumper dumper) { } if (positionalVariable != null) {dumper.display(" at ").display(positionalVariable);} + if (scoreVariable != null) + {dumper.display(" score ").display(scoreVariable);} dumper.display(" in "); inputSequence.dump(dumper); dumper.endIndent().nl(); @@ -290,6 +322,9 @@ public String toString() { if (positionalVariable != null) { result.append(" at ").append(positionalVariable); } + if (scoreVariable != null) { + result.append(" score ").append(scoreVariable); + } result.append(" in "); result.append(inputSequence.toString()); result.append(" "); @@ -313,6 +348,9 @@ public Set getTupleStreamVariables() { if (positionalVariable != null) { variables.add(positionalVariable); } + if (scoreVariable != null) { + variables.add(scoreVariable); + } final QName variable = getVariable(); if (variable != null) { diff --git a/exist-core/src/main/java/org/exist/xquery/LetExpr.java b/exist-core/src/main/java/org/exist/xquery/LetExpr.java index 278e7d18295..ff07ec04b86 100644 --- a/exist-core/src/main/java/org/exist/xquery/LetExpr.java +++ b/exist-core/src/main/java/org/exist/xquery/LetExpr.java @@ -37,10 +37,21 @@ */ public class LetExpr extends BindingExpression { + private boolean scoreBinding = false; + public LetExpr(XQueryContext context) { super(context); } + /** + * XQFT 3.0 §2.3: Mark this let binding as a score variable binding. + * When true, the variable is bound to the score (xs:double in [0,1]) + * of the input expression rather than the expression's value. 
+ */ + public void setScoreBinding(final boolean scoreBinding) { + this.scoreBinding = scoreBinding; + } + @Override public ClauseType getType() { return ClauseType.LET; @@ -102,9 +113,15 @@ public Sequence eval(Sequence contextSequence, Item contextItem) var = createVariable(varName); var.setSequenceType(sequenceType); context.declareVariableBinding(var); - var.setValue(in); + if (scoreBinding) { + // XQFT 3.0 §2.3: score binding — bind variable to the score + // of the expression. Naive implementation: 1.0 if non-empty, 0.0 if empty. + var.setValue(new DoubleValue(this, in.isEmpty() ? 0.0 : 1.0)); + } else { + var.setValue(in); + } if (sequenceType == null) - {var.checkType();} //Just because it makes conversions ! + {var.checkType();} //Just because it makes conversions ! var.setContextDocs(inputSequence.getContextDocSet()); registerUpdateListener(in); diff --git a/exist-core/src/main/java/org/exist/xquery/StaticXQueryException.java b/exist-core/src/main/java/org/exist/xquery/StaticXQueryException.java index 682be4dfff1..3d9ae6e795b 100644 --- a/exist-core/src/main/java/org/exist/xquery/StaticXQueryException.java +++ b/exist-core/src/main/java/org/exist/xquery/StaticXQueryException.java @@ -21,6 +21,8 @@ */ package org.exist.xquery; +import org.exist.xquery.ErrorCodes.ErrorCode; + public class StaticXQueryException extends XPathException { private static final long serialVersionUID = -8229758099980343418L; @@ -53,7 +55,15 @@ public StaticXQueryException(final Expression expression, String message, Throwa super(expression, message, cause); } - //TODO add in ErrorCode and ErrorVal + public StaticXQueryException(int line, int column, ErrorCode errorCode, String message) { + super(line, column, errorCode, message); + } + + public StaticXQueryException(int line, int column, ErrorCode errorCode, String message, Throwable cause) { + super(line, column, errorCode, message); + initCause(cause); + } + public StaticXQueryException(int line, int column, String message, 
Throwable cause) { super(line, column, message, cause); } diff --git a/exist-core/src/main/java/org/exist/xquery/XQuery.java b/exist-core/src/main/java/org/exist/xquery/XQuery.java index 5eba728708b..e52b5f70767 100644 --- a/exist-core/src/main/java/org/exist/xquery/XQuery.java +++ b/exist-core/src/main/java/org/exist/xquery/XQuery.java @@ -288,7 +288,7 @@ private CompiledXQuery compile(final XQueryContext context, final Reader reader, if (msg.endsWith(", found 'null'")) { msg = msg.substring(0, msg.length() - ", found 'null'".length()); } - throw new StaticXQueryException(e.getLine(), e.getColumn(), msg); + throw new StaticXQueryException(e.getLine(), e.getColumn(), ErrorCodes.XPST0003, msg); } catch(final TokenStreamException e) { final String es = e.toString(); if(es.matches("^line \\d+:\\d+:.+")) { @@ -298,7 +298,7 @@ private CompiledXQuery compile(final XQueryContext context, final Reader reader, final int line = Integer.parseInt(es.substring(5, es.indexOf(':'))); final String tmpColumn = es.substring(es.indexOf(':') + 1); final int column = Integer.parseInt(tmpColumn.substring(0, tmpColumn.indexOf(':'))); - throw new StaticXQueryException(line, column, e.getMessage(), e); + throw new StaticXQueryException(line, column, ErrorCodes.XPST0003, e.getMessage(), e); } else { if (LOG.isDebugEnabled()) { LOG.debug("Error compiling query: {}", e.getMessage(), e); diff --git a/exist-core/src/main/java/org/exist/xquery/XQueryContext.java b/exist-core/src/main/java/org/exist/xquery/XQueryContext.java index b3721c34179..5928df7e4c3 100644 --- a/exist-core/src/main/java/org/exist/xquery/XQueryContext.java +++ b/exist-core/src/main/java/org/exist/xquery/XQueryContext.java @@ -30,6 +30,8 @@ import java.net.URISyntaxException; import java.nio.charset.Charset; import java.nio.file.Path; + +import org.exist.xquery.ft.FTMatchOptions; import java.nio.file.Paths; import java.util.*; import java.util.concurrent.CopyOnWriteArrayList; @@ -307,6 +309,18 @@ public class XQueryContext 
implements BinaryValueManager, Context { */ private String defaultCollation = Collations.UNICODE_CODEPOINT_COLLATION_URI; + /** + * XQFT 3.0: default full-text match options declared via "declare ft-option". + */ + private FTMatchOptions defaultFTMatchOptions; + + /** + * XQFT 3.0: thesaurus URI-to-file mapping. + * Maps thesaurus URIs (e.g., "http://bstore1.example.com/UsabilityThesaurus.xml") + * to local file paths. + */ + private final Map thesaurusRegistry = new HashMap<>(); + /** * The default language */ @@ -1090,6 +1104,22 @@ public String getDefaultCollation() { return defaultCollation; } + public void setDefaultFTMatchOptions(final FTMatchOptions opts) { + this.defaultFTMatchOptions = opts; + } + + public FTMatchOptions getDefaultFTMatchOptions() { + return defaultFTMatchOptions; + } + + public void registerThesaurus(final String uri, final Path file) { + thesaurusRegistry.put(uri, file); + } + + public Path resolveThesaurusURI(final String uri) { + return thesaurusRegistry.get(uri); + } + @Override public Collator getCollator(String uri) throws XPathException { return getCollator(uri, ErrorCodes.XQST0076); diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTAbstractExpr.java b/exist-core/src/main/java/org/exist/xquery/ft/FTAbstractExpr.java new file mode 100644 index 00000000000..27407a5d3e7 --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTAbstractExpr.java @@ -0,0 +1,53 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AbstractExpression; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.value.Item; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.Type; + +/** + * Abstract base class for Full Text expression nodes. + * + * FT expression nodes participate in the expression tree for analysis + * and serialization but are not independently evaluable — evaluation + * is driven by {@link FTContainsExpr}. 
+ */ +public abstract class FTAbstractExpr extends AbstractExpression { + + protected FTAbstractExpr(final XQueryContext context) { + super(context); + } + + @Override + public Sequence eval(final Sequence contextSequence, final Item contextItem) throws XPathException { + throw new XPathException(this, getClass().getSimpleName() + " cannot be evaluated directly"); + } + + @Override + public int returnsType() { + return Type.ITEM; + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTAnd.java b/exist-core/src/main/java/org/exist/xquery/ft/FTAnd.java new file mode 100644 index 00000000000..c7bc533521d --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTAnd.java @@ -0,0 +1,92 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.Expression; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * W3C XQFT 3.0 — FTAnd. + * + *
FTAnd ::= FTMildNot ( "ftand" FTMildNot )*
+ */ +public class FTAnd extends FTAbstractExpr { + + private final List operands = new ArrayList<>(); + + public FTAnd(final XQueryContext context) { + super(context); + } + + public void addOperand(final Expression operand) { + operands.add(operand); + } + + public List getOperands() { + return Collections.unmodifiableList(operands); + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + contextInfo.setParent(this); + for (final Expression operand : operands) { + operand.analyze(contextInfo); + } + } + + @Override + public void dump(final ExpressionDumper dumper) { + for (int i = 0; i < operands.size(); i++) { + if (i > 0) { + dumper.display(" ftand "); + } + operands.get(i).dump(dumper); + } + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + for (int i = 0; i < operands.size(); i++) { + if (i > 0) { + sb.append(" ftand "); + } + sb.append(operands.get(i).toString()); + } + return sb.toString(); + } + + @Override + public void resetState(final boolean postOptimization) { + super.resetState(postOptimization); + for (final Expression operand : operands) { + operand.resetState(postOptimization); + } + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTContainsExpr.java b/exist-core/src/main/java/org/exist/xquery/ft/FTContainsExpr.java new file mode 100644 index 00000000000..8516a8f8730 --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTContainsExpr.java @@ -0,0 +1,320 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AbstractExpression; +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.Dependency; +import org.exist.xquery.ErrorCodes; +import org.exist.xquery.Expression; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; +import org.exist.xquery.value.BooleanValue; +import org.exist.xquery.value.Item; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.Type; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * W3C XQuery and XPath Full Text 3.0 — FTContainsExpr. + * + *
FTContainsExpr ::= StringConcatExpr ( "contains" "text" FTSelection FTIgnoreOption? )?
+ * + * Evaluates whether the string value of the left-hand expression, after + * tokenization, matches the FTSelection. Returns xs:boolean. + * + * @see XQFT 3.0 §2.1 + */ +public class FTContainsExpr extends AbstractExpression { + + private Expression source; + private FTSelection ftSelection; + private Expression ignoreExpr; + + // Cached URI maps — captured during analyze() to avoid reading from + // context attributes during eval() (context may be reset concurrently) + private Map cachedStopWordURIMap; + private Map cachedThesaurusURIMap; + + public FTContainsExpr(final XQueryContext context) { + super(context); + } + + public void setSearchSource(final Expression source) { + this.source = source; + } + + public Expression getSearchSource() { + return source; + } + + public void setFTSelection(final FTSelection ftSelection) { + this.ftSelection = ftSelection; + } + + public FTSelection getFTSelection() { + return ftSelection; + } + + public void setIgnoreExpr(final Expression ignoreExpr) { + this.ignoreExpr = ignoreExpr; + } + + public Expression getIgnoreExpr() { + return ignoreExpr; + } + + @Override + public int getDependencies() { + // The source expression (left-hand side of "contains text") is always + // evaluated against the context item, so we must report CONTEXT_ITEM + // dependency. Without this, Predicate.evalPredicate may pass null + // as the context sequence, causing XPDY0002 errors on step expressions. + return source.getDependencies() | Dependency.CONTEXT_ITEM; + } + + @Override + @SuppressWarnings("unchecked") + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + contextInfo.setParent(this); + source.analyze(contextInfo); + ftSelection.analyze(contextInfo); + if (ignoreExpr != null) { + ignoreExpr.analyze(contextInfo); + } + // Cache URI maps from context attributes at analyze time. 
+ // Reading them during eval() is unreliable because context.reset() + // (called between test executions in the XQTS runner) clears attributes. + cachedStopWordURIMap = (Map) context.getAttribute("ft.stopWordURIMap"); + cachedThesaurusURIMap = (Map) context.getAttribute("ft.thesaurusURIMap"); + } + + @Override + @SuppressWarnings("PMD.NPathComplexity") + public Sequence eval(final Sequence contextSequence, final Item contextItem) throws XPathException { + Sequence effectiveContext = contextSequence; + if (contextItem != null) { + effectiveContext = contextItem.toSequence(); + } + + // Evaluate source expression to get the search context + final Sequence sourceSeq = source.eval(effectiveContext, null); + + // Per XQFT 3.0 §2.1: if the source evaluates to an empty sequence, + // there is no text to search — return false immediately. + if (sourceSeq.isEmpty()) { + return BooleanValue.FALSE; + } + + // Collect ignored nodes if FTIgnoreOption is present + Set ignoredNodes = null; + if (ignoreExpr != null) { + final Sequence ignoredSeq = ignoreExpr.eval(effectiveContext, null); + if (!ignoredSeq.isEmpty()) { + // XQFT 3.0 §3.7: FTIgnoreOption must evaluate to a node sequence. + // Non-node values raise XPTY0004. + ignoredNodes = new HashSet<>(); + for (int i = 0; i < ignoredSeq.getItemCount(); i++) { + final Item item = ignoredSeq.itemAt(i); + if (!Type.subTypeOf(item.getType(), Type.NODE)) { + throw new XPathException(this, ErrorCodes.XPTY0004, + "FTIgnoreOption 'without content' expression must evaluate to nodes, got: " + + Type.getTypeName(item.getType())); + } + if (item instanceof Node) { + ignoredNodes.add((Node) item); + } + } + } + } + + // Per XQFT 3.0 §2.1: if the source is a sequence of items, + // evaluate each item independently and return true if ANY matches. 
+ for (int i = 0; i < sourceSeq.getItemCount(); i++) { + final Item sourceItem = sourceSeq.itemAt(i); + String sourceText; + + // Apply FTIgnoreOption: extract text from DOM while skipping ignored nodes + List elementBoundaries = null; + if (ignoredNodes != null && !ignoredNodes.isEmpty() && sourceItem instanceof Node) { + sourceText = extractTextWithoutIgnored((Node) sourceItem, ignoredNodes); + } else if (sourceItem instanceof Node) { + // Collect element boundary offsets within the string value for + // sentence/paragraph detection. The string value itself is unchanged + // (getStringValue concatenates text nodes), but we record where + // element boundaries occur so FTEvaluator can treat them as + // sentence/paragraph breaks for scope/distance unit detection. + elementBoundaries = new ArrayList<>(); + collectElementBoundaries((Node) sourceItem, elementBoundaries, new int[]{0}); + sourceText = sourceItem.getStringValue(); + } else { + sourceText = sourceItem.getStringValue(); + } + + // Use cached URI maps (captured during analyze), falling back to context attributes. + // The cache avoids the race condition where context.reset() clears attributes + // between analyze and eval in concurrent test runner scenarios. + @SuppressWarnings("unchecked") + final Map stopWordURIMap = cachedStopWordURIMap != null + ? cachedStopWordURIMap + : (Map) context.getAttribute("ft.stopWordURIMap"); + @SuppressWarnings("unchecked") + final Map thesaurusURIMap = cachedThesaurusURIMap != null + ? 
cachedThesaurusURIMap + : (Map) context.getAttribute("ft.thesaurusURIMap"); + final FTEvaluator evaluator = new FTEvaluator(sourceText, stopWordURIMap, thesaurusURIMap, + elementBoundaries); + // Provide XQuery context for dynamic expressions in positional filters + // (e.g., window size expressions that reference the predicate context item). + // Use effectiveContext -- the same context the source and ignore expressions were + // evaluated against -- so that a non-null contextItem narrows the context to that item + // instead of exposing the whole context sequence. + evaluator.setContextSequence(effectiveContext); + // Pass default FT match options from static context (declare ft-option) + final FTMatchOptions defaultOpts = context.getDefaultFTMatchOptions(); + if (evaluator.evaluate(ftSelection, defaultOpts)) { + return BooleanValue.TRUE; + } + } + + return BooleanValue.FALSE; + } + + @Override + public int returnsType() { + return Type.BOOLEAN; + } + + @Override + public void dump(final ExpressionDumper dumper) { + source.dump(dumper); + dumper.display(" contains text "); + ftSelection.dump(dumper); + if (ignoreExpr != null) { + dumper.display(" without content "); + ignoreExpr.dump(dumper); + } + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + sb.append(source.toString()); + sb.append(" contains text "); + sb.append(ftSelection.toString()); + if (ignoreExpr != null) { + sb.append(" without content "); + sb.append(ignoreExpr.toString()); + } + return sb.toString(); + } + + /** + * Extract text content from a DOM node, skipping any descendant nodes + * that are in the ignored set. This implements XQFT 3.0 §3.7 FTIgnoreOption + * at the DOM level rather than by string replacement. + */ + private static String extractTextWithoutIgnored(final Node node, final Set ignoredNodes) { + final StringBuilder sb = new StringBuilder(); + collectText(node, ignoredNodes, sb); + return sb.toString(); + } + + /** + * Collect character offsets within the string value where element boundaries occur.
+ * These offsets are used by FTEvaluator for sentence/paragraph boundary detection + * without modifying the actual text (which would change tokenization and matching). + * + * @param node the DOM node to walk + * @param boundaries list to collect boundary offsets into + * @param offset mutable offset tracker (single-element array) + */ + private static void collectElementBoundaries(final Node node, + final List boundaries, + final int[] offset) { + if (node.getNodeType() == Node.TEXT_NODE || node.getNodeType() == Node.CDATA_SECTION_NODE) { + offset[0] += node.getNodeValue().length(); + } else if (node.getNodeType() == Node.ELEMENT_NODE || node.getNodeType() == Node.DOCUMENT_NODE) { + // Document nodes are walked like elements so that a document-node search source + // also yields boundaries for the elements it contains. + final NodeList children = node.getChildNodes(); + if (children != null) { + for (int i = 0; i < children.getLength(); i++) { + final Node child = children.item(i); + if (child.getNodeType() == Node.ELEMENT_NODE) { + // Record the current offset as an element boundary + boundaries.add(offset[0]); + } + collectElementBoundaries(child, boundaries, offset); + } + } + } + } + + /** + * Check if a node is in the ignored set using equals() with linear scan. + * HashSet.contains() may fail for eXist's DOM nodes where equals() is + * overridden (comparing document + nodeNumber) but hashCode() isn't, + * causing hash bucket mismatch.
+ */ + private static boolean isIgnored(final Node node, final Set ignoredNodes) { + for (final Node ignored : ignoredNodes) { + if (node.equals(ignored)) { + return true; + } + } + return false; + } + + private static void collectText(final Node node, final Set ignoredNodes, + final StringBuilder sb) { + if (isIgnored(node, ignoredNodes)) { + // Replace ignored node's contribution with a space to maintain token boundaries + sb.append(' '); + return; + } + if (node.getNodeType() == Node.TEXT_NODE || node.getNodeType() == Node.CDATA_SECTION_NODE) { + sb.append(node.getNodeValue()); + } else { + final NodeList children = node.getChildNodes(); + if (children != null) { + for (int i = 0; i < children.getLength(); i++) { + collectText(children.item(i), ignoredNodes, sb); + } + } + } + } + + @Override + public void resetState(final boolean postOptimization) { + super.resetState(postOptimization); + source.resetState(postOptimization); + ftSelection.resetState(postOptimization); + if (ignoreExpr != null) { + ignoreExpr.resetState(postOptimization); + } + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTContent.java b/exist-core/src/main/java/org/exist/xquery/ft/FTContent.java new file mode 100644 index 00000000000..ae9cb9c8ca3 --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTContent.java @@ -0,0 +1,76 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; + +/** + * W3C XQFT 3.0 — FTContent positional filter. + * + *
FTContent ::= ("at" "start") | ("at" "end") | ("entire" "content")
+ */ +public class FTContent extends FTAbstractExpr { + + public enum ContentType { AT_START, AT_END, ENTIRE_CONTENT } + + private ContentType contentType; + + public FTContent(final XQueryContext context) { + super(context); + } + + public void setContentType(final ContentType contentType) { + this.contentType = contentType; + } + + public ContentType getContentType() { + return contentType; + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + // no children to analyze + } + + @Override + public void dump(final ExpressionDumper dumper) { + switch (contentType) { + case AT_START: dumper.display("at start"); break; + case AT_END: dumper.display("at end"); break; + case ENTIRE_CONTENT: dumper.display("entire content"); break; + default: break; + } + } + + @Override + public String toString() { + switch (contentType) { + case AT_START: return "at start"; + case AT_END: return "at end"; + case ENTIRE_CONTENT: return "entire content"; + default: return ""; + } + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTDistance.java b/exist-core/src/main/java/org/exist/xquery/ft/FTDistance.java new file mode 100644 index 00000000000..0e0597fc492 --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTDistance.java @@ -0,0 +1,82 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; + +/** + * W3C XQFT 3.0 — FTDistance positional filter. + * + *
FTDistance ::= "distance" FTRange FTUnit
+ */ +public class FTDistance extends FTAbstractExpr { + + private FTRange range; + private FTUnit unit; + + public FTDistance(final XQueryContext context) { + super(context); + } + + public void setRange(final FTRange range) { + this.range = range; + } + + public FTRange getRange() { + return range; + } + + public void setUnit(final FTUnit unit) { + this.unit = unit; + } + + public FTUnit getUnit() { + return unit; + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + contextInfo.setParent(this); + range.analyze(contextInfo); + } + + @Override + public void dump(final ExpressionDumper dumper) { + dumper.display("distance "); + range.dump(dumper); + dumper.display(' ').display(unit.toString()); + } + + @Override + public String toString() { + return "distance " + range.toString() + " " + unit.toString(); + } + + @Override + public void resetState(final boolean postOptimization) { + super.resetState(postOptimization); + range.resetState(postOptimization); + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTEvaluator.java b/exist-core/src/main/java/org/exist/xquery/ft/FTEvaluator.java new file mode 100644 index 00000000000..11ef237514d --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTEvaluator.java @@ -0,0 +1,1863 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.ErrorCodes; +import org.exist.xquery.Expression; +import org.exist.xquery.XPathException; +import org.exist.xquery.value.Item; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.Type; + +import java.io.BufferedReader; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.net.URI; +import java.net.URISyntaxException; +import java.text.BreakIterator; +import java.text.Normalizer; +import java.util.*; +import java.util.regex.Pattern; + +/** + * Sequential (in-memory) evaluator for W3C XQFT 3.0 expressions. + * + * Implements the AllMatches model from the spec in simplified form: + * each FT expression node returns a list of {@link Match} objects, + * where each Match records which token positions were matched and whether + * they are inclusions or exclusions (for mild-not / not-in). + * + * @see XQFT 3.0 §2 + */ +@SuppressWarnings("PMD.NPathComplexity") +public class FTEvaluator { + + // --- Instance fields (declared before inner classes per Java convention) --- + + private final List tokens; + /** Tokens with trailing punctuation preserved — used for wildcard matching. */ + private final List rawTokens; + private final int totalTokens; + /** Maps each token index to its sentence number (0-based). */ + private final int[] sentenceOf; + /** Maps each token index to its paragraph number (0-based). */ + private final int[] paragraphOf; + + /** + * Maps stop word URIs (as they appear in XQuery source) to local file paths. + * Used by the XQFTTS test runner to map test URIs like + * "http://bstore1.example.com/StopWordList.xml" to local stop word files. 
+ * In production use, stop word URIs would typically be file:// paths + * or relative paths resolved against the static context base URI. + */ + private Map stopWordURIMap = Collections.emptyMap(); + + /** + * Maps thesaurus URIs to local file paths. + */ + private Map thesaurusURIMap = Collections.emptyMap(); + + /** + * Cache of loaded thesauri (URI -> FTThesaurus). + */ + private final Map thesaurusCache = new HashMap<>(); + + /** + * Context sequence for evaluating dynamic expressions inside FT positional + * filters (e.g., window size expressions like {@code count(content/part/chapter) * 4}). + * Set from FTContainsExpr when the contains-text predicate is evaluated in context. + */ + private Sequence contextSequence; + + /** + * Current case mode for the FTWords being evaluated. Set in evaluateFTWords() + * and checked in wordMatches() for LOWERCASE/UPPERCASE token normalization. + */ + private FTMatchOptions.CaseMode currentCaseMode; + + // --- Inner classes --- + + /** + * A single match result: a set of token positions that were matched. + * Positions are 0-based indices into the token array. + */ + public static class Match { + private final SortedSet includePositions; + private final SortedSet excludePositions; + // Tracks positions per operand group for the 'ordered' filter. + // Each element is the set of positions from one FTAnd operand. 
+ private final List> operandGroups; + + public Match() { + this.includePositions = new TreeSet<>(); + this.excludePositions = new TreeSet<>(); + this.operandGroups = new ArrayList<>(); + } + + public Match(final int pos) { + this(); + includePositions.add(pos); + final SortedSet group = new TreeSet<>(); + group.add(pos); + operandGroups.add(group); + } + + public Match(final SortedSet includes, final SortedSet excludes) { + this.includePositions = new TreeSet<>(includes); + this.excludePositions = new TreeSet<>(excludes); + this.operandGroups = new ArrayList<>(); + if (!includes.isEmpty()) { + operandGroups.add(new TreeSet<>(includes)); + } + } + + private Match(final SortedSet includes, final SortedSet excludes, + final List> groups) { + this.includePositions = new TreeSet<>(includes); + this.excludePositions = new TreeSet<>(excludes); + this.operandGroups = new ArrayList<>(groups); + } + + public SortedSet getIncludePositions() { + return includePositions; + } + + public SortedSet getExcludePositions() { + return excludePositions; + } + + public List> getOperandGroups() { + return operandGroups; + } + + public SortedSet getAllPositions() { + final SortedSet all = new TreeSet<>(includePositions); + all.addAll(excludePositions); + return all; + } + + /** + * Collapse operand groups into a single group containing all include positions. + * Used after positional filters so outer filters see this match as a single unit. + */ + public Match collapseGroups() { + final List> collapsed = new ArrayList<>(); + if (!includePositions.isEmpty()) { + collapsed.add(new TreeSet<>(includePositions)); + } + return new Match(includePositions, excludePositions, collapsed); + } + + /** Combine two matches (e.g. 
for ftand), preserving operand groups */ + public Match combine(final Match other) { + final SortedSet inc = new TreeSet<>(includePositions); + inc.addAll(other.includePositions); + final SortedSet exc = new TreeSet<>(excludePositions); + exc.addAll(other.excludePositions); + final List> groups = new ArrayList<>(operandGroups); + groups.addAll(other.operandGroups); + return new Match(inc, exc, groups); + } + } + + /** All possible matches for an FT expression */ + public static class AllMatches { + private final List matches; + + public AllMatches() { + this.matches = new ArrayList<>(); + } + + public AllMatches(final List matches) { + this.matches = new ArrayList<>(matches); + } + + public List getMatches() { + return matches; + } + + public void addMatch(final Match match) { + matches.add(match); + } + + public boolean hasMatches() { + return !matches.isEmpty(); + } + } + + public FTEvaluator(final String text) { + this(text, (List) null); + } + + public FTEvaluator(final String text, final List elementBoundaries) { + this.tokens = tokenize(text); + this.rawTokens = tokenizeRaw(text); + this.totalTokens = tokens.size(); + // Build sentence/paragraph maps, augmented by element boundary info + final int[] offsets = tokenCharOffsets(text); + this.sentenceOf = buildSentenceMap(text, offsets, elementBoundaries); + this.paragraphOf = buildParagraphMap(text, offsets, elementBoundaries); + } + + public FTEvaluator(final String text, final Map stopWordURIMap) { + this(text, (List) null); + if (stopWordURIMap != null) { + this.stopWordURIMap = stopWordURIMap; + } + } + + public FTEvaluator(final String text, final Map stopWordURIMap, + final Map thesaurusURIMap) { + this(text, stopWordURIMap); + if (thesaurusURIMap != null) { + this.thesaurusURIMap = thesaurusURIMap; + } + } + + public FTEvaluator(final String text, final Map stopWordURIMap, + final Map thesaurusURIMap, + final List elementBoundaries) { + this(text, elementBoundaries); + if (stopWordURIMap != null) { + 
this.stopWordURIMap = stopWordURIMap; + } + if (thesaurusURIMap != null) { + this.thesaurusURIMap = thesaurusURIMap; + } + } + + public void setContextSequence(final Sequence contextSequence) { + this.contextSequence = contextSequence; + } + + public List getTokens() { + return Collections.unmodifiableList(tokens); + } + + /** + * Tokenize text into words using Unicode word boundaries. + */ + static List tokenize(final String text) { + if (text == null || text.isEmpty()) { + return Collections.emptyList(); + } + final List result = new ArrayList<>(); + final BreakIterator wb = BreakIterator.getWordInstance(Locale.ROOT); + wb.setText(text); + int start = wb.first(); + for (int end = wb.next(); end != BreakIterator.DONE; start = end, end = wb.next()) { + final String word = text.substring(start, end); + // Only include words that contain at least one letter or digit + if (word.codePoints().anyMatch(Character::isLetterOrDigit)) { + result.add(word); + } + } + return result; + } + + /** + * Tokenize text preserving trailing punctuation on each word token. + * Used for wildcard matching where patterns may include literal punctuation + * (e.g., "task?" matches the literal string "task?" with a question mark). 
+ */ + static List tokenizeRaw(final String text) { + if (text == null || text.isEmpty()) { + return Collections.emptyList(); + } + final List result = new ArrayList<>(); + final BreakIterator wb = BreakIterator.getWordInstance(Locale.ROOT); + wb.setText(text); + int start = wb.first(); + // Collect all segments with their boundaries + final List segments = new ArrayList<>(); + final List isWord = new ArrayList<>(); + for (int end = wb.next(); end != BreakIterator.DONE; start = end, end = wb.next()) { + final String seg = text.substring(start, end); + segments.add(seg); + isWord.add(seg.codePoints().anyMatch(Character::isLetterOrDigit)); + } + // Build raw tokens: word + trailing non-whitespace punctuation + for (int i = 0; i < segments.size(); i++) { + if (isWord.get(i)) { + final StringBuilder token = new StringBuilder(segments.get(i)); + // Append immediately following non-whitespace, non-word segments + while (i + 1 < segments.size() && !isWord.get(i + 1) + && !segments.get(i + 1).isBlank()) { + i++; + token.append(segments.get(i)); + } + result.add(token.toString()); + } + } + return result; + } + + /** + * Returns the character offset of each word token in the original text. + * Token i starts at offsets[i]. Only includes tokens that match the tokenize() output. 
+ */ + static int[] tokenCharOffsets(final String text) { + if (text == null || text.isEmpty()) { + return new int[0]; + } + final List offsets = new ArrayList<>(); + final BreakIterator wb = BreakIterator.getWordInstance(Locale.ROOT); + wb.setText(text); + int start = wb.first(); + for (int end = wb.next(); end != BreakIterator.DONE; start = end, end = wb.next()) { + final String word = text.substring(start, end); + if (word.codePoints().anyMatch(Character::isLetterOrDigit)) { + offsets.add(start); + } + } + return offsets.stream().mapToInt(Integer::intValue).toArray(); + } + + /** + * Build sentence number map using Java's sentence boundary detection, + * augmented by element boundary offsets from the DOM structure. + * Element boundaries are treated as sentence breaks even when + * BreakIterator can't detect them (e.g. "example.It" from concatenated elements). + */ + private int[] buildSentenceMap(final String text, final int[] offsets, + final List elementBoundaries) { + if (offsets.length == 0 || text == null || text.isEmpty()) { + return new int[0]; + } + // Find sentence boundaries from BreakIterator + final SortedSet sentBounds = new TreeSet<>(); + final BreakIterator sb = BreakIterator.getSentenceInstance(Locale.ROOT); + sb.setText(text); + for (int boundary = sb.first(); boundary != BreakIterator.DONE; boundary = sb.next()) { + sentBounds.add(boundary); + } + // Add element boundaries as additional sentence breaks + if (elementBoundaries != null) { + sentBounds.addAll(elementBoundaries); + } + // Convert to sorted list for indexed access + final List sortedBounds = new ArrayList<>(sentBounds); + // Map each token to its sentence + final int[] map = new int[offsets.length]; + int sentIdx = 0; + for (int i = 0; i < offsets.length; i++) { + while (sentIdx + 1 < sortedBounds.size() && offsets[i] >= sortedBounds.get(sentIdx + 1)) { + sentIdx++; + } + map[i] = sentIdx; + } + return map; + } + + /** + * Build paragraph number map. 
Paragraphs are separated by blank lines + * (two or more consecutive newlines, possibly with whitespace between) + * OR by element boundaries from the DOM structure. + */ + private int[] buildParagraphMap(final String text, final int[] offsets, + final List elementBoundaries) { + if (offsets.length == 0 || text == null || text.isEmpty()) { + return new int[0]; + } + // Build set of element boundary offsets for quick lookup + final Set elemBounds = elementBoundaries != null + ? new HashSet<>(elementBoundaries) : Collections.emptySet(); + // Find paragraph boundaries by scanning for double-newline patterns + // and element boundaries + final int[] paraAt = new int[text.length()]; + int paraNum = 0; + boolean prevNewline = false; + for (int i = 0; i < text.length(); i++) { + // Element boundary: increment paragraph if we have content before it + if (elemBounds.contains(i) && i > 0 && paraAt[i - 1] == paraNum) { + paraNum++; + } + final char c = text.charAt(i); + if (c == '\n') { + if (prevNewline) { + paraNum++; + prevNewline = false; + } else { + prevNewline = true; + } + } else if (c != '\r' && c != ' ' && c != '\t') { + prevNewline = false; + } + paraAt[i] = paraNum; + } + // Map each token to its paragraph + final int[] map = new int[offsets.length]; + for (int i = 0; i < offsets.length; i++) { + map[i] = paraAt[Math.min(offsets[i], text.length() - 1)]; + } + return map; + } + + /** + * Evaluate the full FTSelection and apply positional filters. + */ + public boolean evaluate(final FTSelection selection, final FTMatchOptions inheritedOptions) + throws XPathException { + AllMatches result = evalExpression(selection.getFTOr(), inheritedOptions); + // Apply positional filters in sequence. Collapse operand groups + // between filters so subsequent filters treat results as single units, + // EXCEPT before an 'ordered' filter — ordered needs to see the + // original operand groups to check left-to-right ordering. 
+ final List filters = selection.getPosFilters(); + for (int f = 0; f < filters.size(); f++) { + result = applyPosFilter(result, filters.get(f)); + if (f < filters.size() - 1 && !(filters.get(f + 1) instanceof FTOrder)) { + result = collapseAllGroups(result); + } + } + return result.hasMatches(); + } + + /** + * Recursively evaluate an FT expression node. + */ + AllMatches evalExpression(final Expression expr, final FTMatchOptions options) + throws XPathException { + if (expr instanceof FTWords) { + return evalFTWords((FTWords) expr, options); + } else if (expr instanceof FTPrimaryWithOptions) { + return evalFTPrimaryWithOptions((FTPrimaryWithOptions) expr, options); + } else if (expr instanceof FTOr) { + return evalFTOr((FTOr) expr, options); + } else if (expr instanceof FTAnd) { + return evalFTAnd((FTAnd) expr, options); + } else if (expr instanceof FTMildNot) { + return evalFTMildNot((FTMildNot) expr, options); + } else if (expr instanceof FTUnaryNot) { + return evalFTUnaryNot((FTUnaryNot) expr, options); + } else if (expr instanceof FTSelection) { + // Nested parenthesized FTSelection + final FTSelection sel = (FTSelection) expr; + AllMatches result = evalExpression(sel.getFTOr(), options); + for (final Expression filter : sel.getPosFilters()) { + result = applyPosFilter(result, filter); + } + // After applying inner positional filters, collapse operand groups + // so outer filters treat this sub-expression as a single unit. + if (!sel.getPosFilters().isEmpty()) { + result = collapseAllGroups(result); + } + return result; + } + throw new XPathException(expr, "Unsupported FT expression type: " + expr.getClass().getSimpleName()); + } + + /** + * FTWords: the terminal matching node. + * Evaluates the words value, tokenizes it, and finds matches in the source tokens. 
+ */ + AllMatches evalFTWords(final FTWords ftWords, final FTMatchOptions options) + throws XPathException { + // Evaluate the words value expression to get the search string(s) + final Sequence wordsSeq = ftWords.getWordsValue().eval(contextSequence, null); + final List searchStrings = new ArrayList<>(); + for (int i = 0; i < wordsSeq.getItemCount(); i++) { + final Item item = wordsSeq.itemAt(i); + // XQFT 3.0 §3.1: FTWords values must be coercible to xs:string*. + // Nodes are atomized to xs:untypedAtomic (always valid). + // Atomic types must be xs:string, xs:untypedAtomic, or xs:anyURI. + // Other atomic types (xs:integer, etc.) raise XPTY0004. + final int itemType = item.getType(); + if (!Type.subTypeOf(itemType, Type.NODE) + && !Type.subTypeOf(itemType, Type.STRING) + && !Type.subTypeOf(itemType, Type.ANY_URI) + && !Type.subTypeOf(itemType, Type.UNTYPED_ATOMIC)) { + throw new XPathException(ftWords, ErrorCodes.XPTY0004, + "Full-text search value must be of type xs:string, got: " + + Type.getTypeName(itemType)); + } + searchStrings.add(item.getStringValue()); + } + + if (searchStrings.isEmpty()) { + // XQFT 3.0 §3.1: empty sequence produces no matches. + return new AllMatches(); + } + + // XQFT 3.0 §4.1: case mode handling (XQFTTS interpretation). + // - INSENSITIVE (default): compare tokens ignoring case. + // - SENSITIVE: compare tokens with exact case. + // - LOWERCASE: source token must be all lowercase; compare case-insensitively. + // - UPPERCASE: source token must be all uppercase; compare case-insensitively. + final FTMatchOptions.CaseMode caseMode = options == null ? null : options.getCaseMode(); + this.currentCaseMode = caseMode; + final boolean caseInsensitive = caseMode == null || + caseMode == FTMatchOptions.CaseMode.INSENSITIVE || + caseMode == FTMatchOptions.CaseMode.LOWERCASE || + caseMode == FTMatchOptions.CaseMode.UPPERCASE; + + // Apply lowercase/uppercase normalization to search strings. 
+ // Source tokens are normalized in wordMatches() to avoid mutating the shared list. + if (caseMode == FTMatchOptions.CaseMode.LOWERCASE) { + searchStrings.replaceAll(s -> s.toLowerCase(Locale.ROOT)); + } else if (caseMode == FTMatchOptions.CaseMode.UPPERCASE) { + searchStrings.replaceAll(s -> s.toUpperCase(Locale.ROOT)); + } + final boolean useWildcards = options != null && + Boolean.TRUE.equals(options.getWildcards()); + // XQFT 3.0 §4.3: diacritics mode. Default to insensitive. + final boolean diacriticsInsensitive = options == null || + options.getDiacriticsMode() == null || + options.getDiacriticsMode() == FTMatchOptions.DiacriticsMode.INSENSITIVE; + // XQFT 3.0 §4.4: stemming mode. Default to no stemming. + final boolean useStemming = options != null && + Boolean.TRUE.equals(options.getStemming()); + + // Collect stop words from options (XQFT 3.0 §4.6) + final Set stopWords = collectStopWords(options, caseInsensitive, ftWords); + + // XQFT 3.0 §4.5: Thesaurus expansion. + // For each search string, look up the full string in the thesaurus first + // (for multi-word terms like "web site components"), then try individual words. 
+ if (options != null && Boolean.FALSE.equals(options.getNoThesaurus()) + && !options.getThesaurusIDs().isEmpty()) { + final List expanded = new ArrayList<>(searchStrings); + for (final String searchStr : searchStrings) { + for (final FTMatchOptions.ThesaurusID tid : options.getThesaurusIDs()) { + // First try the full search string as a thesaurus term + final Set fullSynonyms = expandThesaurus(searchStr.trim(), tid, ftWords); + for (final String syn : fullSynonyms) { + if (!syn.equalsIgnoreCase(searchStr.trim()) && !expanded.contains(syn)) { + expanded.add(syn); + } + } + // Also try individual words (for single-word thesaurus entries) + for (final String word : tokenize(searchStr)) { + final Set wordSynonyms = expandThesaurus(word, tid, ftWords); + for (final String syn : wordSynonyms) { + if (!syn.equalsIgnoreCase(word) && !expanded.contains(syn)) { + expanded.add(syn); + } + } + } + } + } + searchStrings.clear(); + searchStrings.addAll(expanded); + } + + // Validate wildcard patterns (XQFT 1.0 §A.2: only ., .+, .*, .? 
are valid) + if (useWildcards) { + for (final String searchStr : searchStrings) { + validateWildcardPattern(searchStr, ftWords); + } + } + + final FTWords.AnyallMode mode = ftWords.getMode(); + AllMatches result; + switch (mode) { + case ANY: + result = evalAny(searchStrings, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming, stopWords); break; + case ANY_WORD: + result = evalAnyWord(searchStrings, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming, stopWords); break; + case ALL: + result = evalAll(searchStrings, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming, stopWords); break; + case ALL_WORDS: + result = evalAllWords(searchStrings, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming, stopWords); break; + case PHRASE: + result = evalPhrase(searchStrings, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming, stopWords); break; + default: + result = evalAny(searchStrings, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming, stopWords); break; + } + + // Apply FTTimes constraint if present + final FTTimes ftTimes = ftWords.getFTTimes(); + if (ftTimes != null) { + result = applyTimes(result, ftTimes); + } + return result; + } + + /** + * "any" mode: any of the search strings can match (each as a phrase). + */ + private AllMatches evalAny(final List searchStrings, final boolean caseInsensitive, + final boolean useWildcards, final boolean diacriticsInsensitive, + final boolean useStemming, final Set stopWords) { + final AllMatches result = new AllMatches(); + for (final String searchStr : searchStrings) { + final List searchTokens = useWildcards ? 
tokenizeWildcard(searchStr) : tokenize(searchStr); + if (searchTokens.isEmpty()) { + // XQFT 3.0: empty search string (no tokens) vacuously matches + result.addMatch(new Match(new TreeSet<>(), new TreeSet<>())); + continue; + } + if (searchTokens.size() == 1) { + findWordMatches(searchTokens.get(0), caseInsensitive, useWildcards, diacriticsInsensitive, useStemming, stopWords, result); + } else { + findPhraseMatches(searchTokens, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming, stopWords, result); + } + } + return result; + } + + /** + * "any word" mode: tokenize all search strings into individual words, + * any single word can match. + */ + private AllMatches evalAnyWord(final List searchStrings, final boolean caseInsensitive, + final boolean useWildcards, final boolean diacriticsInsensitive, + final boolean useStemming, final Set stopWords) { + final AllMatches result = new AllMatches(); + for (final String searchStr : searchStrings) { + final List words = useWildcards ? tokenizeWildcard(searchStr) : tokenize(searchStr); + for (final String word : words) { + if (isStopWord(word, stopWords, caseInsensitive)) { + continue; + } + findWordMatches(word, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming, stopWords, result); + } + } + return result; + } + + /** + * "all" mode: all search strings must match (each as a phrase). + */ + private AllMatches evalAll(final List searchStrings, final boolean caseInsensitive, + final boolean useWildcards, final boolean diacriticsInsensitive, + final boolean useStemming, final Set stopWords) { + AllMatches combined = null; + for (final String searchStr : searchStrings) { + final List searchTokens = useWildcards ? 
tokenizeWildcard(searchStr) : tokenize(searchStr); + if (searchTokens.isEmpty()) { + continue; + } + final AllMatches phraseMatches = new AllMatches(); + findPhraseMatches(searchTokens, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming, stopWords, phraseMatches); + if (!phraseMatches.hasMatches()) { + return new AllMatches(); // all must match — one failed + } + combined = (combined == null) ? phraseMatches : crossProduct(combined, phraseMatches); + } + return combined != null ? combined : new AllMatches(); + } + + /** + * "all words" mode: tokenize all search strings, every individual word must match. + */ + private AllMatches evalAllWords(final List searchStrings, final boolean caseInsensitive, + final boolean useWildcards, final boolean diacriticsInsensitive, + final boolean useStemming, final Set stopWords) { + final List allWords = new ArrayList<>(); + for (final String s : searchStrings) { + allWords.addAll(useWildcards ? tokenizeWildcard(s) : tokenize(s)); + } + if (allWords.isEmpty()) { + return singleEmptyMatch(); + } + AllMatches combined = null; + for (final String word : allWords) { + if (isStopWord(word, stopWords, caseInsensitive)) { + continue; + } + final AllMatches wordMatches = new AllMatches(); + findWordMatches(word, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming, stopWords, wordMatches); + if (!wordMatches.hasMatches()) { + return new AllMatches(); // all must match + } + combined = (combined == null) ? wordMatches : crossProduct(combined, wordMatches); + } + return combined != null ? combined : singleEmptyMatch(); + } + + /** + * "phrase" mode: all search strings concatenated form one phrase. 
+ */ + private AllMatches evalPhrase(final List searchStrings, final boolean caseInsensitive, + final boolean useWildcards, final boolean diacriticsInsensitive, + final boolean useStemming, final Set stopWords) { + final List phraseTokens = new ArrayList<>(); + for (final String s : searchStrings) { + phraseTokens.addAll(useWildcards ? tokenizeWildcard(s) : tokenize(s)); + } + if (phraseTokens.isEmpty()) { + return new AllMatches(); // no tokens, no match + } + final AllMatches result = new AllMatches(); + findPhraseMatches(phraseTokens, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming, stopWords, result); + return result; + } + + /** + * Find all positions where a single word matches in the token list. + */ + private void findWordMatches(final String word, final boolean caseInsensitive, + final boolean useWildcards, final boolean diacriticsInsensitive, + final boolean useStemming, final Set stopWords, + final AllMatches result) { + if (isStopWord(word, stopWords, caseInsensitive)) { + // Stop words in search query are treated as automatically matching + return; + } + for (int i = 0; i < totalTokens; i++) { + final String rawToken = (useWildcards && i < rawTokens.size()) ? rawTokens.get(i) : null; + if (wordMatches(tokens.get(i), rawToken, word, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming)) { + result.addMatch(new Match(i)); + } + } + } + + /** + * Find all positions where a phrase (sequence of words) matches consecutively. + * Stop words in the search phrase are treated as matching any source token. 
+ */ + private void findPhraseMatches(final List phraseTokens, final boolean caseInsensitive, + final boolean useWildcards, final boolean diacriticsInsensitive, + final boolean useStemming, final Set stopWords, + final AllMatches result) { + final int phraseLen = phraseTokens.size(); + outer: + for (int i = 0; i <= totalTokens - phraseLen; i++) { + for (int j = 0; j < phraseLen; j++) { + final String searchToken = phraseTokens.get(j); + // Stop words in search phrases match any source token position + if (isStopWord(searchToken, stopWords, caseInsensitive)) { + continue; // this position is OK + } + final int idx = i + j; + final String rawToken = (useWildcards && idx < rawTokens.size()) ? rawTokens.get(idx) : null; + if (!wordMatches(tokens.get(idx), rawToken, searchToken, caseInsensitive, useWildcards, diacriticsInsensitive, useStemming)) { + continue outer; + } + } + // Found a phrase match at positions i..i+phraseLen-1 + final SortedSet positions = new TreeSet<>(); + for (int j = 0; j < phraseLen; j++) { + positions.add(i + j); + } + result.addMatch(new Match(positions, new TreeSet<>())); + } + } + + /** + * Check if a source token matches a search word. 
+ * @param rawSourceToken token with trailing punctuation preserved (for wildcard matching), or null + */ + private boolean wordMatches(final String sourceToken, final String rawSourceToken, + final String searchWord, + final boolean caseInsensitive, final boolean useWildcards, + final boolean diacriticsInsensitive, final boolean useStemming) { + String src = sourceToken; + String search = searchWord; + + // Apply diacritics normalization if insensitive + if (diacriticsInsensitive) { + src = stripDiacritics(src); + search = stripDiacritics(search); + } + + if (useWildcards && containsWildcardIndicator(search)) { + final String regex = wildcardToRegex(search, caseInsensitive); + // First try matching against the clean token + if (Pattern.matches(regex, src)) { + return true; + } + // If the pattern contains literal punctuation (via XQFT escape + // sequences like \. \? etc.), that punctuation is only present in + // the raw token. Use lookingAt() instead of matches() to handle + // trailing punctuation (e.g. raw token "next?," for pattern "nex.\?"). + if (search.contains("\\") && rawSourceToken != null) { + String rawSrc = rawSourceToken; + if (diacriticsInsensitive) { + rawSrc = stripDiacritics(rawSrc); + } + return Pattern.compile(regex).matcher(rawSrc).lookingAt(); + } + return false; + } + + // When wildcards are enabled but the search token has no wildcard indicator, + // strip punctuation from the search token so it matches as a normal token. + // tokenizeWildcard() preserves all characters; normal matching needs clean tokens. + // XQFTTS: "task?" with no wildcard indicator strips to "task" and matches source "task". 
+ if (useWildcards) { + search = search.replaceAll("[^\\p{L}\\p{N}]", ""); + if (search.isEmpty()) { + return false; + } + } + + // Apply stemming: compare stems instead of exact words + if (useStemming) { + src = stem(src); + search = stem(search); + } + + if (caseInsensitive) { + // XQFT §4.1 + XQFTTS interpretation: for lowercase/uppercase modes, + // the source token must already be in the specified case. The search + // token is normalized in evalFTWords(), and comparison is case-insensitive. + // This acts as a FILTER: 'using uppercase' only matches tokens that + // are already uppercase in the source (e.g., "AIDS" matches but "aids" does not). + if (currentCaseMode == FTMatchOptions.CaseMode.LOWERCASE) { + if (!sourceToken.equals(sourceToken.toLowerCase(Locale.ROOT))) { + return false; + } + } else if (currentCaseMode == FTMatchOptions.CaseMode.UPPERCASE + && !sourceToken.equals(sourceToken.toUpperCase(Locale.ROOT))) { + return false; + } + return src.equalsIgnoreCase(search); + } + return src.equals(search); + } + + /** + * Tokenize a wildcard search pattern into tokens. + * Unlike the normal tokenizer, this preserves wildcard characters (., *, +, ?, \, {, }) + * within tokens. Splits on whitespace boundaries. + */ + static List tokenizeWildcard(final String pattern) { + if (pattern == null || pattern.isEmpty()) { + return Collections.emptyList(); + } + final List result = new ArrayList<>(); + for (final String part : pattern.split("\\s+")) { + if (!part.isEmpty()) { + result.add(part); + } + } + return result; + } + + /** + * Basic English stemmer using suffix stripping. + * Reduces common English inflections (plurals, verb forms, etc.) + * to approximate stems for full-text comparison. Based on a simplified + * version of the Porter stemming algorithm. 
+ */ + static String stem(final String word) { + if (word == null || word.length() < 3) { + return word; + } + String s = word.toLowerCase(Locale.ROOT); + + // Step 1: Strip inflectional suffixes (longest match first) + if (s.endsWith("ational")) { + s = s.substring(0, s.length() - 7) + "ate"; + } else if (s.endsWith("iveness")) { + s = s.substring(0, s.length() - 7) + "ive"; + } else if (s.endsWith("fulness")) { + s = s.substring(0, s.length() - 7) + "ful"; + } else if (s.endsWith("ously")) { + s = s.substring(0, s.length() - 5) + "ous"; + } else if (s.endsWith("ement")) { + s = s.substring(0, s.length() - 5); + } else if (s.endsWith("ness")) { + s = s.substring(0, s.length() - 4); + } else if (s.endsWith("ment") && !s.endsWith("mment")) { + s = s.substring(0, s.length() - 4); + } else if (s.endsWith("ies")) { + s = s.substring(0, s.length() - 3) + "i"; + } else if (s.endsWith("ied")) { + s = s.substring(0, s.length() - 3) + "i"; + } else if (s.endsWith("eed")) { + // keep as-is (e.g. "feed") + } else if (s.endsWith("ing")) { + final String base = s.substring(0, s.length() - 3); + if (base.length() >= 2) { + s = undouble(base); + } + } else if (s.endsWith("ed")) { + final String base = s.substring(0, s.length() - 2); + if (base.length() >= 2) { + s = undouble(base); + } + } else if (s.endsWith("ers")) { + final String base = s.substring(0, s.length() - 3); + if (base.length() >= 2) { + s = undouble(base); + } + } else if (s.endsWith("er")) { + final String base = s.substring(0, s.length() - 2); + if (base.length() >= 2) { + s = undouble(base); + } + } else if (s.endsWith("es")) { + final String base = s.substring(0, s.length() - 2); + if (base.length() >= 3) { + s = base; + } + } else if (s.endsWith("s") && !s.endsWith("ss")) { + s = s.substring(0, s.length() - 1); + } else if (s.endsWith("ly")) { + final String base = s.substring(0, s.length() - 2); + if (base.length() >= 3) { + s = base; + } + } + + // Step 2: Remove trailing 'e' if the stem is long enough. 
+ // This ensures "picture" → "pictur" matches "pictures" → "pictur". + if (s.length() >= 4 && s.endsWith("e") && !s.endsWith("ee")) { + s = s.substring(0, s.length() - 1); + } + + return s; + } + + /** + * Undo doubled consonant at end of stem (e.g. "runn" → "run"). + */ + private static String undouble(final String base) { + if (base.length() >= 3 + && base.charAt(base.length() - 1) == base.charAt(base.length() - 2) + && !isVowel(base.charAt(base.length() - 1))) { + return base.substring(0, base.length() - 1); + } + return base; + } + + private static boolean isVowel(final char c) { + return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u'; + } + + /** + * Strip diacritical marks from a string using Unicode normalization. + * NFD decomposes characters, then we remove combining diacritical marks. + */ + private static String stripDiacritics(final String text) { + final String normalized = Normalizer.normalize(text, Normalizer.Form.NFD); + // Remove combining diacritical marks (Unicode block 0300-036F) + return normalized.replaceAll("[\\p{InCombiningDiacriticalMarks}]", ""); + } + + /** + * Check if a word is in the stop word set. + */ + private static boolean isStopWord(final String word, final Set stopWords, + final boolean caseInsensitive) { + if (stopWords.isEmpty()) { + return false; + } + return caseInsensitive ? stopWords.contains(word.toLowerCase(Locale.ROOT)) : stopWords.contains(word); + } + + /** + * Collect stop words from FTMatchOptions. + * XQFT 3.0 §4.6: inline stop words and stop word URIs. + * + *

Stop words can come from two sources: + *

    + *
  • Inline: literal words specified in the query via {@code ("word1", "word2")}
  • + *
  • External: loaded from URIs specified via {@code at "URI"}
  • + *
+ * + * XQFT 3.0 §4.5: Expand a word using a thesaurus. + * Loads the thesaurus from the URI (using the thesaurusURIMap for resolution) + * and returns synonyms matching the relationship and level constraints. + * + * @throws XPathException FTST0018 if the thesaurus cannot be loaded + */ + private Set expandThesaurus(final String word, final FTMatchOptions.ThesaurusID tid, + final Expression context) throws XPathException { + if (tid.isDefault()) { + // Default thesaurus: look up "##default" in the URI map + final Path defaultFile = thesaurusURIMap.get("##default"); + if (defaultFile == null || !Files.exists(defaultFile)) { + // No default thesaurus configured — return just the word itself + return Collections.singleton(word); + } + FTThesaurus thesaurus = thesaurusCache.get("##default"); + if (thesaurus == null) { + try { + thesaurus = FTThesaurus.load(defaultFile); + thesaurusCache.put("##default", thesaurus); + } catch (final Exception e) { + return Collections.singleton(word); + } + } + return thesaurus.expand(word, tid.getRelationship(), tid.getMinLevels(), tid.getMaxLevels()); + } + final String uri = tid.getUri(); + FTThesaurus thesaurus = thesaurusCache.get(uri); + if (thesaurus == null) { + // Resolve URI to file path + Path file = thesaurusURIMap.get(uri); + if (file == null) { + // Try resolving as a direct file path + try { + file = Path.of(new URI(uri)); + } catch (final Exception e) { + // Not a valid file URI + } + } + if (file == null || !Files.exists(file)) { + throw new XPathException(context, ErrorCodes.FTST0018, + "Thesaurus not available: " + uri); + } + try { + thesaurus = FTThesaurus.load(file); + thesaurusCache.put(uri, thesaurus); + } catch (final Exception e) { + throw new XPathException(context, ErrorCodes.FTST0018, + "Failed to load thesaurus: " + uri + " - " + e.getMessage()); + } + } + return thesaurus.expand(word, tid.getRelationship(), tid.getMinLevels(), tid.getMaxLevels()); + } + + /** + *

External stop word files are plain text, one word per line (or whitespace-delimited). + * BaseX uses the same format. URI resolution uses the {@link #stopWordURIMap} for mapped + * URIs (e.g., XQFTTS test URIs), falling back to direct file path or URL resolution. + * + *

Limitation: This implementation supports simple whitespace-delimited text files. + * A future enhancement could integrate with Lucene/Snowball stop word lists for broader + * language coverage beyond what the basic text file format provides. + * + * @throws XPathException FTST0008 if an external stop word URI cannot be loaded + */ + private Set collectStopWords(final FTMatchOptions options, final boolean caseInsensitive, + final Expression context) throws XPathException { + if (options == null) { + return Collections.emptySet(); + } + if (Boolean.TRUE.equals(options.getNoStopWords())) { + return Collections.emptySet(); + } + final Set result = new HashSet<>(); + + // XQFT 3.0 §4.6: raise FTST0013 if stop words are requested with a language + // specification but we don't support language-specific stop word lists. + // We only have a default (English) stop word list; no per-language lists. + if (options.getUseDefaultStopWords() && options.getLanguage() != null) { + final String lang = options.getLanguage().trim().toLowerCase(Locale.ROOT); + if (!lang.isEmpty() && !lang.equals("en") && !lang.startsWith("en-")) { + throw new XPathException(context, ErrorCodes.FTST0013, + "Stop word list not available for language: " + options.getLanguage()); + } + } + + // Handle "stop words default" — load from ##default URI mapping + if (options.getUseDefaultStopWords()) { + final Path defaultPath = stopWordURIMap.get("##default"); + if (defaultPath != null) { + loadStopWordsFromPath(defaultPath, result, caseInsensitive, context, "##default"); + } + // If no ##default mapping, silently use empty set (implementation-defined) + } + + // Add inline stop words (union) + final List inlineWords = options.getInlineStopWords(); + if (inlineWords != null) { + for (final String sw : inlineWords) { + result.add(caseInsensitive ? 
sw.toLowerCase(Locale.ROOT) : sw); + } + } + + // Load external stop words from URIs (union) + final List uris = options.getStopWordURIs(); + if (uris != null) { + for (final String uri : uris) { + loadStopWordsFromURI(uri, result, caseInsensitive, context); + } + } + + // Remove excepted inline stop words + final List exceptInline = options.getExceptInlineStopWords(); + if (exceptInline != null) { + for (final String sw : exceptInline) { + result.remove(caseInsensitive ? sw.toLowerCase(Locale.ROOT) : sw); + } + } + + // Remove excepted URI stop words + final List exceptURIs = options.getExceptStopWordURIs(); + if (exceptURIs != null && !exceptURIs.isEmpty()) { + final Set exceptWords = new HashSet<>(); + for (final String uri : exceptURIs) { + loadStopWordsFromURI(uri, exceptWords, caseInsensitive, context); + } + result.removeAll(exceptWords); + } + + return result; + } + + /** + * Load stop words from an external URI. + * Tries the following resolution strategies in order: + *

    + *
  1. Mapped URI via {@link #stopWordURIMap} (for test runner URI mappings)
  2. + *
  3. Direct file path (if the URI is a valid local file path)
  4. + *
  5. file:// URI scheme
  6. + *
+ * + *

Stop word files are expected to contain whitespace-delimited words. + * This matches the format used by BaseX and the XQFTTS test suite. + * + *

Note: HTTP URI fetching is not supported. For production use with + * remote stop word lists, consider pre-loading them or using a URI catalog. + * A Lucene-based stop word provider could also be plugged in here. + * + * @throws XPathException FTST0008 if the stop word file cannot be loaded + */ + private void loadStopWordsFromURI(final String uri, final Set result, + final boolean caseInsensitive, + final Expression context) throws XPathException { + // Strategy 1: check the URI map (e.g., XQFTTS test runner mappings) + final Path mappedPath = stopWordURIMap.get(uri); + if (mappedPath != null) { + loadStopWordsFromPath(mappedPath, result, caseInsensitive, context, uri); + return; + } + + // Strategy 2: try as a local file path + try { + final Path filePath = Path.of(uri); + if (Files.exists(filePath)) { + loadStopWordsFromPath(filePath, result, caseInsensitive, context, uri); + return; + } + } catch (final Exception e) { + // Not a valid file path — try URI parsing + } + + // Strategy 3: try as a file:// URI + try { + final URI parsed = new URI(uri); + if ("file".equals(parsed.getScheme())) { + final Path filePath = Path.of(parsed); + loadStopWordsFromPath(filePath, result, caseInsensitive, context, uri); + return; + } + } catch (final URISyntaxException | IllegalArgumentException e) { + // Not a valid URI — fall through to error + } + + // Could not resolve the URI — raise FTST0008 + throw new XPathException(context, ErrorCodes.FTST0008, + "Cannot load external stop word list: " + uri + + ". Only mapped URIs, local file paths, and file:// URIs are supported. " + + "HTTP fetching is not implemented."); + } + + /** + * Read stop words from a local file path. Words are whitespace-delimited. 
+ */ + private static void loadStopWordsFromPath(final Path path, final Set result, + final boolean caseInsensitive, + final Expression context, + final String originalURI) throws XPathException { + if (!Files.exists(path)) { + throw new XPathException(context, ErrorCodes.FTST0008, + "Stop word file not found: " + path + " (from URI: " + originalURI + ")"); + } + try (final BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + for (final String word : line.trim().split("\\s+")) { + if (!word.isEmpty()) { + result.add(caseInsensitive ? word.toLowerCase(Locale.ROOT) : word); + } + } + } + } catch (final IOException e) { + throw new XPathException(context, ErrorCodes.FTST0008, + "Error reading stop word file: " + path + " (from URI: " + originalURI + "): " + e.getMessage()); + } + } + + /** + * Validate a wildcard pattern for XQFT syntax compliance. + * Raises FTDY0020 if the pattern contains invalid wildcard constructs. 
+ * Valid: .{n,m} (comma-separated numeric range), .{c,c} (comma-separated char range) + * Invalid: .{n} (single number), .{n-m} (dash-separated), .{c-c} (dash-separated chars) + */ + static void validateWildcardPattern(final String pattern, final Expression context) throws XPathException { + int i = 0; + while (i < pattern.length()) { + final char c = pattern.charAt(i); + if (c == '.') { + i++; + if (i < pattern.length()) { + final char next = pattern.charAt(i); + if (next == '{') { + // Extract content between { and } + i++; // skip { + final StringBuilder content = new StringBuilder(); + while (i < pattern.length() && pattern.charAt(i) != '}') { + content.append(pattern.charAt(i)); + i++; + } + if (i < pattern.length()) { + i++; // skip } + } + final String rangeContent = content.toString(); + // Only .{X,Y} with commas is valid; dashes and single values are invalid + if (!rangeContent.contains(",")) { + throw new XPathException(context, ErrorCodes.FTDY0020, + "Invalid wildcard pattern: .{" + rangeContent + "} is not valid wildcard syntax"); + } + } else if (next == '*' || next == '+' || next == '?') { + i++; + } + // else just '.', which is fine + } + } else if (c == '\\') { + i += 2; // skip escaped char + } else { + i++; + } + } + } + + /** + * Check if a search token contains an unescaped '.' (the XQFT wildcard indicator). + * Per XQFT §4.7, only tokens containing '.' use wildcard matching; others are + * matched as normal tokens even when the wildcard option is enabled. + */ + static boolean containsWildcardIndicator(final String token) { + for (int i = 0; i < token.length(); i++) { + final char c = token.charAt(i); + if (c == '\\') { + return true; // escape sequence is wildcard syntax (e.g. \. \? \*) + } else if (c == '.') { + return true; + } + } + return false; + } + + /** + * Convert XQFT wildcard pattern to Java regex. + * XQFT wildcards: "." matches any single char, ".+" matches one or more, + * ".*" matches zero or more, ".{n,m}" etc. 
+ */ + static String wildcardToRegex(final String pattern, final boolean caseInsensitive) { + final StringBuilder sb = new StringBuilder(); + if (caseInsensitive) { + sb.append("(?i)"); + } + // XQFT wildcard grammar (§4.7): + // "." matches any single char + // ".?" zero or one, ".+" one or more, ".*" zero or more + // ".{n-m}" n to m of any char (note: dash, not comma) + // ".{a-z}" character range (single char in range) + int i = 0; + while (i < pattern.length()) { + final char c = pattern.charAt(i); + if (c == '.') { + i++; + if (i < pattern.length()) { + final char next = pattern.charAt(i); + if (next == '*' || next == '+' || next == '?') { + sb.append('.'); + sb.append(next); + i++; + } else if (next == '{') { + // Extract content between { and } + i++; // skip { + final StringBuilder rangeContent = new StringBuilder(); + while (i < pattern.length() && pattern.charAt(i) != '}') { + rangeContent.append(pattern.charAt(i)); + i++; + } + if (i < pattern.length()) { + i++; // skip } + } + final String range = rangeContent.toString(); + final int dashIdx = range.indexOf('-'); + if (dashIdx > 0 && dashIdx < range.length() - 1) { + final String left = range.substring(0, dashIdx); + final String right = range.substring(dashIdx + 1); + if (left.chars().allMatch(Character::isDigit) && right.chars().allMatch(Character::isDigit)) { + // Numeric range: .{n-m} → .{n,m} + sb.append(".{").append(left).append(',').append(right).append('}'); + } else { + // Character range: .{a-z} → [a-z] + sb.append('[').append(left).append('-').append(right).append(']'); + } + } else { + // Single number: .{n} → .{n} + sb.append('.').append('{').append(range).append('}'); + } + } else { + // Just "." 
— match any single char + sb.append('.'); + } + } else { + sb.append('.'); + } + } else if (c == '\\') { + // Escaped char — treat next char as literal + i++; + if (i < pattern.length()) { + sb.append(Pattern.quote(String.valueOf(pattern.charAt(i)))); + i++; + } + } else { + // Literal character — escape for regex + sb.append(Pattern.quote(String.valueOf(c))); + i++; + } + } + return sb.toString(); + } + + // === Boolean operators === + + AllMatches evalFTOr(final FTOr ftOr, final FTMatchOptions options) + throws XPathException { + final AllMatches result = new AllMatches(); + for (final Expression operand : ftOr.getOperands()) { + final AllMatches sub = evalExpression(operand, options); + result.getMatches().addAll(sub.getMatches()); + } + return result; + } + + AllMatches evalFTAnd(final FTAnd ftAnd, final FTMatchOptions options) + throws XPathException { + AllMatches combined = null; + for (final Expression operand : ftAnd.getOperands()) { + final AllMatches sub = evalExpression(operand, options); + if (!sub.hasMatches()) { + return new AllMatches(); // short-circuit: one operand has no matches + } + combined = (combined == null) ? sub : crossProduct(combined, sub); + } + return combined != null ? combined : singleEmptyMatch(); + } + + AllMatches evalFTMildNot(final FTMildNot ftMildNot, final FTMatchOptions options) + throws XPathException { + final List operands = ftMildNot.getOperands(); + if (operands.isEmpty()) { + return new AllMatches(); + } + AllMatches result = evalExpression(operands.get(0), options); + for (int i = 1; i < operands.size(); i++) { + final AllMatches exclude = evalExpression(operands.get(i), options); + result = applyMildNot(result, exclude); + } + return result; + } + + /** + * Mild not: remove matches from left where a right match covers ALL + * include positions of the left match (XQFT 3.0 §4.5.3). 
+ * + * A left match is removed only when there exists a right match whose + * include positions are a superset of the left match's include positions. + */ + private AllMatches applyMildNot(final AllMatches left, final AllMatches right) { + if (!right.hasMatches()) { + return left; + } + final AllMatches result = new AllMatches(); + for (final Match lm : left.getMatches()) { + boolean covered = false; + for (final Match rm : right.getMatches()) { + if (rm.getIncludePositions().containsAll(lm.getIncludePositions())) { + covered = true; + break; + } + } + if (!covered) { + result.addMatch(lm); + } + } + return result; + } + + AllMatches evalFTUnaryNot(final FTUnaryNot ftNot, final FTMatchOptions options) + throws XPathException { + final AllMatches inner = evalExpression(ftNot.getOperand(), options); + if (inner.hasMatches()) { + return new AllMatches(); // negation: inner matched → overall doesn't match + } + return singleEmptyMatch(); // inner didn't match → overall matches + } + + AllMatches evalFTPrimaryWithOptions(final FTPrimaryWithOptions pwo, final FTMatchOptions inheritedOptions) + throws XPathException { + // XQFT 3.0 §4.9: raise FTST0019 if match options conflict + final FTMatchOptions localOptions = pwo.getMatchOptions(); + if (localOptions != null && localOptions.hasConflict()) { + throw new XPathException(pwo, ErrorCodes.FTST0019, + localOptions.getConflictDescription()); + } + + // Merge match options: local options override inherited ones + final FTMatchOptions effective = mergeOptions(inheritedOptions, localOptions); + + // XQFT 3.0 §4.8: raise FTST0009 for unsupported languages. + // We support Latin-script languages (en, de, fr, es, it, pt, nl, etc.) + // using the default whitespace tokenizer and Snowball stemmer. + // Non-Latin-script languages (zh, ja, ko, ar, he, th, etc.) require + // specialized tokenizers we don't have, so we raise FTST0009. 
+ if (effective != null && effective.getLanguage() != null) { + final String lang = effective.getLanguage().trim(); + if (!lang.isEmpty()) { + // Reject invalid BCP 47 tags + if (!lang.matches("[a-zA-Z]{2,8}(-.*)?")) { + throw new XPathException(pwo, ErrorCodes.FTST0009, + "Language not supported: " + effective.getLanguage()); + } + // Reject languages requiring non-Latin tokenizers + final String primary = lang.contains("-") ? lang.substring(0, lang.indexOf('-')) : lang; + final String lc = primary.toLowerCase(Locale.ROOT); + if ("zh".equals(lc) || "ja".equals(lc) || "ko".equals(lc) || + "ar".equals(lc) || "he".equals(lc) || "th".equals(lc) || + "hi".equals(lc) || "bn".equals(lc) || "ta".equals(lc) || + "ka".equals(lc) || "km".equals(lc) || "my".equals(lc)) { + throw new XPathException(pwo, ErrorCodes.FTST0009, + "Language not supported (no tokenizer for non-Latin script): " + lang); + } + } + } + + // XQFT 3.0 §3.7: Validate weight expression if present. + // Weight must evaluate to a numeric value in [-1000, 1000]; otherwise FTDY0016. + // Non-numeric values raise XPTY0004. + if (pwo.getWeight() != null) { + final Sequence weightSeq = pwo.getWeight().eval(contextSequence, null); + if (weightSeq.isEmpty() || !Type.subTypeOfUnion(weightSeq.itemAt(0).getType(), Type.NUMERIC)) { + throw new XPathException(pwo, ErrorCodes.XPTY0004, + "Weight expression must evaluate to a numeric value, got: " + + (weightSeq.isEmpty() ? 
"empty sequence" : Type.getTypeName(weightSeq.itemAt(0).getType()))); + } + final double w = weightSeq.itemAt(0).toJavaObject(Double.class); + if (w < -1000.0 || w > 1000.0 || Double.isNaN(w)) { + throw new XPathException(pwo, ErrorCodes.FTDY0016, + "Weight value " + w + " is out of the allowed range [-1000.0, 1000.0]"); + } + } + + return evalExpression(pwo.getPrimary(), effective); + } + + // === Positional filters === + + AllMatches applyPosFilter(final AllMatches input, final Expression filter) + throws XPathException { + if (filter instanceof FTOrder) { + return applyOrdered(input); + } else if (filter instanceof FTWindow) { + return applyWindow(input, (FTWindow) filter); + } else if (filter instanceof FTDistance) { + return applyDistance(input, (FTDistance) filter); + } else if (filter instanceof FTContent) { + return applyContent(input, (FTContent) filter); + } else if (filter instanceof FTScope) { + return applyScope(input, (FTScope) filter); + } + return input; + } + + /** + * "ordered": keep matches where operand groups appear in ascending + * position order — i.e., max position of group i < min position of group i+1. + */ + private AllMatches applyOrdered(final AllMatches input) { + final AllMatches result = new AllMatches(); + for (final Match m : input.getMatches()) { + if (isOrdered(m)) { + result.addMatch(m); + } + } + return result; + } + + private boolean isOrdered(final Match match) { + final List> groups = match.getOperandGroups(); + if (groups.size() <= 1) { + return true; + } + int prevMax = Integer.MIN_VALUE; + for (final SortedSet group : groups) { + if (group.isEmpty()) { + continue; + } + if (group.first() <= prevMax) { + return false; + } + prevMax = group.last(); + } + return true; + } + + /** + * "window N unit": all matched positions must fit within N consecutive units. 
+ */ + private AllMatches applyWindow(final AllMatches input, final FTWindow ftWindow) + throws XPathException { + final int windowSize = evalIntExpr(ftWindow.getWindowExpr()); + final FTUnit unit = ftWindow.getUnit(); + final AllMatches result = new AllMatches(); + for (final Match m : input.getMatches()) { + final SortedSet positions = m.getIncludePositions(); + if (positions.isEmpty()) { + result.addMatch(m); + } else { + final int span = unitSpan(positions.first(), positions.last(), unit); + if (span <= windowSize) { + result.addMatch(m); + } + } + } + return result; + } + + /** + * "distance range unit": the distance between consecutive match positions + * must satisfy the range constraint. + */ + private AllMatches applyDistance(final AllMatches input, final FTDistance ftDistance) + throws XPathException { + final FTRange range = ftDistance.getRange(); + final int[] bounds = evalRange(range); + final int min = bounds[0]; + final int max = bounds[1]; + final FTUnit unit = ftDistance.getUnit(); + + final AllMatches result = new AllMatches(); + for (final Match m : input.getMatches()) { + final List> groups = m.getOperandGroups(); + // Single group (e.g. after positional filter collapse): vacuously satisfied + if (groups.size() <= 1) { + result.addMatch(m); + continue; + } + // Per XQFT §4.5: distance is measured between consecutive operand groups + // (StringIncludes), not between individual token positions. + // Sort groups by their minimum position for consistent ordering. 
+ final List> sorted = new ArrayList<>(groups); + sorted.sort((a, b) -> { + if (a.isEmpty()) return -1; + if (b.isEmpty()) return 1; + return Integer.compare(a.first(), b.first()); + }); + boolean satisfies = true; + for (int i = 1; i < sorted.size(); i++) { + final SortedSet prev = sorted.get(i - 1); + final SortedSet curr = sorted.get(i); + if (prev.isEmpty() || curr.isEmpty()) { + continue; + } + // Distance = gap between last token of previous group and first token of next group + final int dist = unitDistance(prev.last(), curr.first(), unit); + if (dist < min || dist > max) { + satisfies = false; + break; + } + } + if (satisfies) { + result.addMatch(m); + } + } + return result; + } + + /** + * "at start" / "at end" / "entire content": content-based positional filter. + */ + private AllMatches applyContent(final AllMatches input, final FTContent ftContent) { + final AllMatches result = new AllMatches(); + for (final Match m : input.getMatches()) { + final SortedSet positions = m.getIncludePositions(); + if (positions.isEmpty()) { + continue; + } + switch (ftContent.getContentType()) { + case AT_START: + if (positions.first() == 0) { + result.addMatch(m); + } + break; + case AT_END: + if (positions.last() == totalTokens - 1) { + result.addMatch(m); + } + break; + case ENTIRE_CONTENT: + // XQFT 3.0 §3.6.2: entire content requires that the match covers + // all token positions from 0 to totalTokens-1. + if (positions.first() == 0 && positions.last() == totalTokens - 1 + && positions.size() == totalTokens) { + result.addMatch(m); + } + break; + default: + break; + } + } + return result; + } + + /** + * FTScope: "same sentence", "same paragraph", "different sentence", "different paragraph". + * + * For "same": all positions in all StringIncludes must be in the same unit. + * For "different": requires >= 2 StringIncludes; each single-unit StringInclude + * must be in a distinct unit (multi-unit StringIncludes that span unit boundaries + * are never rejected). 
+ */ + private AllMatches applyScope(final AllMatches input, final FTScope ftScope) { + final AllMatches result = new AllMatches(); + final boolean isSentence = ftScope.getBigUnit() == FTScope.BigUnit.SENTENCE; + final int[] unitMap = isSentence ? sentenceOf : paragraphOf; + final boolean isSame = ftScope.getScopeType() == FTScope.ScopeType.SAME; + + for (final Match m : input.getMatches()) { + final List> groups = m.getOperandGroups(); + if (groups.isEmpty()) { + continue; + } + if (isSame) { + // All positions from all groups must be in the same unit + int commonUnit = -1; + boolean allSame = true; + for (final SortedSet group : groups) { + for (final int pos : group) { + final int u = pos < unitMap.length ? unitMap[pos] : 0; + if (commonUnit < 0) { + commonUnit = u; + } else if (u != commonUnit) { + allSame = false; + break; + } + } + if (!allSame) { + break; + } + } + if (allSame) { + result.addMatch(m); + } + } else { + // "different": require >= 2 groups. + // For each group, determine the unit of its first and last positions. + // If a group spans a single unit, that unit must not already be seen. + // Groups spanning multiple units (first unit != last unit) are always accepted. + int count = 0; + boolean allDifferent = true; + final Set usedUnits = new HashSet<>(); + for (final SortedSet group : groups) { + if (group.isEmpty()) { + continue; + } + count++; + final int startUnit = group.first() < unitMap.length ? unitMap[group.first()] : 0; + final int endUnit = group.last() < unitMap.length ? unitMap[group.last()] : 0; + if (startUnit == endUnit && !usedUnits.add(startUnit)) { + // Single-unit group already seen: not all different + allDifferent = false; + break; + } + // Multi-unit group (spans boundary): mark start unit but never reject + usedUnits.add(startUnit); + } + if (allDifferent && count > 1) { + result.addMatch(m); + } + } + } + return result; + } + + /** + * Compute the span between two token positions in the given unit. 
+ * For WORDS: last - first + 1. + * For SENTENCES/PARAGRAPHS: unit(last) - unit(first) + 1. + */ + private int unitSpan(final int first, final int last, final FTUnit unit) { + if (unit == FTUnit.WORDS) { + return last - first + 1; + } + final int[] unitMap = (unit == FTUnit.SENTENCES) ? sentenceOf : paragraphOf; + final int u1 = first < unitMap.length ? unitMap[first] : 0; + final int u2 = last < unitMap.length ? unitMap[last] : 0; + return Math.abs(u2 - u1) + 1; + } + + /** + * Compute the distance (gap) between two token positions in the given unit. + * Per XQFT 3.0 §4.5, distance counts the number of intervening units between + * two positions — i.e. the gap. "distance exactly 0 sentences" means adjacent + * sentences (no sentence between them), analogous to "distance 0 words" meaning + * adjacent words. Formula: abs(unitOf(pos2) - unitOf(pos1)) - 1. + */ + private int unitDistance(final int pos1, final int pos2, final FTUnit unit) { + if (unit == FTUnit.WORDS) { + return pos2 - pos1 - 1; + } + final int[] unitMap = (unit == FTUnit.SENTENCES) ? sentenceOf : paragraphOf; + final int u1 = pos1 < unitMap.length ? unitMap[pos1] : 0; + final int u2 = pos2 < unitMap.length ? unitMap[pos2] : 0; + return Math.abs(u2 - u1) - 1; + } + + /** + * Apply FTTimes constraint: the number of matches must satisfy the range. + * "occurs exactly N times" means exactly N distinct matches. + */ + private AllMatches applyTimes(final AllMatches input, final FTTimes ftTimes) + throws XPathException { + final FTRange range = ftTimes.getRange(); + final int[] bounds = evalRange(range); + final int min = bounds[0]; + final int max = bounds[1]; + final int matchCount = input.getMatches().size(); + + if (matchCount >= min && matchCount <= max) { + // If the count satisfies the range but AllMatches is empty (0 matches), + // return a single empty match to signal "constraint satisfied". + // Per XQFT 3.0 §4.8: 0 occurrences satisfies "at most N times". 
+ if (matchCount == 0) { + return singleEmptyMatch(); + } + return input; + } + return new AllMatches(); // constraint not satisfied + } + + // === Helpers === + + /** + * Cross product of two AllMatches: combine each match from left with + * each match from right. + */ + private AllMatches crossProduct(final AllMatches left, final AllMatches right) { + final AllMatches result = new AllMatches(); + for (final Match lm : left.getMatches()) { + for (final Match rm : right.getMatches()) { + result.addMatch(lm.combine(rm)); + } + } + return result; + } + + /** + * Collapse operand groups in all matches to single groups. + * Used after positional filters in nested FTSelection so outer filters + * treat the result as a single unit. + */ + private AllMatches collapseAllGroups(final AllMatches input) { + final AllMatches result = new AllMatches(); + for (final Match m : input.getMatches()) { + result.addMatch(m.collapseGroups()); + } + return result; + } + + private AllMatches singleEmptyMatch() { + final AllMatches am = new AllMatches(); + am.addMatch(new Match()); + return am; + } + + private int evalIntExpr(final Expression expr) throws XPathException { + final Sequence seq = expr.eval(contextSequence, null); + if (seq.isEmpty()) { + throw new XPathException(expr, ErrorCodes.XPTY0004, + "Full-text range/window/distance expression must evaluate to a single integer"); + } + final Item item = seq.itemAt(0); + final int type = item.getType(); + // Per XQFT 3.0: must be a non-negative integer + if (type != Type.INTEGER && type != Type.INT && type != Type.SHORT + && type != Type.LONG && type != Type.BYTE + && type != Type.UNSIGNED_INT && type != Type.UNSIGNED_SHORT + && type != Type.UNSIGNED_LONG && type != Type.UNSIGNED_BYTE + && type != Type.NON_NEGATIVE_INTEGER && type != Type.POSITIVE_INTEGER + && type != Type.NON_POSITIVE_INTEGER && type != Type.NEGATIVE_INTEGER) { + throw new XPathException(expr, ErrorCodes.XPTY0004, + "Full-text range/window/distance expression must 
evaluate to an integer, got: " + + Type.getTypeName(type)); + } + return item.toJavaObject(int.class); + } + + private int[] evalRange(final FTRange range) throws XPathException { + switch (range.getMode()) { + case EXACTLY: { + final int n = evalIntExpr(range.getExpr1()); + return new int[]{n, n}; + } + case AT_LEAST: { + final int n = evalIntExpr(range.getExpr1()); + return new int[]{n, Integer.MAX_VALUE}; + } + case AT_MOST: { + final int n = evalIntExpr(range.getExpr1()); + return new int[]{0, n}; + } + case FROM_TO: { + final int from = evalIntExpr(range.getExpr1()); + final int to = evalIntExpr(range.getExpr2()); + return new int[]{from, to}; + } + default: + return new int[]{0, Integer.MAX_VALUE}; + } + } + + /** + * Merge inherited options with local overrides. + */ + static FTMatchOptions mergeOptions(final FTMatchOptions inherited, final FTMatchOptions local) { + if (local == null) { + return inherited; + } + if (inherited == null) { + return local; + } + // Local overrides inherited + final FTMatchOptions merged = new FTMatchOptions(); + merged.setCaseMode(local.getCaseMode() != null ? local.getCaseMode() : inherited.getCaseMode()); + merged.setDiacriticsMode(local.getDiacriticsMode() != null ? local.getDiacriticsMode() : inherited.getDiacriticsMode()); + merged.setStemming(local.getStemming() != null ? local.getStemming() : inherited.getStemming()); + merged.setWildcards(local.getWildcards() != null ? local.getWildcards() : inherited.getWildcards()); + merged.setLanguage(local.getLanguage() != null ? local.getLanguage() : inherited.getLanguage()); + merged.setNoThesaurus(local.getNoThesaurus() != null ? local.getNoThesaurus() : inherited.getNoThesaurus()); + merged.setNoStopWords(local.getNoStopWords() != null ? 
local.getNoStopWords() : inherited.getNoStopWords()); + // Merge stop word lists (local overrides if non-empty) + if (!local.getInlineStopWords().isEmpty()) { + merged.getInlineStopWords().addAll(local.getInlineStopWords()); + } else if (inherited.getInlineStopWords() != null) { + merged.getInlineStopWords().addAll(inherited.getInlineStopWords()); + } + if (!local.getStopWordURIs().isEmpty()) { + merged.getStopWordURIs().addAll(local.getStopWordURIs()); + } else if (inherited.getStopWordURIs() != null) { + merged.getStopWordURIs().addAll(inherited.getStopWordURIs()); + } + // Merge thesaurus IDs (local overrides if non-empty) + if (!local.getThesaurusIDs().isEmpty()) { + merged.getThesaurusIDs().addAll(local.getThesaurusIDs()); + } else if (!inherited.getThesaurusIDs().isEmpty()) { + merged.getThesaurusIDs().addAll(inherited.getThesaurusIDs()); + } + return merged; + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTMatchOptions.java b/exist-core/src/main/java/org/exist/xquery/ft/FTMatchOptions.java new file mode 100644 index 00000000000..e9f67f97335 --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTMatchOptions.java @@ -0,0 +1,175 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.util.ExpressionDumper; + +import java.util.ArrayList; +import java.util.List; + +/** + * W3C XQFT 3.0 — FTMatchOptions. + * + * Collects match options specified via "using" clauses on an FTPrimaryWithOptions. + * Each option overrides the inherited default from the static context. + */ +public class FTMatchOptions { + + public enum CaseMode { SENSITIVE, INSENSITIVE, LOWERCASE, UPPERCASE } + public enum DiacriticsMode { SENSITIVE, INSENSITIVE } + + private CaseMode caseMode; + private DiacriticsMode diacriticsMode; + private Boolean stemming; // null = not specified + private Boolean wildcards; // null = not specified + private String language; // BCP 47 tag, null = not specified + private Boolean noThesaurus; + private final List thesaurusURIs = new ArrayList<>(); + private final List thesaurusIDs = new ArrayList<>(); + private Boolean noStopWords; + private boolean useDefaultStopWords; + private final List stopWordURIs = new ArrayList<>(); + private final List inlineStopWords = new ArrayList<>(); + private final List exceptStopWordURIs = new ArrayList<>(); + private final List exceptInlineStopWords = new ArrayList<>(); + private boolean hasConflict; + private String conflictDescription; + + /** + * A thesaurus reference with URI, optional relationship, and optional level range. 
+ */ + public static class ThesaurusID { + private final String uri; // null for "default" + private final String relationship; // null = any + private final int minLevels; // 0 = no minimum + private final int maxLevels; // Integer.MAX_VALUE = no maximum + + public ThesaurusID(final String uri, final String relationship, + final int minLevels, final int maxLevels) { + this.uri = uri; + this.relationship = relationship; + this.minLevels = minLevels; + this.maxLevels = maxLevels; + } + + public String getUri() { return uri; } + public String getRelationship() { return relationship; } + public int getMinLevels() { return minLevels; } + public int getMaxLevels() { return maxLevels; } + public boolean isDefault() { return uri == null; } + } + + public boolean hasConflict() { return hasConflict; } + public String getConflictDescription() { return conflictDescription; } + + public CaseMode getCaseMode() { return caseMode; } + public void setCaseMode(final CaseMode caseMode) { + if (this.caseMode != null && this.caseMode != caseMode) { + hasConflict = true; + conflictDescription = "Conflicting case options: " + this.caseMode + " and " + caseMode; + } + this.caseMode = caseMode; + } + + public DiacriticsMode getDiacriticsMode() { return diacriticsMode; } + public void setDiacriticsMode(final DiacriticsMode diacriticsMode) { + if (this.diacriticsMode != null && this.diacriticsMode != diacriticsMode) { + hasConflict = true; + conflictDescription = "Conflicting diacritics options: " + this.diacriticsMode + " and " + diacriticsMode; + } + this.diacriticsMode = diacriticsMode; + } + + public Boolean getStemming() { return stemming; } + public void setStemming(final Boolean stemming) { + if (this.stemming != null && !this.stemming.equals(stemming)) { + hasConflict = true; + conflictDescription = "Conflicting stemming options"; + } + this.stemming = stemming; + } + + public Boolean getWildcards() { return wildcards; } + public void setWildcards(final Boolean wildcards) { + if 
(this.wildcards != null && !this.wildcards.equals(wildcards)) { + hasConflict = true; + conflictDescription = "Conflicting wildcard options"; + } + this.wildcards = wildcards; + } + + public String getLanguage() { return language; } + public void setLanguage(final String language) { this.language = language; } + + public Boolean getNoThesaurus() { return noThesaurus; } + public void setNoThesaurus(final Boolean noThesaurus) { this.noThesaurus = noThesaurus; } + public List getThesaurusURIs() { return thesaurusURIs; } + public List getThesaurusIDs() { return thesaurusIDs; } + + public Boolean getNoStopWords() { return noStopWords; } + public void setNoStopWords(final Boolean noStopWords) { this.noStopWords = noStopWords; } + public boolean getUseDefaultStopWords() { return useDefaultStopWords; } + public void setUseDefaultStopWords(final boolean useDefaultStopWords) { this.useDefaultStopWords = useDefaultStopWords; } + public List getStopWordURIs() { return stopWordURIs; } + public List getInlineStopWords() { return inlineStopWords; } + public List getExceptStopWordURIs() { return exceptStopWordURIs; } + public List getExceptInlineStopWords() { return exceptInlineStopWords; } + + public void dump(final ExpressionDumper dumper) { + if (caseMode != null) { + dumper.display(" using case ").display(caseMode.name().toLowerCase()); + } + if (diacriticsMode != null) { + dumper.display(" using diacritics ").display(diacriticsMode.name().toLowerCase()); + } + if (stemming != null) { + dumper.display(stemming ? " using stemming" : " using no stemming"); + } + if (wildcards != null) { + dumper.display(wildcards ? 
" using wildcards" : " using no wildcards"); + } + if (language != null) { + dumper.display(" using language \"").display(language).display("\""); + } + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + if (caseMode != null) { + sb.append(" using case ").append(caseMode.name().toLowerCase()); + } + if (diacriticsMode != null) { + sb.append(" using diacritics ").append(diacriticsMode.name().toLowerCase()); + } + if (stemming != null) { + sb.append(stemming ? " using stemming" : " using no stemming"); + } + if (wildcards != null) { + sb.append(wildcards ? " using wildcards" : " using no wildcards"); + } + if (language != null) { + sb.append(" using language \"").append(language).append("\""); + } + return sb.toString(); + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTMildNot.java b/exist-core/src/main/java/org/exist/xquery/ft/FTMildNot.java new file mode 100644 index 00000000000..80ef0038efe --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTMildNot.java @@ -0,0 +1,135 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.ErrorCodes; +import org.exist.xquery.Expression; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * W3C XQFT 3.0 — FTMildNot. + * + *

FTMildNot ::= FTUnaryNot ( "not" "in" FTUnaryNot )*
+ */ +public class FTMildNot extends FTAbstractExpr { + + private final List operands = new ArrayList<>(); + + public FTMildNot(final XQueryContext context) { + super(context); + } + + public void addOperand(final Expression operand) { + operands.add(operand); + } + + public List getOperands() { + return Collections.unmodifiableList(operands); + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + contextInfo.setParent(this); + for (final Expression operand : operands) { + operand.analyze(contextInfo); + } + // XQFT 3.0 §3.3: operands of "not in" (mild not) must not contain + // ftnot (FTUnaryNot) or "occurs" (FTTimes). Raise FTST0001 if found. + if (operands.size() > 1) { + for (final Expression operand : operands) { + validateMildNotOperand(operand); + } + } + } + + /** + * Recursively check that an expression tree does not contain FTUnaryNot or FTTimes. + */ + private void validateMildNotOperand(final Expression expr) throws XPathException { + if (expr instanceof FTUnaryNot) { + throw new XPathException(this, ErrorCodes.FTST0001, + "ftnot is not allowed as operand of 'not in' (mild not)"); + } + if (expr instanceof FTWords) { + final FTWords ftWords = (FTWords) expr; + if (ftWords.getFTTimes() != null) { + throw new XPathException(this, ErrorCodes.FTST0001, + "'occurs' is not allowed as operand of 'not in' (mild not)"); + } + } + // Recurse into sub-expressions + if (expr instanceof FTAnd) { + for (final Expression child : ((FTAnd) expr).getOperands()) { + validateMildNotOperand(child); + } + } else if (expr instanceof FTOr) { + for (final Expression child : ((FTOr) expr).getOperands()) { + validateMildNotOperand(child); + } + } else if (expr instanceof FTMildNot) { + for (final Expression child : ((FTMildNot) expr).getOperands()) { + validateMildNotOperand(child); + } + } else if (expr instanceof FTPrimaryWithOptions) { + validateMildNotOperand(((FTPrimaryWithOptions) expr).getPrimary()); + } else if (expr 
instanceof FTSelection) { + validateMildNotOperand(((FTSelection) expr).getFTOr()); + } + } + + @Override + public void dump(final ExpressionDumper dumper) { + for (int i = 0; i < operands.size(); i++) { + if (i > 0) { + dumper.display(" not in "); + } + operands.get(i).dump(dumper); + } + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + for (int i = 0; i < operands.size(); i++) { + if (i > 0) { + sb.append(" not in "); + } + sb.append(operands.get(i).toString()); + } + return sb.toString(); + } + + @Override + public void resetState(final boolean postOptimization) { + super.resetState(postOptimization); + for (final Expression operand : operands) { + operand.resetState(postOptimization); + } + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTOr.java b/exist-core/src/main/java/org/exist/xquery/ft/FTOr.java new file mode 100644 index 00000000000..7a8ccaf0821 --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTOr.java @@ -0,0 +1,92 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.Expression; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * W3C XQFT 3.0 — FTOr. + * + *
FTOr ::= FTAnd ( "ftor" FTAnd )*
+ */ +public class FTOr extends FTAbstractExpr { + + private final List operands = new ArrayList<>(); + + public FTOr(final XQueryContext context) { + super(context); + } + + public void addOperand(final Expression operand) { + operands.add(operand); + } + + public List getOperands() { + return Collections.unmodifiableList(operands); + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + contextInfo.setParent(this); + for (final Expression operand : operands) { + operand.analyze(contextInfo); + } + } + + @Override + public void dump(final ExpressionDumper dumper) { + for (int i = 0; i < operands.size(); i++) { + if (i > 0) { + dumper.display(" ftor "); + } + operands.get(i).dump(dumper); + } + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + for (int i = 0; i < operands.size(); i++) { + if (i > 0) { + sb.append(" ftor "); + } + sb.append(operands.get(i).toString()); + } + return sb.toString(); + } + + @Override + public void resetState(final boolean postOptimization) { + super.resetState(postOptimization); + for (final Expression operand : operands) { + operand.resetState(postOptimization); + } + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTOrder.java b/exist-core/src/main/java/org/exist/xquery/ft/FTOrder.java new file mode 100644 index 00000000000..7db0e75a8f5 --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTOrder.java @@ -0,0 +1,54 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; + +/** + * W3C XQFT 3.0 — FTOrder positional filter. + * + *
FTOrder ::= "ordered"
+ */ +public class FTOrder extends FTAbstractExpr { + + public FTOrder(final XQueryContext context) { + super(context); + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + // no children to analyze + } + + @Override + public void dump(final ExpressionDumper dumper) { + dumper.display("ordered"); + } + + @Override + public String toString() { + return "ordered"; + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTPrimaryWithOptions.java b/exist-core/src/main/java/org/exist/xquery/ft/FTPrimaryWithOptions.java new file mode 100644 index 00000000000..2d0b7b896ed --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTPrimaryWithOptions.java @@ -0,0 +1,112 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.Expression; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; + +/** + * W3C XQFT 3.0 — FTPrimaryWithOptions. + * + *
FTPrimaryWithOptions ::= FTPrimary FTMatchOptions? FTWeight?
+ */ +public class FTPrimaryWithOptions extends FTAbstractExpr { + + private Expression primary; + private FTMatchOptions matchOptions; + private Expression weight; + + public FTPrimaryWithOptions(final XQueryContext context) { + super(context); + } + + public void setPrimary(final Expression primary) { + this.primary = primary; + } + + public Expression getPrimary() { + return primary; + } + + public void setMatchOptions(final FTMatchOptions matchOptions) { + this.matchOptions = matchOptions; + } + + public FTMatchOptions getMatchOptions() { + return matchOptions; + } + + public void setWeight(final Expression weight) { + this.weight = weight; + } + + public Expression getWeight() { + return weight; + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + contextInfo.setParent(this); + primary.analyze(contextInfo); + if (weight != null) { + weight.analyze(contextInfo); + } + } + + @Override + public void dump(final ExpressionDumper dumper) { + primary.dump(dumper); + if (matchOptions != null) { + matchOptions.dump(dumper); + } + if (weight != null) { + dumper.display(" weight { "); + weight.dump(dumper); + dumper.display(" }"); + } + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + sb.append(primary.toString()); + if (matchOptions != null) { + sb.append(matchOptions.toString()); + } + if (weight != null) { + sb.append(" weight { ").append(weight.toString()).append(" }"); + } + return sb.toString(); + } + + @Override + public void resetState(final boolean postOptimization) { + super.resetState(postOptimization); + primary.resetState(postOptimization); + if (weight != null) { + weight.resetState(postOptimization); + } + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTRange.java b/exist-core/src/main/java/org/exist/xquery/ft/FTRange.java new file mode 100644 index 00000000000..ab90192d202 --- /dev/null +++ 
b/exist-core/src/main/java/org/exist/xquery/ft/FTRange.java @@ -0,0 +1,132 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.Expression; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; + +/** + * W3C XQFT 3.0 — FTRange. + * + *
+ * FTRange ::= ("exactly" AdditiveExpr)
+ *           | ("at" "least" AdditiveExpr)
+ *           | ("at" "most" AdditiveExpr)
+ *           | ("from" AdditiveExpr "to" AdditiveExpr)
+ * 
+ */ +public class FTRange extends FTAbstractExpr { + + public enum RangeMode { + EXACTLY, AT_LEAST, AT_MOST, FROM_TO + } + + private RangeMode mode; + private Expression expr1; + private Expression expr2; // only for FROM_TO + + public FTRange(final XQueryContext context) { + super(context); + } + + public void setMode(final RangeMode mode) { + this.mode = mode; + } + + public RangeMode getMode() { + return mode; + } + + public void setExpr1(final Expression expr1) { + this.expr1 = expr1; + } + + public Expression getExpr1() { + return expr1; + } + + public void setExpr2(final Expression expr2) { + this.expr2 = expr2; + } + + public Expression getExpr2() { + return expr2; + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + contextInfo.setParent(this); + expr1.analyze(contextInfo); + if (expr2 != null) { + expr2.analyze(contextInfo); + } + } + + @Override + public void dump(final ExpressionDumper dumper) { + switch (mode) { + case EXACTLY: + dumper.display("exactly "); + expr1.dump(dumper); + break; + case AT_LEAST: + dumper.display("at least "); + expr1.dump(dumper); + break; + case AT_MOST: + dumper.display("at most "); + expr1.dump(dumper); + break; + case FROM_TO: + dumper.display("from "); + expr1.dump(dumper); + dumper.display(" to "); + expr2.dump(dumper); + break; + default: + break; + } + } + + @Override + public String toString() { + switch (mode) { + case EXACTLY: return "exactly " + expr1.toString(); + case AT_LEAST: return "at least " + expr1.toString(); + case AT_MOST: return "at most " + expr1.toString(); + case FROM_TO: return "from " + expr1.toString() + " to " + expr2.toString(); + default: return ""; + } + } + + @Override + public void resetState(final boolean postOptimization) { + super.resetState(postOptimization); + expr1.resetState(postOptimization); + if (expr2 != null) { + expr2.resetState(postOptimization); + } + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTScope.java 
b/exist-core/src/main/java/org/exist/xquery/ft/FTScope.java new file mode 100644 index 00000000000..a3c359ca7d4 --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTScope.java @@ -0,0 +1,78 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; + +/** + * W3C XQFT 3.0 — FTScope positional filter. + * + *
FTScope ::= ("same" | "different") FTBigUnit
+ */ +public class FTScope extends FTAbstractExpr { + + public enum ScopeType { SAME, DIFFERENT } + public enum BigUnit { SENTENCE, PARAGRAPH } + + private ScopeType scopeType; + private BigUnit bigUnit; + + public FTScope(final XQueryContext context) { + super(context); + } + + public void setScopeType(final ScopeType scopeType) { + this.scopeType = scopeType; + } + + public ScopeType getScopeType() { + return scopeType; + } + + public void setBigUnit(final BigUnit bigUnit) { + this.bigUnit = bigUnit; + } + + public BigUnit getBigUnit() { + return bigUnit; + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + // no children to analyze + } + + @Override + public void dump(final ExpressionDumper dumper) { + dumper.display(scopeType.name().toLowerCase()); + dumper.display(' '); + dumper.display(bigUnit.name().toLowerCase()); + } + + @Override + public String toString() { + return scopeType.name().toLowerCase() + " " + bigUnit.name().toLowerCase(); + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTSelection.java b/exist-core/src/main/java/org/exist/xquery/ft/FTSelection.java new file mode 100644 index 00000000000..2413ce55ecc --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTSelection.java @@ -0,0 +1,116 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AbstractExpression; +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.Expression; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; +import org.exist.xquery.value.Item; +import org.exist.xquery.value.Sequence; +import org.exist.xquery.value.Type; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * W3C XQFT 3.0 — FTSelection. + * + *
FTSelection ::= FTOr FTPosFilter*
+ * + * Wraps an FTOr expression with optional positional filters. + */ +public class FTSelection extends AbstractExpression { + + private Expression ftOr; + private final List posFilters = new ArrayList<>(); + + public FTSelection(final XQueryContext context) { + super(context); + } + + public void setFTOr(final Expression ftOr) { + this.ftOr = ftOr; + } + + public Expression getFTOr() { + return ftOr; + } + + public void addPosFilter(final Expression filter) { + posFilters.add(filter); + } + + public List getPosFilters() { + return Collections.unmodifiableList(posFilters); + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + contextInfo.setParent(this); + ftOr.analyze(contextInfo); + for (final Expression filter : posFilters) { + filter.analyze(contextInfo); + } + } + + @Override + public Sequence eval(final Sequence contextSequence, final Item contextItem) throws XPathException { + throw new XPathException(this, "FTSelection cannot be evaluated directly"); + } + + @Override + public int returnsType() { + return Type.ITEM; + } + + @Override + public void dump(final ExpressionDumper dumper) { + ftOr.dump(dumper); + for (final Expression filter : posFilters) { + dumper.display(' '); + filter.dump(dumper); + } + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + sb.append(ftOr.toString()); + for (final Expression filter : posFilters) { + sb.append(' ').append(filter.toString()); + } + return sb.toString(); + } + + @Override + public void resetState(final boolean postOptimization) { + super.resetState(postOptimization); + ftOr.resetState(postOptimization); + for (final Expression filter : posFilters) { + filter.resetState(postOptimization); + } + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTThesaurus.java b/exist-core/src/main/java/org/exist/xquery/ft/FTThesaurus.java new file mode 100644 index 00000000000..7d8d09bd293 --- /dev/null +++ 
/**
 * W3C XQFT 3.0 thesaurus support.
 *
 * Loads thesaurus data from XML files using the XQFTTS thesaurus schema
 * and provides term expansion based on relationship type and level constraints.
 *
 * Terms are compared case-insensitively; every key and every returned term is
 * lower-cased with {@link Locale#ROOT} so results do not depend on the JVM's
 * default locale.
 *
 * @see "XQFT 3.0 §3.4.3"
 */
public class FTThesaurus {

    private static final String THESAURUS_NS = "http://www.w3.org/2007/xqftts/thesaurus";

    /** Lower-cased entry term mapped to its direct (level 1) synonyms. */
    private final Map<String, List<Synonym>> entries = new HashMap<>();

    /**
     * A single thesaurus entry: a term with its synonyms at various levels.
     */
    private static class Synonym {
        final String term;
        final String relationship;
        final List<Synonym> children;

        Synonym(final String term, final String relationship, final List<Synonym> children) {
            this.term = term;
            this.relationship = relationship;
            this.children = children;
        }
    }

    /**
     * Load thesaurus from an XML file.
     *
     * The parser is configured so that external entities, external DTDs and
     * XInclude are never resolved (XXE hardening); the thesaurus location may
     * originate from a query.
     *
     * @param file path of the thesaurus document
     * @return the loaded thesaurus; entries without a {@code term} child are skipped
     * @throws Exception if the parser cannot be configured or the file cannot be read or parsed
     */
    public static FTThesaurus load(final Path file) throws Exception {
        final FTThesaurus thes = new FTThesaurus();
        final DocumentBuilder db = newHardenedFactory().newDocumentBuilder();
        try (final InputStream is = Files.newInputStream(file)) {
            final Document doc = db.parse(is);
            final NodeList entryNodes = doc.getDocumentElement().getElementsByTagNameNS(THESAURUS_NS, "entry");
            for (int i = 0; i < entryNodes.getLength(); i++) {
                final Element entryEl = (Element) entryNodes.item(i);
                final String term = getDirectChildText(entryEl, "term");
                if (term == null) {
                    continue;
                }
                // Several <entry> elements for the same term are merged.
                thes.entries.computeIfAbsent(fold(term), k -> new ArrayList<>()).addAll(parseSynonyms(entryEl));
            }
        }
        return thes;
    }

    /** Creates a namespace-aware factory with external entity/DTD resolution switched off. */
    private static DocumentBuilderFactory newHardenedFactory() throws javax.xml.parsers.ParserConfigurationException {
        final DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
        dbf.setNamespaceAware(true);
        dbf.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
        dbf.setFeature("http://xml.org/sax/features/external-general-entities", false);
        dbf.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
        dbf.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
        dbf.setXIncludeAware(false);
        dbf.setExpandEntityReferences(false);
        return dbf;
    }

    /** Locale-independent case folding used for every key and result term. */
    private static String fold(final String term) {
        return term.toLowerCase(Locale.ROOT);
    }

    /** Reads the direct {@code synonym} children of {@code parent}, recursing into nested synonyms. */
    private static List<Synonym> parseSynonyms(final Element parent) {
        final List<Synonym> result = new ArrayList<>();
        final NodeList synNodes = parent.getChildNodes();
        for (int i = 0; i < synNodes.getLength(); i++) {
            if (!(synNodes.item(i) instanceof Element)) {
                continue;
            }
            final Element el = (Element) synNodes.item(i);
            if (!"synonym".equals(el.getLocalName())) {
                continue;
            }
            final String term = getDirectChildText(el, "term");
            final String rel = getDirectChildText(el, "relationship");
            if (term == null) {
                continue;
            }
            final List<Synonym> children = parseSynonyms(el);
            result.add(new Synonym(term, rel != null ? rel : "", children));
        }
        return result;
    }

    /** Returns the trimmed text of the first direct child element with the given local name, or null. */
    private static String getDirectChildText(final Element parent, final String localName) {
        final NodeList children = parent.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            if (children.item(i) instanceof Element) {
                final Element child = (Element) children.item(i);
                if (localName.equals(child.getLocalName())) {
                    return child.getTextContent().trim();
                }
            }
        }
        return null;
    }

    /**
     * Expand a term using this thesaurus.
     *
     * Direct synonyms of the term are level 1, their nested or transitive
     * synonyms level 2, and so on.
     *
     * @param term the search term to expand
     * @param relationship if non-null and non-empty, only follow synonyms with this
     *                     relationship (compared case-insensitively)
     * @param minLevels minimum level a synonym must have to be returned
     *                  (0 and 1 both include direct synonyms)
     * @param maxLevels maximum level to descend to (Integer.MAX_VALUE = unlimited)
     * @return lower-cased expanded terms in discovery order (always includes the original term)
     */
    public Set<String> expand(final String term, final String relationship,
                              final int minLevels, final int maxLevels) {
        final String key = fold(term);
        final Set<String> result = new LinkedHashSet<>();
        result.add(key);
        final List<Synonym> syns = entries.get(key);
        if (syns != null) {
            for (final Synonym syn : syns) {
                // A fresh visited set per direct synonym guards each walk against cycles.
                collectSynonyms(syn, relationship, 1, minLevels, maxLevels, result, new HashSet<>());
            }
        }
        return result;
    }

    private void collectSynonyms(final Synonym syn, final String relationship,
                                 final int currentLevel, final int minLevels, final int maxLevels,
                                 final Set<String> result, final Set<String> visited) {
        if (currentLevel > maxLevels) {
            return;
        }
        // A synonym with a non-matching relationship is skipped together with its subtree.
        if (relationship != null && !relationship.isEmpty()
                && !relationship.equalsIgnoreCase(syn.relationship)) {
            return;
        }
        final String lc = fold(syn.term);
        if (!visited.add(lc)) {
            return;
        }
        if (currentLevel >= minLevels) {
            result.add(lc);
        }
        // Recurse into nested synonyms (sub-levels)
        for (final Synonym child : syn.children) {
            collectSynonyms(child, relationship, currentLevel + 1, minLevels, maxLevels, result, visited);
        }
        // Also look up this synonym term in the main entries for transitive expansion
        final List<Synonym> transitive = entries.get(lc);
        if (transitive != null) {
            for (final Synonym ts : transitive) {
                collectSynonyms(ts, relationship, currentLevel + 1, minLevels, maxLevels, result, visited);
            }
        }
    }
}
FTTimes ::= "occurs" FTRange "times"
+ */ +public class FTTimes extends FTAbstractExpr { + + private FTRange range; + + public FTTimes(final XQueryContext context) { + super(context); + } + + public void setRange(final FTRange range) { + this.range = range; + } + + public FTRange getRange() { + return range; + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + contextInfo.setParent(this); + range.analyze(contextInfo); + } + + @Override + public void dump(final ExpressionDumper dumper) { + dumper.display("occurs "); + range.dump(dumper); + dumper.display(" times"); + } + + @Override + public String toString() { + return "occurs " + range.toString() + " times"; + } + + @Override + public void resetState(final boolean postOptimization) { + super.resetState(postOptimization); + range.resetState(postOptimization); + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTUnaryNot.java b/exist-core/src/main/java/org/exist/xquery/ft/FTUnaryNot.java new file mode 100644 index 00000000000..4f3eb152827 --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTUnaryNot.java @@ -0,0 +1,73 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.Expression; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; + +/** + * W3C XQFT 3.0 — FTUnaryNot. + * + *
FTUnaryNot ::= ("ftnot")? FTPrimaryWithOptions
+ */ +public class FTUnaryNot extends FTAbstractExpr { + + private Expression operand; + + public FTUnaryNot(final XQueryContext context) { + super(context); + } + + public void setOperand(final Expression operand) { + this.operand = operand; + } + + public Expression getOperand() { + return operand; + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + contextInfo.setParent(this); + operand.analyze(contextInfo); + } + + @Override + public void dump(final ExpressionDumper dumper) { + dumper.display("ftnot "); + operand.dump(dumper); + } + + @Override + public String toString() { + return "ftnot " + operand.toString(); + } + + @Override + public void resetState(final boolean postOptimization) { + super.resetState(postOptimization); + operand.resetState(postOptimization); + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTUnit.java b/exist-core/src/main/java/org/exist/xquery/ft/FTUnit.java new file mode 100644 index 00000000000..be45a378540 --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTUnit.java @@ -0,0 +1,45 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +/** + * W3C XQFT 3.0 — FTUnit. + * + *
FTUnit ::= "words" | "sentences" | "paragraphs"
/**
 * W3C XQFT 3.0 — FTUnit.
 *
 * <pre>FTUnit ::= "words" | "sentences" | "paragraphs"</pre>
 *
 * Each constant carries its exact query keyword, so no locale-sensitive case
 * conversion is needed to map between constants and keywords.
 */
public enum FTUnit {
    WORDS("words"),
    SENTENCES("sentences"),
    PARAGRAPHS("paragraphs");

    private final String keyword;

    FTUnit(final String keyword) {
        this.keyword = keyword;
    }

    /**
     * Resolves a unit from its query keyword (case-sensitive).
     *
     * @param s the keyword: "words", "sentences" or "paragraphs"
     * @return the matching unit
     * @throws IllegalArgumentException if {@code s} is null or not a known keyword
     */
    public static FTUnit fromString(final String s) {
        for (final FTUnit unit : values()) {
            // keyword.equals(s) is null-safe, so null falls through to the exception below
            if (unit.keyword.equals(s)) {
                return unit;
            }
        }
        throw new IllegalArgumentException("Unknown FTUnit: " + s);
    }

    /**
     * @return the query keyword of this unit
     */
    @Override
    public String toString() {
        return keyword;
    }
}
FTWindow ::= "window" AdditiveExpr FTUnit
+ */ +public class FTWindow extends FTAbstractExpr { + + private Expression windowExpr; + private FTUnit unit; + + public FTWindow(final XQueryContext context) { + super(context); + } + + public void setWindowExpr(final Expression windowExpr) { + this.windowExpr = windowExpr; + } + + public Expression getWindowExpr() { + return windowExpr; + } + + public void setUnit(final FTUnit unit) { + this.unit = unit; + } + + public FTUnit getUnit() { + return unit; + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + contextInfo.setParent(this); + windowExpr.analyze(contextInfo); + } + + @Override + public void dump(final ExpressionDumper dumper) { + dumper.display("window "); + windowExpr.dump(dumper); + dumper.display(' ').display(unit.toString()); + } + + @Override + public String toString() { + return "window " + windowExpr.toString() + " " + unit.toString(); + } + + @Override + public void resetState(final boolean postOptimization) { + super.resetState(postOptimization); + windowExpr.resetState(postOptimization); + } +} diff --git a/exist-core/src/main/java/org/exist/xquery/ft/FTWords.java b/exist-core/src/main/java/org/exist/xquery/ft/FTWords.java new file mode 100644 index 00000000000..716e70d1fc5 --- /dev/null +++ b/exist-core/src/main/java/org/exist/xquery/ft/FTWords.java @@ -0,0 +1,124 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.xquery.AnalyzeContextInfo; +import org.exist.xquery.Expression; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQueryContext; +import org.exist.xquery.util.ExpressionDumper; + +/** + * W3C XQFT 3.0 — FTWords. + * + *
FTWords ::= FTWordsValue FTAnyallOption?
+ * + * The terminal node in the FT expression tree: matches words or phrases. + */ +public class FTWords extends FTAbstractExpr { + + /** any, any word, all, all words, phrase */ + public enum AnyallMode { + ANY, ANY_WORD, ALL, ALL_WORDS, PHRASE; + + public static AnyallMode fromString(final String s) { + switch (s) { + case "any": return ANY; + case "any word": return ANY_WORD; + case "all": return ALL; + case "all words": return ALL_WORDS; + case "phrase": return PHRASE; + default: return ANY; + } + } + + @Override + public String toString() { + return name().toLowerCase().replace('_', ' '); + } + } + + private Expression wordsValue; + private AnyallMode mode = AnyallMode.ANY; + private FTTimes ftTimes; + + public FTWords(final XQueryContext context) { + super(context); + } + + public void setWordsValue(final Expression wordsValue) { + this.wordsValue = wordsValue; + } + + public Expression getWordsValue() { + return wordsValue; + } + + public void setMode(final AnyallMode mode) { + this.mode = mode; + } + + public AnyallMode getMode() { + return mode; + } + + public void setFTTimes(final FTTimes ftTimes) { + this.ftTimes = ftTimes; + } + + public FTTimes getFTTimes() { + return ftTimes; + } + + @Override + public void analyze(final AnalyzeContextInfo contextInfo) throws XPathException { + contextInfo.setParent(this); + wordsValue.analyze(contextInfo); + if (ftTimes != null) { + ftTimes.analyze(contextInfo); + } + } + + @Override + public void dump(final ExpressionDumper dumper) { + wordsValue.dump(dumper); + if (mode != AnyallMode.ANY) { + dumper.display(' ').display(mode.toString()); + } + } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder(); + sb.append(wordsValue.toString()); + if (mode != AnyallMode.ANY) { + sb.append(' ').append(mode.toString()); + } + return sb.toString(); + } + + @Override + public void resetState(final boolean postOptimization) { + super.resetState(postOptimization); + 
wordsValue.resetState(postOptimization); + } +} diff --git a/exist-core/src/test/java/org/exist/xquery/ft/FTConformanceTest.java b/exist-core/src/test/java/org/exist/xquery/ft/FTConformanceTest.java new file mode 100644 index 00000000000..cc12194ccc5 --- /dev/null +++ b/exist-core/src/test/java/org/exist/xquery/ft/FTConformanceTest.java @@ -0,0 +1,622 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.EXistException; +import org.exist.security.PermissionDeniedException; +import org.exist.storage.BrokerPool; +import org.exist.storage.DBBroker; +import org.exist.test.ExistEmbeddedServer; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQuery; +import org.exist.xquery.value.Sequence; +import org.junit.ClassRule; +import org.junit.Test; + +import static org.junit.Assert.*; + +/** + * W3C XQFT 3.0 conformance tests based on spec examples and XQFTTS patterns. + * + * Tests are organized by spec section. Each test name includes the spec section + * reference for traceability. 
+ * + * @see W3C XQFT 3.0 Spec + */ +public class FTConformanceTest { + + @ClassRule + public static final ExistEmbeddedServer existEmbeddedServer = new ExistEmbeddedServer(true, true); + + private Sequence executeQuery(final String query) throws EXistException, PermissionDeniedException, XPathException { + final BrokerPool pool = existEmbeddedServer.getBrokerPool(); + final XQuery xquery = pool.getXQueryService(); + try (final DBBroker broker = pool.getBroker()) { + return xquery.execute(broker, query, null); + } + } + + private boolean evalBool(final String query) throws EXistException, PermissionDeniedException, XPathException { + final Sequence result = executeQuery(query); + assertNotNull(result); + assertEquals(1, result.getItemCount()); + return result.effectiveBooleanValue(); + } + + private int evalCount(final String query) throws EXistException, PermissionDeniedException, XPathException { + return executeQuery(query).getItemCount(); + } + + private String evalString(final String query) throws EXistException, PermissionDeniedException, XPathException { + return executeQuery(query).getStringValue(); + } + + // ========================================================================= + // §2.1 FTContainsExpr — basic "contains text" semantics + // ========================================================================= + + @Test + public void s2_1_basicContainsText() throws Exception { + assertTrue(evalBool("'usability testing' contains text 'usability'")); + } + + @Test + public void s2_1_noMatch() throws Exception { + assertFalse(evalBool("'usability testing' contains text 'performance'")); + } + + @Test + public void s2_1_multiWordMatch() throws Exception { + // Default "any" mode: each search string is treated as a phrase + assertTrue(evalBool("'usability testing and analysis' contains text 'usability testing'")); + } + + @Test + public void s2_1_emptyStringAlwaysMatches() throws Exception { + assertTrue(evalBool("'anything' contains text ''")); + } + + 
@Test + public void s2_1_xmlElement() throws Exception { + assertTrue(evalBool("

The quick brown fox

contains text 'quick'")); + } + + @Test + public void s2_1_variableSource() throws Exception { + assertTrue(evalBool("let $x := 'hello world' return $x contains text 'hello'")); + } + + // ========================================================================= + // §2.2 FTWords — word/phrase matching with AnyallOption + // ========================================================================= + + // --- "any" (default) --- + + @Test + public void s2_2_anyDefault() throws Exception { + // "any" is the default; any single search string can match as a phrase + assertTrue(evalBool("'hello world' contains text 'hello'")); + } + + @Test + public void s2_2_anyMultipleStrings() throws Exception { + // With computed value producing multiple strings (must use {Expr} syntax) + assertTrue(evalBool("'hello world' contains text {('goodbye', 'hello')}")); + } + + // --- "any word" --- + + @Test + public void s2_2_anyWord() throws Exception { + // Tokenize into individual words; any one can match + assertTrue(evalBool("'hello world' contains text 'goodbye hello' any word")); + } + + @Test + public void s2_2_anyWordNoMatch() throws Exception { + assertFalse(evalBool("'hello world' contains text 'goodbye farewell' any word")); + } + + // --- "all" --- + + @Test + public void s2_2_all() throws Exception { + // All search strings must match (each as a phrase) + assertTrue(evalBool("'hello world' contains text 'hello' all")); + } + + @Test + public void s2_2_allMultiple() throws Exception { + assertTrue(evalBool("'hello world' contains text {('hello', 'world')} all")); + } + + @Test + public void s2_2_allFails() throws Exception { + assertFalse(evalBool("'hello world' contains text {('hello', 'gone')} all")); + } + + // --- "all words" --- + + @Test + public void s2_2_allWords() throws Exception { + // Tokenize into words; all must individually match + assertTrue(evalBool("'the quick brown fox' contains text 'quick fox' all words")); + } + + @Test + public void s2_2_allWordsFail() 
throws Exception { + assertFalse(evalBool("'the quick brown fox' contains text 'quick gone' all words")); + } + + // --- "phrase" --- + + @Test + public void s2_2_phrase() throws Exception { + // All words form one phrase — must appear consecutively + assertTrue(evalBool("'the quick brown fox' contains text 'quick brown' phrase")); + } + + @Test + public void s2_2_phraseNoMatch() throws Exception { + // Words not consecutive + assertFalse(evalBool("'the quick brown fox' contains text 'quick fox' phrase")); + } + + // ========================================================================= + // §2.3 FTOr, FTAnd, FTMildNot, FTUnaryNot + // ========================================================================= + + @Test + public void s2_3_ftor() throws Exception { + assertTrue(evalBool("'hello world' contains text 'hello' ftor 'goodbye'")); + assertTrue(evalBool("'hello world' contains text 'goodbye' ftor 'hello'")); + assertFalse(evalBool("'hello world' contains text 'goodbye' ftor 'farewell'")); + } + + @Test + public void s2_3_ftand() throws Exception { + assertTrue(evalBool("'hello world' contains text 'hello' ftand 'world'")); + assertFalse(evalBool("'hello world' contains text 'hello' ftand 'gone'")); + } + + @Test + public void s2_3_ftnot() throws Exception { + // ftnot: negation — matches if search term does NOT appear + assertTrue(evalBool("'hello world' contains text ftnot 'gone'")); + assertFalse(evalBool("'hello world' contains text ftnot 'hello'")); + } + + @Test + public void s2_3_mildNot() throws Exception { + // "not in": matches from left that don't overlap with right positions + // "hello" matches at pos 0, "hello" also matches at pos 0 in right operand + // So the match DOES overlap → should be excluded + assertFalse(evalBool("'hello world' contains text 'hello' not in 'hello'")); + } + + @Test + public void s2_3_mildNotNoOverlap() throws Exception { + // "hello" at pos 0, "world" at pos 1 — no overlap + assertTrue(evalBool("'hello world' 
contains text 'hello' not in 'world'")); + } + + @Test + public void s2_3_complexBoolean() throws Exception { + // Nested: (A ftand B) ftor C + assertTrue(evalBool( + "'the quick brown fox' contains text ('quick' ftand 'fox') ftor 'elephant'" + )); + assertTrue(evalBool( + "'the quick brown fox' contains text 'elephant' ftor ('quick' ftand 'fox')" + )); + } + + // ========================================================================= + // §2.4 Positional Filters + // ========================================================================= + + // --- ordered --- + + @Test + public void s2_4_ordered() throws Exception { + assertTrue(evalBool( + "'the quick brown fox' contains text 'quick' ftand 'fox' ordered" + )); + } + + @Test + public void s2_4_orderedReverse() throws Exception { + // "fox" (first operand) at pos 3, "quick" (second operand) at pos 1. + // Ordered requires first operand before second in text → 3 > 1 → fails. + assertFalse(evalBool( + "'the quick brown fox' contains text 'fox' ftand 'quick' ordered" + )); + } + + // --- window --- + + @Test + public void s2_4_windowFits() throws Exception { + // "quick" at pos 1, "brown" at pos 2 → span = 2, fits in window 3 + assertTrue(evalBool( + "'the quick brown fox' contains text 'quick' ftand 'brown' window 3 words" + )); + } + + @Test + public void s2_4_windowTooSmall() throws Exception { + // "quick" at pos 1, "fox" at pos 3 → span = 3, doesn't fit in window 2 + assertFalse(evalBool( + "'the quick brown fox' contains text 'quick' ftand 'fox' window 2 words" + )); + } + + @Test + public void s2_4_windowExact() throws Exception { + // span = 3, window = 3 → exactly fits + assertTrue(evalBool( + "'the quick brown fox' contains text 'quick' ftand 'fox' window 3 words" + )); + } + + // --- distance --- + + @Test + public void s2_4_distanceExactly() throws Exception { + // "quick" at pos 1, "brown" at pos 2 → gap = 0 + assertTrue(evalBool( + "'the quick brown fox' contains text 'quick' ftand 'brown' distance 
exactly 0 words" + )); + } + + @Test + public void s2_4_distanceAtMost() throws Exception { + // "quick" at pos 1, "fox" at pos 3 → gap = 1 + assertTrue(evalBool( + "'the quick brown fox' contains text 'quick' ftand 'fox' distance at most 2 words" + )); + } + + @Test + public void s2_4_distanceFromTo() throws Exception { + // gap = 1 (one word "brown" between "quick" and "fox") + assertTrue(evalBool( + "'the quick brown fox' contains text 'quick' ftand 'fox' distance from 1 to 3 words" + )); + } + + // --- at start / at end / entire content --- + + @Test + public void s2_4_atStart() throws Exception { + assertTrue(evalBool("'hello world' contains text 'hello' at start")); + assertFalse(evalBool("'hello world' contains text 'world' at start")); + } + + @Test + public void s2_4_atEnd() throws Exception { + assertTrue(evalBool("'hello world' contains text 'world' at end")); + assertFalse(evalBool("'hello world' contains text 'hello' at end")); + } + + @Test + public void s2_4_entireContent() throws Exception { + assertTrue(evalBool("'hello' contains text 'hello' entire content")); + assertFalse(evalBool("'hello world' contains text 'hello' entire content")); + } + + @Test + public void s2_4_entireContentAllWords() throws Exception { + assertTrue(evalBool( + "'hello world' contains text 'hello world' all words entire content" + )); + } + + // ========================================================================= + // §2.5 Match Options + // ========================================================================= + + // --- case --- + + @Test + public void s2_5_caseSensitive() throws Exception { + assertFalse(evalBool("'Hello World' contains text 'hello' using case sensitive")); + assertTrue(evalBool("'Hello World' contains text 'Hello' using case sensitive")); + } + + @Test + public void s2_5_caseInsensitive() throws Exception { + assertTrue(evalBool("'Hello World' contains text 'hello' using case insensitive")); + assertTrue(evalBool("'HELLO WORLD' contains text 
'hello' using case insensitive")); + } + + @Test + public void s2_5_lowercase() throws Exception { + // XQFTTS interpretation: "lowercase" only matches tokens already in lowercase. + // "Hello" is mixed case, so it doesn't match "hello" using lowercase. + assertFalse(evalBool("'Hello World' contains text 'hello' using lowercase")); + // All-lowercase source matches + assertTrue(evalBool("'hello world' contains text 'hello' using lowercase")); + } + + @Test + public void s2_5_uppercase() throws Exception { + // XQFTTS interpretation: "uppercase" only matches tokens already in uppercase. + // "Hello" is mixed case, so it doesn't match "HELLO" using uppercase. + assertFalse(evalBool("'Hello World' contains text 'HELLO' using uppercase")); + // All-uppercase source matches + assertTrue(evalBool("'HELLO WORLD' contains text 'HELLO' using uppercase")); + } + + // --- wildcards --- + + @Test + public void s2_5_wildcardsStar() throws Exception { + assertTrue(evalBool("'hello world' contains text 'hel.*' using wildcards")); + } + + @Test + public void s2_5_wildcardsDot() throws Exception { + // . 
matches exactly one character + assertTrue(evalBool("'hello world' contains text 'h.llo' using wildcards")); + assertFalse(evalBool("'hello world' contains text 'h.lo' using wildcards")); + } + + @Test + public void s2_5_wildcardsPlus() throws Exception { + // .+ matches one or more + assertTrue(evalBool("'hello world' contains text 'hel.+' using wildcards")); + assertFalse(evalBool("'hello world' contains text 'hello.+' using wildcards")); + } + + @Test + public void s2_5_wildcardsCaseInsensitive() throws Exception { + assertTrue(evalBool( + "'Hello World' contains text 'hel.*' using wildcards using case insensitive" + )); + } + + // --- multiple using clauses --- + + @Test + public void s2_5_multipleMatchOptions() throws Exception { + assertTrue(evalBool( + "'Hello World' contains text 'hel.*' using case insensitive using wildcards" + )); + } + + // ========================================================================= + // §2.6 FTTimes — occurrence constraints + // ========================================================================= + + @Test + public void s2_6_occursExactly() throws Exception { + // "the" appears 2 times in "the quick brown the fox" + assertTrue(evalBool( + "'the quick brown the fox' contains text 'the' occurs exactly 2 times" + )); + assertFalse(evalBool( + "'the quick brown the fox' contains text 'the' occurs exactly 3 times" + )); + } + + @Test + public void s2_6_occursAtLeast() throws Exception { + assertTrue(evalBool( + "'the quick the brown the fox' contains text 'the' occurs at least 2 times" + )); + } + + @Test + public void s2_6_occursAtMost() throws Exception { + assertTrue(evalBool( + "'the quick brown fox' contains text 'the' occurs at most 2 times" + )); + assertFalse(evalBool( + "'the quick the brown the fox' contains text 'the' occurs at most 1 times" + )); + } + + @Test + public void s2_6_occursFromTo() throws Exception { + assertTrue(evalBool( + "'the quick the fox' contains text 'the' occurs from 1 to 3 times" + )); + 
} + + // ========================================================================= + // §2.7 Parenthesized FTSelection + // ========================================================================= + + @Test + public void s2_7_parenthesizedSelection() throws Exception { + assertTrue(evalBool( + "'the quick brown fox' contains text ('quick' ftand 'fox')" + )); + } + + @Test + public void s2_7_parenthesizedWithPosFilter() throws Exception { + assertTrue(evalBool( + "'the quick brown fox' contains text " + + "('quick' ftand 'fox') using case insensitive ordered" + )); + } + + // ========================================================================= + // Practical use cases — XML document queries + // ========================================================================= + + @Test + public void useCase_filterBooks() throws Exception { + assertEquals(2, evalCount( + "let $books := (" + + " Learning XQuery," + + " Java Programming," + + " XQuery for Web Developers" + + ") return $books[title contains text 'XQuery']" + )); + } + + @Test + public void useCase_filterWithBoolean() throws Exception { + // 2 XQuery books + 2 Java books = 4 matches + assertEquals(4, evalCount( + "let $books := (" + + " Learning XQuery," + + " Java Programming," + + " XQuery for Web Developers," + + " Advanced Java" + + ") return $books[title contains text 'XQuery' ftor 'Java']" + )); + } + + @Test + public void useCase_nestedElements() throws Exception { + // "contains text" uses the string value of the element (including descendants) + // String value of

Hello

World

is "HelloWorld" + assertTrue(evalBool( + "

Hello

World

contains text 'HelloWorld'" + )); + } + + @Test + public void useCase_flworFilter() throws Exception { + assertEquals(2, evalCount( + "for $w in ('apple', 'banana', 'apricot', 'cherry') " + + "where $w contains text 'ap.*' using wildcards " + + "return $w" + )); + } + + @Test + public void useCase_conditionalFT() throws Exception { + assertEquals("found", evalString( + "if ('hello world' contains text 'hello') then 'found' else 'not found'" + )); + } + + @Test + public void useCase_countMatches() throws Exception { + // hello, help, hero, hope all start with 'h' — 4 matches + assertEquals("4", evalString( + "let $words := ('hello', 'world', 'help', 'hero', 'hope') " + + "return count(for $w in $words where $w contains text 'h.*' using wildcards return $w)" + )); + } + + // ========================================================================= + // Edge cases + // ========================================================================= + + @Test + public void edge_emptySource() throws Exception { + assertFalse(evalBool("'' contains text 'hello'")); + } + + @Test + public void edge_emptySearchEmptySource() throws Exception { + assertTrue(evalBool("'' contains text ''")); + } + + @Test + public void edge_numericSource() throws Exception { + assertTrue(evalBool("42 contains text '42'")); + } + + @Test + public void edge_sequenceSource() throws Exception { + // String value of a sequence of strings is their concatenation + assertTrue(evalBool("('hello', 'world') contains text 'hello'")); + } + + @Test + public void edge_multipleSpaces() throws Exception { + // Extra whitespace shouldn't affect word tokenization + assertTrue(evalBool("'hello world' contains text 'hello'")); + assertTrue(evalBool("'hello world' contains text 'world'")); + } + + @Test + public void edge_punctuation() throws Exception { + assertTrue(evalBool("'hello, world!' contains text 'hello'")); + assertTrue(evalBool("'hello, world!' 
contains text 'world'")); + } + + @Test + public void edge_unicodeText() throws Exception { + assertTrue(evalBool("'Stra\u00DFe und Gr\u00FC\u00DFe' contains text 'Stra\u00DFe'")); + } + + // ========================================================================= + // XQFTTS-style tests: predicates with step expressions and positional filters + // ========================================================================= + + @Test + public void xqftts_predicateWithDistance() throws Exception { + // Reproduces XQFTTS FTDistance-words1: step expression "para" in predicate with distance filter + final String query = + "let $doc := " + + "Book1" + + "The physical swift movement" + + "" + + "Book2" + + "No match here" + + " " + + "return $doc/book[para contains text ('physical' ftand 'swift') distance exactly 0 words]/title/string()"; + assertEquals("Book1", evalString(query)); + } + + @Test + public void xqftts_predicateWithWindow() throws Exception { + final String query = + "let $doc := " + + "Book1" + + "The physical swift movement" + + " " + + "return $doc/book[para contains text ('physical' ftand 'swift') window 3 words]/title/string()"; + assertEquals("Book1", evalString(query)); + } + + @Test + public void xqftts_predicateWithOrdered() throws Exception { + final String query = + "let $doc := " + + "Book1" + + "The physical swift movement" + + " " + + "return $doc/book[para contains text 'physical' ftand 'swift' ordered]/title/string()"; + assertEquals("Book1", evalString(query)); + } + + @Test + public void xqftts_predicateBasicFTAnd() throws Exception { + // This pattern already works (FTAnd-q1 in XQFTTS passes) + final String query = + "let $doc := " + + "Book1" + + "software ninja skills" + + " " + + "return $doc/book[para contains text 'software' ftand 'ninja']/title/string()"; + assertEquals("Book1", evalString(query)); + } +} diff --git a/exist-core/src/test/java/org/exist/xquery/ft/FTContainsTest.java 
b/exist-core/src/test/java/org/exist/xquery/ft/FTContainsTest.java new file mode 100644 index 00000000000..44dc72fd190 --- /dev/null +++ b/exist-core/src/test/java/org/exist/xquery/ft/FTContainsTest.java @@ -0,0 +1,490 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.exist.EXistException; +import org.exist.security.PermissionDeniedException; +import org.exist.storage.BrokerPool; +import org.exist.storage.DBBroker; +import org.exist.test.ExistEmbeddedServer; +import org.exist.xquery.XPathException; +import org.exist.xquery.XQuery; +import org.exist.xquery.value.Sequence; +import org.junit.ClassRule; +import org.junit.Test; + +import static org.junit.Assert.*; + +/** + * End-to-end integration tests for W3C XQFT 3.0 "contains text" expressions. + * These tests exercise the full pipeline: parse → tree-walk → evaluate. 
+ */ +public class FTContainsTest { + + @ClassRule + public static final ExistEmbeddedServer existEmbeddedServer = new ExistEmbeddedServer(true, true); + + private Sequence executeQuery(final String query) throws EXistException, PermissionDeniedException, XPathException { + final BrokerPool pool = existEmbeddedServer.getBrokerPool(); + final XQuery xquery = pool.getXQueryService(); + try (final DBBroker broker = pool.getBroker()) { + return xquery.execute(broker, query, null); + } + } + + private boolean evalBool(final String query) throws EXistException, PermissionDeniedException, XPathException { + final Sequence result = executeQuery(query); + assertNotNull(result); + assertEquals(1, result.getItemCount()); + return result.effectiveBooleanValue(); + } + + // === Basic matching === + + @Test + public void simpleWordMatch() throws Exception { + assertTrue(evalBool("'hello world' contains text 'hello'")); + } + + @Test + public void simpleWordNoMatch() throws Exception { + assertFalse(evalBool("'hello world' contains text 'goodbye'")); + } + + @Test + public void caseInsensitiveByDefault() throws Exception { + // XQFT 3.0 §4.1: default case mode is implementation-defined. + // Our implementation defaults to case-insensitive, matching XQFTTS expectations. 
+ assertTrue(evalBool("'Hello World' contains text 'hello'")); + } + + @Test + public void caseInsensitive() throws Exception { + assertTrue(evalBool("'Hello World' contains text 'hello' using case insensitive")); + } + + @Test + public void phraseMatch() throws Exception { + assertTrue(evalBool("'the quick brown fox' contains text 'quick brown' phrase")); + } + + @Test + public void phraseNoMatch() throws Exception { + assertFalse(evalBool("'the quick brown fox' contains text 'brown quick' phrase")); + } + + // === AnyallMode === + + @Test + public void anyWordMode() throws Exception { + assertTrue(evalBool("'hello world' contains text 'goodbye hello' any word")); + } + + @Test + public void allWordsMode() throws Exception { + assertTrue(evalBool("'hello world' contains text 'hello world' all words")); + } + + @Test + public void allWordsModeFailure() throws Exception { + assertFalse(evalBool("'hello world' contains text 'hello goodbye' all words")); + } + + // === Boolean operators === + + @Test + public void ftand() throws Exception { + assertTrue(evalBool("'hello world' contains text 'hello' ftand 'world'")); + } + + @Test + public void ftandFailure() throws Exception { + assertFalse(evalBool("'hello world' contains text 'hello' ftand 'goodbye'")); + } + + @Test + public void ftor() throws Exception { + assertTrue(evalBool("'hello world' contains text 'goodbye' ftor 'hello'")); + } + + @Test + public void ftorFailure() throws Exception { + assertFalse(evalBool("'hello world' contains text 'goodbye' ftor 'farewell'")); + } + + @Test + public void ftnot() throws Exception { + assertTrue(evalBool("'hello world' contains text ftnot 'goodbye'")); + } + + @Test + public void ftnotFailure() throws Exception { + assertFalse(evalBool("'hello world' contains text ftnot 'hello'")); + } + + @Test + public void mildNot() throws Exception { + // "hello" not in "world" — "hello" matches at pos 0, "world" matches at pos 1 + // They don't overlap, so hello's match survives + 
assertTrue(evalBool("'hello world' contains text 'hello' not in 'world'")); + } + + // === Positional filters === + + @Test + public void atStart() throws Exception { + assertTrue(evalBool("'hello world' contains text 'hello' at start")); + } + + @Test + public void atStartFailure() throws Exception { + assertFalse(evalBool("'hello world' contains text 'world' at start")); + } + + @Test + public void atEnd() throws Exception { + assertTrue(evalBool("'hello world' contains text 'world' at end")); + } + + @Test + public void atEndFailure() throws Exception { + assertFalse(evalBool("'hello world' contains text 'hello' at end")); + } + + @Test + public void entireContent() throws Exception { + assertTrue(evalBool("'hello world' contains text 'hello world' all words entire content")); + } + + @Test + public void entireContentFailure() throws Exception { + assertFalse(evalBool("'hello world foo' contains text 'hello world' all words entire content")); + } + + // === Window === + + @Test + public void windowMatch() throws Exception { + assertTrue(evalBool("'the quick brown fox' contains text 'quick' ftand 'fox' window 4 words")); + } + + @Test + public void windowTooSmall() throws Exception { + assertFalse(evalBool("'the quick brown fox' contains text 'quick' ftand 'fox' window 2 words")); + } + + // === Distance === + + @Test + public void distanceMatch() throws Exception { + // "quick" is at pos 1, "fox" at pos 3 → gap = 1 (brown is between) + assertTrue(evalBool("'the quick brown fox' contains text 'quick' ftand 'fox' distance at most 2 words")); + } + + @Test + public void distanceTooFar() throws Exception { + assertFalse(evalBool("'the quick brown fox' contains text 'quick' ftand 'fox' distance exactly 0 words")); + } + + // === Wildcards === + + @Test + public void wildcards() throws Exception { + assertTrue(evalBool("'hello world' contains text 'hel.*' using wildcards")); + } + + @Test + public void wildcardsNoMatch() throws Exception { + 
assertFalse(evalBool("'hello world' contains text 'xyz.*' using wildcards")); + } + + @Test + public void wildcardLiteralPunctuation() throws Exception { + // "task?" has no wildcard indicator (no unescaped "."), so punctuation + // is stripped from the search token: "task?" -> "task". Source token + // "task" matches. XQFTTS ftwildcard-q4 confirms this behavior. + assertTrue(evalBool("'complete the task? yes' contains text 'task?' using wildcards")); + } + + @Test + public void wildcardEscapedDot() throws Exception { + // "specialist\." — escaped dot matches literal period in raw token "specialist." + // The backslash escape triggers raw token matching. + assertTrue(evalBool("'the specialist. good' contains text 'specialist\\.' using wildcards")); + } + + @Test + public void wildcardDotThenEscapedQuestion() throws Exception { + // "nex.\?" — "." matches any char, "\?" is literal ? + // Raw token for "next?" is "next?" — pattern matches via escape-triggered raw fallback. + assertTrue(evalBool("'what is next? ok' contains text 'nex.\\?' 
using wildcards")); + } + + // === With XML nodes === + + @Test + public void xmlNodeMatch() throws Exception { + assertTrue(evalBool("Hello World contains text 'Hello'")); + } + + @Test + public void xmlFilterExpression() throws Exception { + final Sequence result = executeQuery( + "let $books := (XQuery in Action," + + " Java Programming," + + " XML and XQuery)" + + "return $books[title contains text 'XQuery']" + ); + assertEquals(2, result.getItemCount()); + } + + // === FLWOR with contains text === + + @Test + public void flworWithContainsText() throws Exception { + final Sequence result = executeQuery( + "for $w in ('hello', 'goodbye', 'world') " + + "where $w contains text 'hello' ftor 'world' " + + "return $w" + ); + assertEquals(2, result.getItemCount()); + } + + // === Case modes === + + @Test + public void lowercaseMode() throws Exception { + // "using lowercase" normalizes search to lowercase, then compares case-sensitively. + // Source "hello" matches search "hello" (both lowercase). + assertTrue(evalBool("'hello world' contains text 'Hello' using lowercase")); + } + + @Test + public void lowercaseModeNoMatch() throws Exception { + // XQFT §4.1: "using lowercase" normalizes BOTH source and search to lowercase. + // For no-match, the actual word must differ. + assertFalse(evalBool("'Hello World' contains text 'goodbye' using lowercase")); + } + + @Test + public void uppercaseMode() throws Exception { + // XQFT §4.1: "using uppercase" normalizes BOTH source and search to uppercase. + assertTrue(evalBool("'HELLO WORLD' contains text 'hello' using uppercase")); + } + + @Test + public void uppercaseModeNoMatch() throws Exception { + // XQFT §4.1: "using uppercase" normalizes BOTH source and search to uppercase. + // For no-match, the actual word must differ. 
+ assertFalse(evalBool("'Hello World' contains text 'GOODBYE' using uppercase")); + } + + // === FTTimes === + + @Test + public void timesAtMostZeroOccurrences() throws Exception { + // "goodbye" doesn't appear in "hello world", which satisfies "at most 1 times" + assertTrue(evalBool("'hello world' contains text 'goodbye' occurs at most 1 times")); + } + + @Test + public void timesAtMostOneOccurrence() throws Exception { + // "hello" appears exactly 1 time, which satisfies "at most 1 times" + assertTrue(evalBool("'hello world' contains text 'hello' occurs at most 1 times")); + } + + @Test + public void timesAtMostExceeded() throws Exception { + // "hello" appears 2 times, which does NOT satisfy "at most 1 times" + assertFalse(evalBool("'hello hello world' contains text 'hello' occurs at most 1 times")); + } + + // === FTOr with empty sequence === + + @Test + public void ftorEmptySequence() throws Exception { + // {()} (empty sequence) produces no match; only "hello" side of ftor matches + assertTrue(evalBool("'hello world' contains text {()} ftor 'hello'")); + } + + @Test + public void ftorEmptySequenceNoMatch() throws Exception { + // {()} produces no match, and 'goodbye' doesn't match — result is false + assertFalse(evalBool("'hello world' contains text {()} ftor 'goodbye'")); + } + + // === XPTY0004 for non-string FTWords values === + + @Test(expected = XPathException.class) + public void ftWordsIntegerRaisesTypeError() throws Exception { + evalBool("'hello world' contains text {42} ftor 'hello'"); + } + + // === Stemming === + + @Test + public void stemmingMatch() throws Exception { + // "pictures" stems to same root as "picture" + assertTrue(evalBool("'hand-drawn pictures of pages' contains text 'picture' using stemming")); + } + + @Test + public void stemmingNoMatch() throws Exception { + // "tasks" stems to "task", but "picture" stems to "pictur" — no match + assertFalse(evalBool("'tasks and training' contains text 'picture' using stemming")); + } + + @Test 
+ public void stemmingVerbForms() throws Exception { + // "performing" and "performed" should share same stem + assertTrue(evalBool("'performing specified tasks' contains text 'performed' using stemming")); + } + + // === declare ft-option === + + @Test + public void declareFtOption() throws Exception { + assertTrue(evalBool( + "declare ft-option using case sensitive;\n" + + "'Hello World' contains text 'Hello'" + )); + } + + @Test + public void declareFtOptionCaseSensitiveRejects() throws Exception { + // With case sensitive declared, 'hello' (lowercase) should NOT match 'Hello' + assertFalse(evalBool( + "declare ft-option using case sensitive;\n" + + "'Hello World' contains text 'hello'" + )); + } + + // === FTST0019: conflicting match options === + + @Test(expected = XPathException.class) + public void conflictingCaseOptionsInProlog() throws Exception { + // FTST0019: conflicting case options in declare ft-option + evalBool( + "declare ft-option using case sensitive using case insensitive;\n" + + "'Hello World' contains text 'Hello'" + ); + } + + // === entire content strictness === + + @Test + public void entireContentRejectsPartialMatch() throws Exception { + // "entire content" must cover ALL token positions, not just first and last + assertFalse(evalBool( + "'one two three four five' contains text 'one' ftand 'five' entire content" + )); + } + + // === FTST0001: mild not operand restrictions === + + @Test(expected = XPathException.class) + public void mildNotRejectsFtnotLeft() throws Exception { + // ftnot in left operand of "not in" must raise FTST0001 + evalBool("'hello world' contains text ('hello' ftand ftnot 'x') not in 'y'"); + } + + @Test(expected = XPathException.class) + public void mildNotRejectsFtnotRight() throws Exception { + // ftnot in right operand of "not in" must raise FTST0001 + evalBool("'hello world' contains text 'hello' not in ('world' ftand ftnot 'x')"); + } + + @Test(expected = XPathException.class) + public void 
mildNotRejectsOccurs() throws Exception { + // "occurs" in operand of "not in" must raise FTST0001 + evalBool("'hello world' contains text 'hello' occurs exactly 1 times not in 'world'"); + } + + // === Positional filter interaction === + + @Test + public void orderedAfterWindowInParens() throws Exception { + // After window collapses groups, ordered sees a single unit → vacuously true + assertTrue(evalBool( + "'one two three' contains text ('three' ftand 'one' window 3 words) ordered" + )); + } + + // === Complex distance/window interactions === + + @Test + public void distanceWithWindow() throws Exception { + // Window collapses inner group to positions {2,3}; 'swift' is at position 6. + // Distance between last of {2,3} (=3) and first of {6} (=6): 6-3-1 = 2 words gap. + // "distance exactly 2 words" matches, so the expression is true. + assertTrue("distance exactly 2 between window group and swift", + evalBool("'They prefer usability studies to the swift application' contains text " + + "('usability' ftand 'studies' window 2 words) ftand 'swift' distance exactly 2 words")); + // With distance exactly 1, it should reject (actual gap is 2) + assertFalse("distance exactly 1 should reject (actual gap is 2)", + evalBool("'They prefer usability studies to the swift application' contains text " + + "('usability' ftand 'studies' window 2 words) ftand 'swift' distance exactly 1 words")); + } + + // === Dynamic expressions in positional filters === + + @Test + public void dynamicWindowSize() throws Exception { + // Window size computed from a dynamic expression using context + final Sequence result = executeQuery( + "let $items := the quick brown fox jumps" + + "return $items/item[. 
contains text 'quick' ftand 'fox' window (2 + 2) words]" + ); + assertEquals(1, result.getItemCount()); + } + + // === contains text with comparison === + + // === Score variables === + + @Test + public void forScoreVariable() throws Exception { + // for $t score $s in expr — $s should be bound to a double in [0, 1] + assertTrue(evalBool( + "for $w score $s in ('hello', 'world') " + + "where $w contains text 'hello' " + + "return ($s ge 0.0) and ($s le 1.0)" + )); + } + + @Test + public void letScoreVariable() throws Exception { + // let score $s := expr — $s should be a double in [0, 1] + assertTrue(evalBool( + "let score $s := 'hello' " + + "return ($s ge 0.0) and ($s le 1.0)" + )); + } + + @Test + public void containsTextEqComparison() throws Exception { + // "contains text" has higher precedence than "eq" + assertFalse(evalBool( + "'Hello World' contains text 'Hello' eq fn:false()" + )); + } +} diff --git a/exist-core/src/test/java/org/exist/xquery/ft/FTEvaluatorTest.java b/exist-core/src/test/java/org/exist/xquery/ft/FTEvaluatorTest.java new file mode 100644 index 00000000000..d2d9a36648d --- /dev/null +++ b/exist-core/src/test/java/org/exist/xquery/ft/FTEvaluatorTest.java @@ -0,0 +1,121 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import org.junit.Test; + +import java.util.Arrays; +import java.util.List; + +import static org.junit.Assert.*; + +/** + * Unit tests for the FTEvaluator sequential full-text matching engine. + */ +public class FTEvaluatorTest { + + @Test + public void tokenizeSimple() { + final List tokens = FTEvaluator.tokenize("hello world"); + assertEquals(Arrays.asList("hello", "world"), tokens); + } + + @Test + public void tokenizePunctuation() { + final List tokens = FTEvaluator.tokenize("Hello, World! How's it going?"); + assertEquals(Arrays.asList("Hello", "World", "How's", "it", "going"), tokens); + } + + @Test + public void tokenizeEmpty() { + assertTrue(FTEvaluator.tokenize("").isEmpty()); + assertTrue(FTEvaluator.tokenize(null).isEmpty()); + assertTrue(FTEvaluator.tokenize(" ").isEmpty()); + } + + @Test + public void tokenizeNumbers() { + final List tokens = FTEvaluator.tokenize("abc 123 def"); + assertEquals(Arrays.asList("abc", "123", "def"), tokens); + } + + @Test + public void wildcardToRegexSimple() { + // . 
matches any single char + assertTrue("hXllo".matches(FTEvaluator.wildcardToRegex("h.llo", false))); + assertFalse("hllo".matches(FTEvaluator.wildcardToRegex("h.llo", false))); + } + + @Test + public void wildcardToRegexStar() { + // .* matches zero or more + assertTrue("hello".matches(FTEvaluator.wildcardToRegex("hel.*", false))); + assertTrue("hel".matches(FTEvaluator.wildcardToRegex("hel.*", false))); + } + + @Test + public void wildcardToRegexPlus() { + // .+ matches one or more + assertTrue("hello".matches(FTEvaluator.wildcardToRegex("hel.+", false))); + assertFalse("hel".matches(FTEvaluator.wildcardToRegex("hel.+", false))); + } + + @Test + public void wildcardToRegexCaseInsensitive() { + assertTrue("HELLO".matches(FTEvaluator.wildcardToRegex("hello", true))); + } + + @Test + public void mergeOptionsLocalOverrides() { + final FTMatchOptions inherited = new FTMatchOptions(); + inherited.setCaseMode(FTMatchOptions.CaseMode.SENSITIVE); + inherited.setLanguage("en"); + + final FTMatchOptions local = new FTMatchOptions(); + local.setCaseMode(FTMatchOptions.CaseMode.INSENSITIVE); + + final FTMatchOptions merged = FTEvaluator.mergeOptions(inherited, local); + assertEquals(FTMatchOptions.CaseMode.INSENSITIVE, merged.getCaseMode()); + assertEquals("en", merged.getLanguage()); // inherited + } + + @Test + public void mergeOptionsNullLocal() { + final FTMatchOptions inherited = new FTMatchOptions(); + inherited.setCaseMode(FTMatchOptions.CaseMode.SENSITIVE); + assertSame(inherited, FTEvaluator.mergeOptions(inherited, null)); + } + + @Test + public void wildcardMatchInEvaluator() { + final FTEvaluator evaluator = new FTEvaluator("hello world"); + final String regex = FTEvaluator.wildcardToRegex("hel.*", false); + assertTrue("hel.* should match hello", "hello".matches(regex)); + } + + @Test + public void mergeOptionsNullInherited() { + final FTMatchOptions local = new FTMatchOptions(); + local.setCaseMode(FTMatchOptions.CaseMode.INSENSITIVE); + assertSame(local, 
FTEvaluator.mergeOptions(null, local)); + } +} diff --git a/exist-core/src/test/java/org/exist/xquery/ft/FTParserTest.java b/exist-core/src/test/java/org/exist/xquery/ft/FTParserTest.java new file mode 100644 index 00000000000..9a65cddbf90 --- /dev/null +++ b/exist-core/src/test/java/org/exist/xquery/ft/FTParserTest.java @@ -0,0 +1,251 @@ +/* + * eXist-db Open Source Native XML Database + * Copyright (C) 2001 The eXist-db Authors + * + * info@exist-db.org + * http://www.exist-db.org + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +package org.exist.xquery.ft; + +import antlr.RecognitionException; +import antlr.TokenStreamException; +import antlr.collections.AST; +import org.exist.xquery.XPathException; +import org.exist.xquery.parser.XQueryLexer; +import org.exist.xquery.parser.XQueryParser; +import org.exist.xquery.parser.XQueryTokenTypes; +import org.junit.Test; + +import java.io.StringReader; + +import static org.junit.Assert.*; + +/** + * Tests that the XQFT grammar extensions parse correctly into AST nodes. + * These tests verify Phase 1 (parser) without requiring a running database. 
+ */
+public class FTParserTest {
+
+    /**
+     * Lexes and parses the given XQuery source and returns the resulting AST.
+     *
+     * <p>The test fails if the parser reports recorded errors: the generated
+     * parser can collect recognition errors instead of throwing them, in which
+     * case the returned AST may be empty or partial and negative assertions
+     * (such as "token X is absent") would pass vacuously.</p>
+     *
+     * @param xquery the XQuery source text to parse
+     * @return the root of the AST produced by the parser
+     * @throws RecognitionException if the parser rejects the input
+     * @throws TokenStreamException if the lexer cannot tokenize the input
+     * @throws XPathException if the parser raises a static error
+     */
+    private AST parse(final String xquery) throws RecognitionException, TokenStreamException, XPathException {
+        final XQueryLexer lexer = new XQueryLexer(new StringReader(xquery));
+        final XQueryParser parser = new XQueryParser(lexer);
+        parser.xpath();
+        if (parser.foundErrors()) {
+            fail("Failed to parse '" + xquery + "': " + parser.getErrorMessage());
+        }
+        return parser.getAST();
+    }
+
+    /**
+     * Searches the AST in pre-order (node, then its children, then its following
+     * siblings) for the first node of the given token type.
+     *
+     * @param root the node to start from; may be {@code null}
+     * @param tokenType the token type to look for, see {@link XQueryTokenTypes}
+     * @return the first matching node, or {@code null} if there is none
+     */
+    private AST findToken(final AST root, final int tokenType) {
+        // Walk the sibling chain iteratively; only descend recursively into children.
+        for (AST node = root; node != null; node = node.getNextSibling()) {
+            if (node.getType() == tokenType) {
+                return node;
+            }
+            final AST found = findToken(node.getFirstChild(), tokenType);
+            if (found != null) {
+                return found;
+            }
+        }
+        return null;
+    }
+
+    /** A bare "contains text" expression yields an FT_CONTAINS node. */
+    @Test
+    public void simpleContainsText() throws Exception {
+        final AST ast = parse("$x contains text 'hello'");
+        assertNotNull("AST should not be null", ast);
+        final AST ftContains = findToken(ast, XQueryTokenTypes.FT_CONTAINS);
+        assertNotNull("Should find FT_CONTAINS token", ftContains);
+    }
+
+    /** "ftand" yields an FT_AND node below FT_CONTAINS. */
+    @Test
+    public void ftAnd() throws Exception {
+        final AST ast = parse("$x contains text 'hello' ftand 'world'");
+        assertNotNull(ast);
+        assertNotNull("Should find FT_CONTAINS", findToken(ast, XQueryTokenTypes.FT_CONTAINS));
+        assertNotNull("Should find FT_AND", findToken(ast, XQueryTokenTypes.FT_AND));
+    }
+
+    /** "ftor" yields an FT_OR node. */
+    @Test
+    public void ftOr() throws Exception {
+        final AST ast = parse("$x contains text 'hello' ftor 'world'");
+        assertNotNull(ast);
+        assertNotNull("Should find FT_OR", findToken(ast, XQueryTokenTypes.FT_OR));
+    }
+
+    /** "ftnot" yields an FT_UNARY_NOT node. */
+    @Test
+    public void ftNot() throws Exception {
+        final AST ast = parse("$x contains text ftnot 'hello'");
+        assertNotNull(ast);
+        assertNotNull("Should find FT_UNARY_NOT", findToken(ast, XQueryTokenTypes.FT_UNARY_NOT));
+    }
+
+    /** "not in" yields an FT_MILD_NOT node. */
+    @Test
+    public void ftMildNot() throws Exception {
+        final AST ast = parse("$x contains text 'hello' not in 'world'");
+        assertNotNull(ast);
+        assertNotNull("Should find FT_MILD_NOT", findToken(ast, XQueryTokenTypes.FT_MILD_NOT));
+    }
+
+    /** "all words" is recorded as the text of an FT_ANYALL_OPTION node. */
+    @Test
+    public void allWords() throws Exception {
+        final AST ast = parse("$x contains text 'hello world' all words");
+        assertNotNull(ast);
+        final AST anyall = findToken(ast, XQueryTokenTypes.FT_ANYALL_OPTION);
+        assertNotNull("Should find FT_ANYALL_OPTION", anyall);
+        assertEquals("all words", anyall.getText());
+    }
+
+    /** "phrase" is recorded as the text of an FT_ANYALL_OPTION node. */
+    @Test
+    public void phrase() throws Exception {
+        final AST ast = parse("$x contains text 'hello world' phrase");
+        assertNotNull(ast);
+        final AST anyall = findToken(ast, XQueryTokenTypes.FT_ANYALL_OPTION);
+        assertNotNull(anyall);
+        assertEquals("phrase", anyall.getText());
+    }
+
+    /** The "ordered" positional filter yields an FT_ORDER node. */
+    @Test
+    public void ordered() throws Exception {
+        final AST ast = parse("$x contains text 'hello' ftand 'world' ordered");
+        assertNotNull(ast);
+        assertNotNull("Should find FT_ORDER", findToken(ast, XQueryTokenTypes.FT_ORDER));
+    }
+
+    /** The "window N words" positional filter yields an FT_WINDOW node. */
+    @Test
+    public void window() throws Exception {
+        final AST ast = parse("$x contains text 'hello' ftand 'world' window 5 words");
+        assertNotNull(ast);
+        assertNotNull("Should find FT_WINDOW", findToken(ast, XQueryTokenTypes.FT_WINDOW));
+    }
+
+    /** The "distance at most N words" positional filter yields an FT_DISTANCE node. */
+    @Test
+    public void distance() throws Exception {
+        final AST ast = parse("$x contains text 'hello' ftand 'world' distance at most 3 words");
+        assertNotNull(ast);
+        assertNotNull("Should find FT_DISTANCE", findToken(ast, XQueryTokenTypes.FT_DISTANCE));
+    }
+
+    /** "same sentence" is recorded as the text of an FT_SCOPE node. */
+    @Test
+    public void scope() throws Exception {
+        final AST ast = parse("$x contains text 'hello' ftand 'world' same sentence");
+        assertNotNull(ast);
+        final AST scope = findToken(ast, XQueryTokenTypes.FT_SCOPE);
+        assertNotNull("Should find FT_SCOPE", scope);
+        assertEquals("same sentence", scope.getText());
+    }
+
+    /** "at start" is recorded as the text of an FT_CONTENT node. */
+    @Test
+    public void contentAtStart() throws Exception {
+        final AST ast = parse("$x contains text 'hello' at start");
+        assertNotNull(ast);
+        final AST content = findToken(ast, XQueryTokenTypes.FT_CONTENT);
+        assertNotNull("Should find FT_CONTENT", content);
+        assertEquals("at start", content.getText());
+    }
+
+    /** "at end" is recorded as the text of an FT_CONTENT node. */
+    @Test
+    public void contentAtEnd() throws Exception {
+        final AST ast = parse("$x contains text 'hello' at end");
+        assertNotNull(ast);
+        final AST content = findToken(ast, XQueryTokenTypes.FT_CONTENT);
+        assertNotNull(content);
+        assertEquals("at end", content.getText());
+    }
+
+    /** "entire content" is recorded as the text of an FT_CONTENT node. */
+    @Test
+    public void entireContent() throws Exception {
+        final AST ast = parse("$x contains text 'hello' entire content");
+        assertNotNull(ast);
+        final AST content = findToken(ast, XQueryTokenTypes.FT_CONTENT);
+        assertNotNull(content);
+        assertEquals("entire content", content.getText());
+    }
+
+    /** "using case insensitive" yields an FT_CASE_OPTION node with text "insensitive". */
+    @Test
+    public void caseOption() throws Exception {
+        final AST ast = parse("$x contains text 'hello' using case insensitive");
+        assertNotNull(ast);
+        final AST caseOpt = findToken(ast, XQueryTokenTypes.FT_CASE_OPTION);
+        assertNotNull("Should find FT_CASE_OPTION", caseOpt);
+        assertEquals("insensitive", caseOpt.getText());
+    }
+
+    /** "using stemming" yields an FT_STEM_OPTION node with text "stemming". */
+    @Test
+    public void stemmingOption() throws Exception {
+        final AST ast = parse("$x contains text 'hello' using stemming");
+        assertNotNull(ast);
+        final AST stemOpt = findToken(ast, XQueryTokenTypes.FT_STEM_OPTION);
+        assertNotNull("Should find FT_STEM_OPTION", stemOpt);
+        assertEquals("stemming", stemOpt.getText());
+    }
+
+    /** "using language 'en'" yields an FT_LANGUAGE_OPTION node. */
+    @Test
+    public void languageOption() throws Exception {
+        final AST ast = parse("$x contains text 'hello' using language 'en'");
+        assertNotNull(ast);
+        assertNotNull("Should find FT_LANGUAGE_OPTION", findToken(ast, XQueryTokenTypes.FT_LANGUAGE_OPTION));
+    }
+
+    /** "using wildcards" yields an FT_WILDCARD_OPTION node with text "wildcards". */
+    @Test
+    public void wildcardOption() throws Exception {
+        final AST ast = parse("$x contains text 'hel.*' using wildcards");
+        assertNotNull(ast);
+        final AST wcOpt = findToken(ast, XQueryTokenTypes.FT_WILDCARD_OPTION);
+        assertNotNull("Should find FT_WILDCARD_OPTION", wcOpt);
+        assertEquals("wildcards", wcOpt.getText());
+    }
+
+    /** A "weight { ... }" clause yields an FT_WEIGHT node. */
+    @Test
+    public void weightExpr() throws Exception {
+        final AST ast = parse("$x contains text 'hello' weight { 2.0 }");
+        assertNotNull(ast);
+        assertNotNull("Should find FT_WEIGHT", findToken(ast, XQueryTokenTypes.FT_WEIGHT));
+    }
+
+    /** An "occurs ... times" clause yields an FT_TIMES node. */
+    @Test
+    public void occurs() throws Exception {
+        final AST ast = parse("$x contains text 'hello' occurs at least 2 times");
+        assertNotNull(ast);
+        assertNotNull("Should find FT_TIMES", findToken(ast, XQueryTokenTypes.FT_TIMES));
+    }
+
+    @Test
+    public void complexExpression() throws Exception {
+        // Mix of operators, positional filters, and match options
+        // Match options go on ftPrimaryWithOptions (before pos filters)
+        // Pos filters go on ftSelection (after the ftOr chain)
+        final AST ast = parse(
+            "$x contains text ('hello' ftand 'world' all words) " +
+            "using case insensitive using stemming " +
+            "ordered window 10 words"
+        );
+        assertNotNull(ast);
+        assertNotNull("Should find FT_CONTAINS", findToken(ast, XQueryTokenTypes.FT_CONTAINS));
+        assertNotNull("Should find FT_AND", findToken(ast, XQueryTokenTypes.FT_AND));
+        assertNotNull("Should find FT_ORDER", findToken(ast, XQueryTokenTypes.FT_ORDER));
+        assertNotNull("Should find FT_WINDOW", findToken(ast, XQueryTokenTypes.FT_WINDOW));
+        assertNotNull("Should find FT_CASE_OPTION", findToken(ast, XQueryTokenTypes.FT_CASE_OPTION));
+        assertNotNull("Should find FT_STEM_OPTION", findToken(ast, XQueryTokenTypes.FT_STEM_OPTION));
+    }
+
+    @Test
+    public void containsFunctionNotAffected() throws Exception {
+        // Ensure fn:contains() still parses as a function call, not as FT
+        final AST ast = parse("contains('hello world', 'hello')");
+        assertNotNull(ast);
+        assertNull("Should NOT find FT_CONTAINS", findToken(ast, XQueryTokenTypes.FT_CONTAINS));
+    }
+
+    /** A "without content" clause yields an FT_IGNORE_OPTION node. */
+    @Test
+    public void withoutContent() throws Exception {
+        final AST ast = parse("$x contains text 'hello' without content $footnotes");
+        assertNotNull(ast);
+        assertNotNull("Should find FT_IGNORE_OPTION", findToken(ast, XQueryTokenTypes.FT_IGNORE_OPTION));
+    }
+}