From a29ce8cad90cf34641cea999f255b8056c138f73 Mon Sep 17 00:00:00 2001 From: Mohamed Ashraf Date: Tue, 21 Apr 2026 22:40:16 +0000 Subject: [PATCH] perf(griffin): dispatch fetchNext comment checks on token length SqlUtil.fetchNext runs on every token the SQL lexer produces during query compilation. Profile samples show it at ~23% of parse CPU for ExpressionParserBenchmark, with a hot loop that does five sequential Chars.equals calls ("--", "/*", "/*+", "*/", plus WHITESPACE.excludes) per token. For the overwhelmingly common case -- identifiers, numbers, and single-char operators -- every one of those comparisons first calls cs.length() on the GenericLexer's floating sequence, and the first four always miss. Read cs.length() once per iteration and dispatch the four comment-marker checks via a switch on the length. Tokens whose length is not 2 or 3 skip those checks entirely; length-2 tokens run a single 2-char compare for each marker. Identical treatment applied to fetchNextHintToken. Benchmark: ExpressionParserBenchmark, JMH @Fork=2, 5 warmup + 10 measurement iterations, JDK 21. All ten @Param expressions improve with non-overlapping 99.9% confidence intervals. a + b 956.7 -> 872.8 (-8.8%) a + b * c / 2 1392.8 -> 1186.4 (-14.8%) a + b * c(x, y) / 2 1772.8 -> 1486.9 (-16.1%) a = 1 and b = 2 or c=3 1807.0 -> 1587.8 (-12.1%) case-when-else 2832.8 -> 2336.9 (-17.5%) a in (1,2,3,4,5) 1509.6 -> 1302.1 (-13.7%) cast(a) + cast(b) 2264.4 -> 1870.2 (-17.4%) between/like 1819.0 -> 1647.0 (-9.5%) coalesce(a,b,c,d,0) 1539.0 -> 1406.1 (-8.6%) sum over window 3210.8 -> 2618.7 (-18.4%) Geometric mean improvement: ~13.6%. Behaviour is bit-identical: length-2 sequences "--", "/*", "*/" and length-3 "/*+" trigger the same branches as before. All 1394 griffin parser tests pass, including the line/block/hint comment coverage in ExpressionParserTest, SqlParserTest, and SqlHintsTest. --- .../main/java/io/questdb/griffin/SqlUtil.java | 153 ++++++++++-------- 1 file changed, 83 insertions(+), 70 deletions(-) diff --git a/core/src/main/java/io/questdb/griffin/SqlUtil.java b/core/src/main/java/io/questdb/griffin/SqlUtil.java index a96fe5caea49..df91620ffd02 100644 --- a/core/src/main/java/io/questdb/griffin/SqlUtil.java +++ b/core/src/main/java/io/questdb/griffin/SqlUtil.java @@ -456,53 +456,60 @@ public static CharSequence fetchNext(GenericLexer lexer, boolean includeHints) t boolean lineComment = false; while (lexer.hasNext()) { CharSequence cs = lexer.next(); + final int csLen = cs.length(); if (lineComment) { - if (Chars.equals(cs, '\n') || Chars.equals(cs, '\r')) { - lineComment = false; - } else { - // Check if token contains a newline (can happen with unbalanced quotes in comments) - for (int i = 0, n = cs.length(); i < n; i++) { - char c = cs.charAt(i); - if (c == '\n' || c == '\r') { - // Found newline inside token - reposition lexer to after newline - int newPos = lexer.lastTokenPosition() + i + 1; - // Skip \r\n sequence - if (c == '\r' && i + 1 < n && cs.charAt(i + 1) == '\n') { - newPos++; - } - lexer.backTo(newPos, null); - lineComment = false; - break; + if (csLen == 1) { + final char c0 = cs.charAt(0); + if (c0 == '\n' || c0 == '\r') { + lineComment = false; + continue; + } + } + // Check if token contains a newline (can happen with unbalanced quotes in comments) + for (int i = 0; i < csLen; i++) { + char c = cs.charAt(i); + if (c == '\n' || c == '\r') { + // Found newline inside token - reposition lexer to after newline + int newPos = lexer.lastTokenPosition() + i + 1; + // Skip \r\n sequence + if (c == '\r' && i + 1 < csLen && cs.charAt(i + 1) == '\n') { + newPos++; } + lexer.backTo(newPos, null); + lineComment = false; + break; } } continue; } - if (Chars.equals("--", cs)) { - lineComment = true; - continue; - } - - if (Chars.equals("/*", cs)) { - blockCount++; - continue; - } - - if (Chars.equals("/*+", cs) && (!includeHints || blockCount > 0)) { + // Dispatch on token length: "--", "/*", "*/" are length 2; "/*+" is length 3. + // All other lengths short-circuit the comment-marker checks entirely. + if (csLen == 2) { + final char c0 = cs.charAt(0); + final char c1 = cs.charAt(1); + if (c0 == '-' && c1 == '-') { + lineComment = true; + continue; + } + if (c0 == '/' && c1 == '*') { + blockCount++; + continue; + } + if (c0 == '*' && c1 == '/' && blockCount > 0) { + blockCount--; + continue; + } + } else if (csLen == 3 && (!includeHints || blockCount > 0) + && cs.charAt(0) == '/' && cs.charAt(1) == '*' && cs.charAt(2) == '+') { blockCount++; continue; } - if (Chars.equals("*/", cs) && blockCount > 0) { - blockCount--; - continue; - } - if (blockCount == 0 && GenericLexer.WHITESPACE.excludes(cs)) { // unclosed quote check - if (cs.length() == 1 && cs.charAt(0) == '"') { + if (csLen == 1 && cs.charAt(0) == '"') { throw SqlException.$(lexer.lastTokenPosition(), "unclosed quotation mark"); } return cs; @@ -539,58 +546,64 @@ public static CharSequence fetchNextHintToken(GenericLexer lexer) { boolean inError = false; while (lexer.hasNext()) { CharSequence cs = lexer.next(); + final int csLen = cs.length(); if (lineComment) { - if (Chars.equals(cs, '\n') || Chars.equals(cs, '\r')) { - lineComment = false; - } else { - // Check if token contains a newline (can happen with unbalanced quotes in comments) - for (int i = 0, n = cs.length(); i < n; i++) { - char c = cs.charAt(i); - if (c == '\n' || c == '\r') { - // Found newline inside token - reposition lexer to after newline - int newPos = lexer.lastTokenPosition() + i + 1; - // Skip \r\n sequence - if (c == '\r' && i + 1 < n && cs.charAt(i + 1) == '\n') { - newPos++; - } - lexer.backTo(newPos, null); - lineComment = false; - break; + if (csLen == 1) { + final char c0 = cs.charAt(0); + if (c0 == '\n' || c0 == '\r') { + lineComment = false; + continue; + } + } + // Check if token contains a newline (can happen with unbalanced quotes in comments) + for (int i = 0; i < csLen; i++) { + char c = cs.charAt(i); + if (c == '\n' || c == '\r') { + // Found newline inside token - reposition lexer to after newline + int newPos = lexer.lastTokenPosition() + i + 1; + // Skip \r\n sequence + if (c == '\r' && i + 1 < csLen && cs.charAt(i + 1) == '\n') { + newPos++; } + lexer.backTo(newPos, null); + lineComment = false; + break; } } continue; } - if (Chars.equals("--", cs)) { - lineComment = true; - continue; - } - - if (Chars.equals("/*", cs)) { - blockCount++; - continue; - } - - if (Chars.equals("/*+", cs)) { + // Dispatch on token length: "--", "/*", "*/" are length 2; "/*+" is length 3. + // All other lengths short-circuit the comment-marker checks entirely. + if (csLen == 2) { + final char c0 = cs.charAt(0); + final char c1 = cs.charAt(1); + if (c0 == '-' && c1 == '-') { + lineComment = true; + continue; + } + if (c0 == '/' && c1 == '*') { + blockCount++; + continue; + } + // end of hints or a nested comment + if (c0 == '*' && c1 == '/') { + if (blockCount > 0) { + blockCount--; + continue; + } + return null; + } + } else if (csLen == 3 && cs.charAt(0) == '/' && cs.charAt(1) == '*' && cs.charAt(2) == '+') { // nested hints are treated as regular comments blockCount++; continue; } - // end of hints or a nested comment - if (Chars.equals("*/", cs)) { - if (blockCount > 0) { - blockCount--; - continue; - } - return null; - } - if (!inError && blockCount == 0 && GenericLexer.WHITESPACE.excludes(cs)) { // unclosed quote check - if (cs.length() == 1 && cs.charAt(0) == '"') { + if (csLen == 1 && cs.charAt(0) == '"') { inError = true; } else { return cs;