From 3b37e338a92bbb3272dbd6d78470f752521cb1b5 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Tue, 21 Apr 2026 01:39:01 -0700 Subject: [PATCH 01/48] func/regexp_extract: new scalar func and test cases --- include/libs/function/functionMgt.h | 1 + include/libs/scalar/scalar.h | 1 + source/libs/function/src/builtins.c | 84 ++++- source/libs/scalar/src/sclfunc.c | 169 ++++++++++ .../01-Scalar/test_fun_sca_regexp_extract.py | 309 ++++++++++++++++++ 5 files changed, 562 insertions(+), 2 deletions(-) create mode 100644 test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py diff --git a/include/libs/function/functionMgt.h b/include/libs/function/functionMgt.h index 72785beb4598..81a54475b8a9 100644 --- a/include/libs/function/functionMgt.h +++ b/include/libs/function/functionMgt.h @@ -141,6 +141,7 @@ typedef enum EFunctionType { FUNCTION_TYPE_AES_DECRYPT, FUNCTION_TYPE_SM4_ENCRYPT, FUNCTION_TYPE_SM4_DECRYPT, + FUNCTION_TYPE_REGEXP_EXTRACT, // conversion function FUNCTION_TYPE_CAST = 2000, diff --git a/include/libs/scalar/scalar.h b/include/libs/scalar/scalar.h index bb3cee9aeb05..8943e82e3de4 100644 --- a/include/libs/scalar/scalar.h +++ b/include/libs/scalar/scalar.h @@ -128,6 +128,7 @@ int32_t crc32Function(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOut int32_t findInSetFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput); int32_t likeInSetFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput); int32_t regexpInSetFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput); +int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput); int32_t generateTotpSecretFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput); int32_t generateTotpCodeFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput); diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index 58897d99bf67..1a748f573655 100644 --- 
a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1102,13 +1102,58 @@ static int32_t translateRand(SFunctionNode* pFunc, char* pErrBuf, int32_t len) { return TSDB_CODE_SUCCESS; } -// return type is same as first input parameter's type -static int32_t translateOutFirstIn(SFunctionNode* pFunc, char* pErrBuf, int32_t len) { +static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32_t len) { FUNC_ERR_RET(validateParam(pFunc, pErrBuf, len)); + int32_t numOfParams = LIST_LENGTH(pFunc->pParameterList); + + // param[1]: pattern must be a constant VALUE node + SNode* pPatNode = nodesListGetNode(pFunc->pParameterList, 1); + if (QUERY_NODE_VALUE != nodeType(pPatNode)) { + return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: pattern must be a constant"); + } + + // Validate the regex pattern compiles as POSIX ERE + SValueNode* pPatVal = (SValueNode*)pPatNode; + if (pPatVal->literal != NULL) { + regex_t re; + int ret = regcomp(&re, pPatVal->literal, REG_EXTENDED); + if (ret != 0) { + char msgbuf[256] = {0}; + (void)regerror(ret, &re, msgbuf, sizeof(msgbuf)); + regfree(&re); + return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR, + "Invalid regex pattern for regexp_extract: %s", msgbuf); + } + regfree(&re); + } + + // param[2]: group_idx (optional) must be a non-negative integer constant + if (numOfParams == 3) { + SNode* pIdxNode = nodesListGetNode(pFunc->pParameterList, 2); + if (QUERY_NODE_VALUE != nodeType(pIdxNode)) { + return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: group_idx must be a constant integer"); + } + SValueNode* pIdxVal = (SValueNode*)pIdxNode; + if (!IS_INTEGER_TYPE(pIdxVal->node.resType.type)) { + return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: group_idx must be an integer"); + } + int64_t groupIdx = taosStr2Int64(pIdxVal->literal, NULL, 10); + if (groupIdx < 0 || groupIdx > 512) { + return invaildFuncParaValueErrMsg(pErrBuf, len, 
"regexp_extract: group_idx must be between 0 and 512"); + } + } + + // Return type matches str (param[0]): same VARCHAR/NCHAR type and byte width pFunc->node.resType = *getSDataTypeFromNode(nodesListGetNode(pFunc->pParameterList, 0)); return TSDB_CODE_SUCCESS; } +// return type is same as first input parameter's type +static int32_t translateOutFirstIn(SFunctionNode* pFunc, char* pErrBuf, int32_t len) { + FUNC_ERR_RET(validateParam(pFunc, pErrBuf, len)); pFunc->node.resType = *getSDataTypeFromNode(nodesListGetNode(pFunc->pParameterList, 0)); + return TSDB_CODE_SUCCESS; +} + static int32_t translatePlaceHolderPseudoColumn(SFunctionNode* pFunc, char* pErrBuf, int32_t len) { // pseudo column do not need to check parameters switch (pFunc->funcType) { @@ -7414,6 +7459,41 @@ const SBuiltinFuncDefinition funcMgtBuiltins[] = { .sprocessFunc = streamPseudoScalarFunction, .finalizeFunc = NULL, }, + { + .name = "regexp_extract", + .type = FUNCTION_TYPE_REGEXP_EXTRACT, + .classification = FUNC_MGT_SCALAR_FUNC | FUNC_MGT_STRING_FUNC, + .parameters = {.minParamNum = 2, + .maxParamNum = 3, + .paramInfoPattern = 1, + .inputParaInfo[0][0] = {.isLastParam = false, + .startParam = 1, + .endParam = 1, + .validDataType = FUNC_PARAM_SUPPORT_VARCHAR_TYPE | FUNC_PARAM_SUPPORT_NCHAR_TYPE | FUNC_PARAM_SUPPORT_NULL_TYPE, + .validNodeType = FUNC_PARAM_SUPPORT_EXPR_NODE, + .paramAttribute = FUNC_PARAM_NO_SPECIFIC_ATTRIBUTE, + .valueRangeFlag = FUNC_PARAM_NO_SPECIFIC_VALUE,}, + .inputParaInfo[0][1] = {.isLastParam = false, + .startParam = 2, + .endParam = 2, + .validDataType = FUNC_PARAM_SUPPORT_VARCHAR_TYPE | FUNC_PARAM_SUPPORT_NCHAR_TYPE, + .validNodeType = FUNC_PARAM_SUPPORT_VALUE_NODE, + .paramAttribute = FUNC_PARAM_NO_SPECIFIC_ATTRIBUTE, + .valueRangeFlag = FUNC_PARAM_NO_SPECIFIC_VALUE,}, + .inputParaInfo[0][2] = {.isLastParam = true, + .startParam = 3, + .endParam = 3, + .validDataType = FUNC_PARAM_SUPPORT_INTEGER_TYPE, + .validNodeType = FUNC_PARAM_SUPPORT_VALUE_NODE, + .paramAttribute = 
FUNC_PARAM_NO_SPECIFIC_ATTRIBUTE, + .valueRangeFlag = FUNC_PARAM_NO_SPECIFIC_VALUE,}, + .outputParaInfo = {.validDataType = FUNC_PARAM_SUPPORT_VARCHAR_TYPE | FUNC_PARAM_SUPPORT_NCHAR_TYPE}}, + .translateFunc = translateRegexpExtract, + .getEnvFunc = NULL, + .initFunc = NULL, + .sprocessFunc = regexpExtractFunction, + .finalizeFunc = NULL, + }, }; // clang-format on diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 443a9da20f62..062e655ea433 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1817,6 +1817,175 @@ static int32_t base32Encode(const uint8_t *in, int32_t inLen, char *out) { return outLen; } +int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput) { + int32_t code = TSDB_CODE_SUCCESS; + + int32_t numOfRows = pInput[0].numOfRows; + SColumnInfoData *pStrData = pInput[0].columnData; + SColumnInfoData *pPatData = pInput[1].columnData; + SColumnInfoData *pOutputData = pOutput->columnData; + + if (numOfRows == 0) { + pOutput->numOfRows = 0; + return TSDB_CODE_SUCCESS; + } + + // NULL-type str: all output rows are NULL + if (IS_NULL_TYPE(GET_PARAM_TYPE(&pInput[0])) || IS_NULL_TYPE(GET_PARAM_TYPE(&pInput[1]))) { + colDataSetNNULL(pOutputData, 0, numOfRows); + pOutput->numOfRows = numOfRows; + return TSDB_CODE_SUCCESS; + } + + // NULL pattern: all output rows are NULL + if (colDataIsNull_s(pPatData, 0)) { + colDataSetNNULL(pOutputData, 0, numOfRows); + pOutput->numOfRows = numOfRows; + return TSDB_CODE_SUCCESS; + } + + // Get group_idx (default 1; param[2] is an optional integer constant) + int32_t groupIdx = 1; + if (inputNum == 3 && !IS_NULL_TYPE(GET_PARAM_TYPE(&pInput[2])) && !colDataIsNull_s(pInput[2].columnData, 0)) { + GET_TYPED_DATA(groupIdx, int32_t, GET_PARAM_TYPE(&pInput[2]), + colDataGetData(pInput[2].columnData, 0), + typeGetTypeModFromColInfo(&pInput[2].columnData->info)); + } + if (groupIdx < 0 || groupIdx > 512) { + return 
TSDB_CODE_FUNC_FUNTION_PARA_VALUE; + } + + // Build null-terminated UTF-8 pattern string (pattern is a constant, always 1 row) + char patBuf[512]; + char *patStr = patBuf; + int32_t patLen = 0; + bool needFreePat = false; + { + char *rawPat = varDataVal(colDataGetData(pPatData, 0)); + int32_t rawPatLen = varDataLen(colDataGetData(pPatData, 0)); + if (GET_PARAM_TYPE(&pInput[1]) == TSDB_DATA_TYPE_NCHAR) { + SCL_ERR_RET(convNcharToVarchar(rawPat, &patStr, rawPatLen, &patLen, pInput[1].charsetCxt)); + needFreePat = true; + } else { + patLen = rawPatLen; + if (patLen >= (int32_t)sizeof(patBuf)) { + patStr = taosMemoryMalloc(patLen + 1); + if (patStr == NULL) return terrno; + needFreePat = true; + } + (void)memcpy(patStr, rawPat, patLen); + patStr[patLen] = '\0'; + } + } + + // Compile (or retrieve cached) regex — pattern is constant so cache hits every row + regex_t *regex = NULL; + if (threadGetRegComp(&regex, patStr) != 0) { + code = TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR; + goto _exit; + } + + // regmatch_t array: index 0 = whole match, 1..groupIdx = capture groups + int32_t nmatch = groupIdx + 1; + regmatch_t *pmatch = taosMemoryMalloc(nmatch * sizeof(regmatch_t)); + if (pmatch == NULL) { + code = terrno; + goto _exit; + } + + // Output buffer: same byte width as the str column + int32_t outBufLen = pStrData->info.bytes; + char *outBuf = taosMemoryMalloc(outBufLen); + if (outBuf == NULL) { + taosMemoryFree(pmatch); + code = terrno; + goto _exit; + } + + int32_t strType = GET_PARAM_TYPE(&pInput[0]); + bool isNchar = (strType == TSDB_DATA_TYPE_NCHAR); + + for (int32_t i = 0; i < numOfRows; i++) { + if (colDataIsNull_s(pStrData, i)) { + colDataSetNULL(pOutputData, i); + continue; + } + + char *strRaw = colDataGetData(pStrData, i); + char *strVal = varDataVal(strRaw); + int32_t strLen = varDataLen(strRaw); + + // For NCHAR (UCS-4), convert to UTF-8 before matching + char *strUtf8 = strVal; + int32_t strUtf8Len = strLen; + bool needFreeUtf8 = false; + if (isNchar) { + if 
(convNcharToVarchar(strVal, &strUtf8, strLen, &strUtf8Len, pInput[0].charsetCxt) != 0) { + colDataSetNULL(pOutputData, i); + continue; + } + needFreeUtf8 = true; + } + + // Null-terminate the string for regexec + char ntBuf[1024]; + char *strNt = ntBuf; + bool needFreeNt = false; + if (strUtf8Len >= (int32_t)sizeof(ntBuf)) { + strNt = taosMemoryMalloc(strUtf8Len + 1); + needFreeNt = true; + if (strNt == NULL) { + if (needFreeUtf8) taosMemoryFree(strUtf8); + colDataSetNULL(pOutputData, i); + continue; + } + } + (void)memcpy(strNt, strUtf8, strUtf8Len); + strNt[strUtf8Len] = '\0'; + + int ret = regexec(regex, strNt, nmatch, pmatch, 0); + if (ret != 0 || pmatch[groupIdx].rm_so == -1) { + // REG_NOMATCH, or the requested capture group did not participate + colDataSetNULL(pOutputData, i); + } else { + int32_t matchStart = pmatch[groupIdx].rm_so; + int32_t matchLen = pmatch[groupIdx].rm_eo - pmatch[groupIdx].rm_so; + + if (isNchar) { + // Convert matched UTF-8 bytes back to NCHAR (UCS-4) + char *matchedNchar = NULL; + int32_t matchedNcharLen = 0; + if (convVarcharToNchar(strNt + matchStart, &matchedNchar, matchLen, &matchedNcharLen, + pInput[0].charsetCxt) != 0) { + colDataSetNULL(pOutputData, i); + } else { + *(VarDataLenT *)outBuf = matchedNcharLen; + (void)memcpy(outBuf + VARSTR_HEADER_SIZE, matchedNchar, matchedNcharLen); + taosMemoryFree(matchedNchar); + code = colDataSetVal(pOutputData, i, outBuf, false); + if (code != TSDB_CODE_SUCCESS) terrno = code; + } + } else { + *(VarDataLenT *)outBuf = matchLen; + (void)memcpy(outBuf + VARSTR_HEADER_SIZE, strNt + matchStart, matchLen); + code = colDataSetVal(pOutputData, i, outBuf, false); + if (code != TSDB_CODE_SUCCESS) terrno = code; + } + } + + if (needFreeNt) taosMemoryFree(strNt); + if (needFreeUtf8) taosMemoryFree(strUtf8); + if (code != TSDB_CODE_SUCCESS) break; + } + + taosMemoryFree(outBuf); + taosMemoryFree(pmatch); +_exit: + if (needFreePat) taosMemoryFree(patStr); + pOutput->numOfRows = numOfRows; + return 
code; +} + int32_t generateTotpSecretFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput) { SColumnInfoData *pInputData = pInput->columnData; SColumnInfoData *pOutputData = pOutput->columnData; diff --git a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py new file mode 100644 index 000000000000..0856a1df8402 --- /dev/null +++ b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py @@ -0,0 +1,309 @@ +from new_test_framework.utils import tdLog, tdSql +import datetime + + +class TestFunRegexpExtract: + + def setup_class(cls): + tdLog.debug(f"start to execute {__file__}") + + # ------------------------------------------------------------------ + # Helpers + # ------------------------------------------------------------------ + + def _create_tb(self, dbname="db"): + tdSql.execute(f"""CREATE STABLE {dbname}.st ( + ts TIMESTAMP, vc VARCHAR(128), nc NCHAR(64), iv INT + ) TAGS (t INT)""") + tdSql.execute(f"CREATE TABLE {dbname}.ct1 USING {dbname}.st TAGS(1)") + tdSql.execute(f"CREATE TABLE {dbname}.ct2 USING {dbname}.st TAGS(2)") + tdSql.execute(f"""CREATE TABLE {dbname}.nt ( + ts TIMESTAMP, vc VARCHAR(128), nc NCHAR(64), iv INT + )""") + + def _insert_data(self, dbname="db"): + now = int(datetime.datetime.timestamp(datetime.datetime.now()) * 1000) + # ct1: log-style rows + one NULL row + ct1_rows = [ + (now - 4000, "'code=42,type=DISK_FULL'", "'code=42,type=DISK_FULL'", 42), + (now - 3000, "'code=7,type=LOW_MEM'", "'code=7,type=LOW_MEM'", 7), + (now - 2000, "'code=0,type=OK'", "'code=0,type=OK'", 0), + (now - 1000, "NULL", "NULL", "NULL"), + ] + for ts, vc, nc, iv in ct1_rows: + tdSql.execute(f"INSERT INTO {dbname}.ct1 VALUES({ts}, {vc}, {nc}, {iv})") + # ct2: URL-style rows + ct2_rows = [ + (now - 3000, "'https://example.com'", "'https://example.com'", 1), + (now - 2000, "'http://api.example.org'", "'http://api.example.org'", 2), + (now - 1000, 
"'ftp://files.example.net'", "'ftp://files.example.net'", 3), + ] + for ts, vc, nc, iv in ct2_rows: + tdSql.execute(f"INSERT INTO {dbname}.ct2 VALUES({ts}, {vc}, {nc}, {iv})") + # nt: same as ct1 + for ts, vc, nc, iv in ct1_rows: + tdSql.execute(f"INSERT INTO {dbname}.nt VALUES({ts}, {vc}, {nc}, {iv})") + + def _check_basic(self, dbname="db"): + # ----------------------------------------------------------------- + # §1 Default group_idx=1 — no-table queries + # ----------------------------------------------------------------- + # RXE-BASIC-001: single capture group → group 1 + tdSql.query("SELECT REGEXP_EXTRACT('abc', '(b)')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, 'b') + + # RXE-BASIC-002: multiple capture groups, default → group 1 only + tdSql.query("SELECT REGEXP_EXTRACT('abc', '(b)(c)')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, 'b') + + # RXE-BASIC-003: no capture group, default group_idx=1 → NULL + tdSql.query("SELECT REGEXP_EXTRACT('abc', 'b')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, None) + + # ----------------------------------------------------------------- + # §2 group_idx=0 whole match + # ----------------------------------------------------------------- + # RXE-GRP0-001: no capture group, group_idx=0 → whole match + tdSql.query("SELECT REGEXP_EXTRACT('abc', 'b', 0)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, 'b') + + # RXE-GRP0-002: with capture group, group_idx=0 → whole match ≠ group 1 + tdSql.query("SELECT REGEXP_EXTRACT('abc', '(b)c', 0)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, 'bc') + + # RXE-GRP0-003: no match, group_idx=0 → NULL + tdSql.query("SELECT REGEXP_EXTRACT('abc', 'x+', 0)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, None) + + # ----------------------------------------------------------------- + # §3 Multiple capture groups by explicit index + # ----------------------------------------------------------------- + # RXE-GRP-001: explicit group_idx=1 → group 1 + tdSql.query("SELECT REGEXP_EXTRACT('abc', '(b)(c)', 
1)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, 'b') + + # RXE-GRP-002: explicit group_idx=2 → group 2 + tdSql.query("SELECT REGEXP_EXTRACT('abc', '(b)(c)', 2)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, 'c') + + # RXE-GRP-003: group_idx out of range → NULL, no error + tdSql.query("SELECT REGEXP_EXTRACT('abc', '(b)(c)', 3)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, None) + + # ----------------------------------------------------------------- + # §4 NULL and no-match + # ----------------------------------------------------------------- + # RXE-NULL-001: str=NULL → NULL + tdSql.query("SELECT REGEXP_EXTRACT(NULL, '(a+)')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, None) + + # RXE-NULL-002: no match → NULL + tdSql.query("SELECT REGEXP_EXTRACT('abc', '(x+)')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, None) + + # RXE-NULL-003: multiple matches, only first (leftmost) returned + tdSql.query("SELECT REGEXP_EXTRACT('a1b2', '([0-9])')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, '1') + + # RXE-NULL-004: str=NULL with group_idx=0 → NULL + tdSql.query("SELECT REGEXP_EXTRACT(NULL, 'a+', 0)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, None) + + # RXE-NULL-005: non-participating group in alternation → NULL + # pattern '(a)|(b)' matches 'b' via group 2; group 1 did not participate + tdSql.query("SELECT REGEXP_EXTRACT('b', '(a)|(b)', 1)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, None) + + # participating group 2 → 'b' + tdSql.query("SELECT REGEXP_EXTRACT('b', '(a)|(b)', 2)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, 'b') + + # ----------------------------------------------------------------- + # §5 Empty string scenarios + # ----------------------------------------------------------------- + # RXE-EMPTY-001: capture group matches empty string → '' (not NULL) + tdSql.query("SELECT REGEXP_EXTRACT('ac', '(b?)')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, '') + + # RXE-EMPTY-002: empty input str with zero-length match → '' + tdSql.query("SELECT 
REGEXP_EXTRACT('', '(a*)')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, '') + + # ----------------------------------------------------------------- + # §6 Table queries — per-row scalar behavior + # ----------------------------------------------------------------- + # RXE-TBL-001: extract numeric code — only first row matches 'code=N' + # fully; verify row-by-row extraction + tdSql.query(f"SELECT REGEXP_EXTRACT(vc, 'code=([0-9]+)') FROM {dbname}.ct1 ORDER BY ts") + tdSql.checkRows(4) + tdSql.checkData(0, 0, '42') + tdSql.checkData(1, 0, '7') + tdSql.checkData(2, 0, '0') + tdSql.checkData(3, 0, None) # NULL row → NULL + + # RXE-TBL-002: NULL column row → NULL; non-NULL rows → extracted value + tdSql.query(f"SELECT REGEXP_EXTRACT(vc, 'type=([A-Z_]+)') FROM {dbname}.ct1 ORDER BY ts") + tdSql.checkRows(4) + tdSql.checkData(0, 0, 'DISK_FULL') + tdSql.checkData(1, 0, 'LOW_MEM') + tdSql.checkData(2, 0, 'OK') + tdSql.checkData(3, 0, None) + + # RXE-TBL-003: empty table → 0 rows, no error + tdSql.execute(f"CREATE TABLE IF NOT EXISTS {dbname}.empty_t (ts TIMESTAMP, vc VARCHAR(64))") + tdSql.query(f"SELECT REGEXP_EXTRACT(vc, '([0-9]+)') FROM {dbname}.empty_t") + tdSql.checkRows(0) + + # ----------------------------------------------------------------- + # §7 WHERE clause + # ----------------------------------------------------------------- + # RXE-WHERE-001: IS NOT NULL filters to rows with a match + tdSql.query(f"SELECT vc FROM {dbname}.ct1 " + "WHERE REGEXP_EXTRACT(vc, 'code=([4-9][0-9]+)') IS NOT NULL " + "ORDER BY ts") + tdSql.checkRows(1) + tdSql.checkData(0, 0, 'code=42,type=DISK_FULL') + + # RXE-WHERE-002: equality on extracted scheme value + tdSql.query(f"SELECT vc FROM {dbname}.ct2 " + "WHERE REGEXP_EXTRACT(vc, '(https?)://') = 'https' " + "ORDER BY ts") + tdSql.checkRows(1) + tdSql.checkData(0, 0, 'https://example.com') + + # ----------------------------------------------------------------- + # §8 NCHAR column: extraction result equals VARCHAR equivalent + # 
----------------------------------------------------------------- + # RXE-NCHAR-001: NCHAR input yields same extracted value as VARCHAR + tdSql.query(f"SELECT REGEXP_EXTRACT(nc, 'code=([0-9]+)') FROM {dbname}.ct1 ORDER BY ts") + tdSql.checkRows(4) + tdSql.checkData(0, 0, '42') + tdSql.checkData(1, 0, '7') + tdSql.checkData(2, 0, '0') + tdSql.checkData(3, 0, None) + + # ----------------------------------------------------------------- + # §9 Subquery with GROUP BY + # ----------------------------------------------------------------- + # RXE-SUB-001: group by extracted URL scheme + tdSql.query(f"""SELECT scheme, COUNT(*) AS cnt + FROM (SELECT REGEXP_EXTRACT(vc, '(https?)://') AS scheme FROM {dbname}.ct2) t + WHERE scheme IS NOT NULL + GROUP BY scheme + ORDER BY scheme""") + tdSql.checkRows(2) + tdSql.checkData(0, 0, 'http') + tdSql.checkData(0, 1, 1) + tdSql.checkData(1, 0, 'https') + tdSql.checkData(1, 1, 1) + + # ----------------------------------------------------------------- + # §10 ERE features (character class, anchors, case sensitivity) + # ----------------------------------------------------------------- + # RXE-RE-001: character class extracts decimal number + tdSql.query("SELECT REGEXP_EXTRACT('v=3.14', '([0-9]+\\.[0-9]+)')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, '3.14') + + # RXE-RE-002a: anchor ^ matches at start → 'a' + tdSql.query("SELECT REGEXP_EXTRACT('abc', '^(a)')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, 'a') + + # RXE-RE-002b: anchor ^ requires position 0; 'x' blocks match → NULL + tdSql.query("SELECT REGEXP_EXTRACT('xabc', '^(a)')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, None) + + # RXE-RE-003a: case-sensitive by default → NULL + tdSql.query("SELECT REGEXP_EXTRACT('ABC', '(abc)')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, None) + + # RXE-RE-003b: LOWER() enables case-insensitive extraction → 'abc' + tdSql.query("SELECT REGEXP_EXTRACT(LOWER('ABC'), '(abc)')") + tdSql.checkRows(1) + tdSql.checkData(0, 0, 'abc') + + def 
_check_error(self, dbname="db"): + # ----------------------------------------------------------------- + # §11 Error cases + # ----------------------------------------------------------------- + # RXE-ERR-001: too few arguments (1) + tdSql.error("SELECT REGEXP_EXTRACT('abc')") + + # RXE-ERR-002: too many arguments (4) + tdSql.error("SELECT REGEXP_EXTRACT('abc', '(b)', 1, 0)") + + # RXE-ERR-003: str is non-string type (INT column) + tdSql.error(f"SELECT REGEXP_EXTRACT(iv, '([0-9]+)') FROM {dbname}.ct1") + + # RXE-ERR-004: pattern is a column reference (not a constant) + tdSql.error(f"SELECT REGEXP_EXTRACT(vc, vc) FROM {dbname}.ct1") + + # RXE-ERR-005: negative group_idx → translation-phase error + tdSql.error("SELECT REGEXP_EXTRACT('abc', '(b)', -1)") + + # RXE-ERR-006: invalid regex (unmatched parenthesis) + tdSql.error("SELECT REGEXP_EXTRACT('abc', '(b', 1)") + + def all_test(self, dbname="db"): + self._check_basic(dbname) + self._check_error(dbname) + + def test_fun_sca_regexp_extract(self): + """Fun: regexp_extract() + + 1. regexp_extract default group_idx=1 returns first capture group + 2. regexp_extract group_idx=0 returns whole match substring + 3. regexp_extract with explicit group index (1, 2, out-of-range) + 4. regexp_extract NULL input and no-match return NULL + 5. regexp_extract capture group matching empty string returns '' + 6. regexp_extract on table columns with per-row scalar semantics + 7. regexp_extract in WHERE clause for row filtering + 8. regexp_extract on NCHAR column (return type NCHAR) + 9. regexp_extract in subquery with GROUP BY + 10. regexp_extract POSIX ERE features: character class, anchors, case sensitivity + 11. 
regexp_extract invalid parameter error cases + + Since: v3.4.2.0 + + Labels: common,ci + + Jira: None + + History: + - 2026-04-20 Stephen Created + """ + tdSql.prepare() + + tdLog.printNoPrefix("==========step1:create table") + self._create_tb() + + tdLog.printNoPrefix("==========step2:insert data") + self._insert_data() + + tdLog.printNoPrefix("==========step3:all check") + self.all_test() + + tdSql.execute("flush database db") + + tdLog.printNoPrefix("==========step4:after wal, all check again") + self.all_test() From 6d2e73ab2f74846a5cfe56a1549b23dbae11dabb Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Tue, 21 Apr 2026 02:20:28 -0700 Subject: [PATCH 02/48] fix ai review issues --- source/libs/function/src/builtins.c | 52 ++++++++++++++++++++++------- source/libs/scalar/src/sclfunc.c | 15 ++++++--- 2 files changed, 51 insertions(+), 16 deletions(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index 1a748f573655..0797b6e76c1a 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1112,19 +1112,46 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: pattern must be a constant"); } - // Validate the regex pattern compiles as POSIX ERE + // Validate the regex pattern compiles as POSIX ERE. + // For NCHAR patterns datum.p holds the UCS-4 vardata (populated before function + // translate is called); convert it to UTF-8 to match the runtime path in + // regexpExtractFunction. For VARCHAR patterns literal is already UTF-8. 
SValueNode* pPatVal = (SValueNode*)pPatNode; - if (pPatVal->literal != NULL) { - regex_t re; - int ret = regcomp(&re, pPatVal->literal, REG_EXTENDED); - if (ret != 0) { - char msgbuf[256] = {0}; - (void)regerror(ret, &re, msgbuf, sizeof(msgbuf)); - regfree(&re); - return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR, - "Invalid regex pattern for regexp_extract: %s", msgbuf); + { + const char *regPattern = pPatVal->literal; + char *utf8Pat = NULL; + bool freeUtf8Pat = false; + + if (pPatVal->node.resType.type == TSDB_DATA_TYPE_NCHAR && pPatVal->datum.p != NULL) { + int32_t ncharBytes = varDataLen(pPatVal->datum.p); + utf8Pat = taosMemoryCalloc(ncharBytes + 1, 1); + if (utf8Pat == NULL) return terrno; + int32_t utf8Len = taosUcs4ToMbs((TdUcs4*)varDataVal(pPatVal->datum.p), ncharBytes, + utf8Pat, pPatVal->charsetCxt); + if (utf8Len < 0) { + taosMemoryFree(utf8Pat); + return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR, + "regexp_extract: failed to convert NCHAR pattern to UTF-8"); + } + utf8Pat[utf8Len] = '\0'; + regPattern = utf8Pat; + freeUtf8Pat = true; } - regfree(&re); + + if (regPattern != NULL) { + regex_t re; + int ret = regcomp(&re, regPattern, REG_EXTENDED); + if (ret != 0) { + char msgbuf[256] = {0}; + (void)regerror(ret, &re, msgbuf, sizeof(msgbuf)); + // do not call regfree — regcomp failed, re contents are undefined (POSIX) + if (freeUtf8Pat) taosMemoryFree(utf8Pat); + return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR, + "Invalid regex pattern for regexp_extract: %s", msgbuf); + } + regfree(&re); // only reached when regcomp succeeded + } + if (freeUtf8Pat) taosMemoryFree(utf8Pat); } // param[2]: group_idx (optional) must be a non-negative integer constant @@ -1150,7 +1177,8 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 // return type is same as first input parameter's type static int32_t translateOutFirstIn(SFunctionNode* pFunc, char* pErrBuf, 
int32_t len) { - FUNC_ERR_RET(validateParam(pFunc, pErrBuf, len)); pFunc->node.resType = *getSDataTypeFromNode(nodesListGetNode(pFunc->pParameterList, 0)); + FUNC_ERR_RET(validateParam(pFunc, pErrBuf, len)); + pFunc->node.resType = *getSDataTypeFromNode(nodesListGetNode(pFunc->pParameterList, 0)); return TSDB_CODE_SUCCESS; } diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 062e655ea433..20fda82fa1ba 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1880,8 +1880,8 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar // Compile (or retrieve cached) regex — pattern is constant so cache hits every row regex_t *regex = NULL; - if (threadGetRegComp(&regex, patStr) != 0) { - code = TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR; + code = threadGetRegComp(&regex, patStr); + if (code != 0) { goto _exit; } @@ -1944,9 +1944,16 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar strNt[strUtf8Len] = '\0'; int ret = regexec(regex, strNt, nmatch, pmatch, 0); - if (ret != 0 || pmatch[groupIdx].rm_so == -1) { - // REG_NOMATCH, or the requested capture group did not participate + if (ret == REG_NOMATCH || (ret == 0 && pmatch[groupIdx].rm_so == -1)) { + // no match, or the requested capture group did not participate colDataSetNULL(pOutputData, i); + } else if (ret != 0) { + // real regex execution error (e.g. 
REG_ESPACE) + code = TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR; + terrno = code; + if (needFreeNt) taosMemoryFree(strNt); + if (needFreeUtf8) taosMemoryFree(strUtf8); + break; } else { int32_t matchStart = pmatch[groupIdx].rm_so; int32_t matchLen = pmatch[groupIdx].rm_eo - pmatch[groupIdx].rm_so; From 654819206bed252d7517f58892232b17822749af Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Tue, 21 Apr 2026 02:26:03 -0700 Subject: [PATCH 03/48] fix c null-terminated string's mem manipulation --- source/libs/scalar/src/sclfunc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 20fda82fa1ba..fdbb975f8a29 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1866,6 +1866,17 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar if (GET_PARAM_TYPE(&pInput[1]) == TSDB_DATA_TYPE_NCHAR) { SCL_ERR_RET(convNcharToVarchar(rawPat, &patStr, rawPatLen, &patLen, pInput[1].charsetCxt)); needFreePat = true; + // convNcharToVarchar allocates rawPatLen bytes (no +1 for NUL); when the + // UTF-8 output fills the buffer entirely there is no room for a terminator. + // threadGetRegComp requires a NUL-terminated string — grow by one byte. 
+ char *tmp = taosMemoryRealloc(patStr, patLen + 1); + if (tmp == NULL) { + taosMemoryFree(patStr); + needFreePat = false; + return terrno; + } + patStr = tmp; + patStr[patLen] = '\0'; } else { patLen = rawPatLen; if (patLen >= (int32_t)sizeof(patBuf)) { From d770e45992fa8e5eb0f3ca42297162bba83d784d Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Tue, 21 Apr 2026 02:45:20 -0700 Subject: [PATCH 04/48] Update source/libs/scalar/src/sclfunc.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/scalar/src/sclfunc.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index fdbb975f8a29..f8a36d867681 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1954,9 +1954,14 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar (void)memcpy(strNt, strUtf8, strUtf8Len); strNt[strUtf8Len] = '\0'; - int ret = regexec(regex, strNt, nmatch, pmatch, 0); - if (ret == REG_NOMATCH || (ret == 0 && pmatch[groupIdx].rm_so == -1)) { - // no match, or the requested capture group did not participate + int ret = regexec(regex, strNt, nmatch, pmatch, 0); + bool requestedGroupAvailable = + (groupIdx >= 0) && + ((size_t)groupIdx < nmatch) && + ((size_t)groupIdx <= regex->re_nsub); + + if (ret == REG_NOMATCH || (ret == 0 && (!requestedGroupAvailable || pmatch[groupIdx].rm_so == -1))) { + // no match, the requested capture group does not exist, or it did not participate colDataSetNULL(pOutputData, i); } else if (ret != 0) { // real regex execution error (e.g. 
REG_ESPACE) From dc985caa466f85d8a54d116aa9df85cd7d22a17f Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Tue, 21 Apr 2026 02:45:52 -0700 Subject: [PATCH 05/48] Update source/libs/scalar/src/sclfunc.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/scalar/src/sclfunc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index f8a36d867681..15d3388c8f21 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1927,13 +1927,13 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar int32_t strLen = varDataLen(strRaw); // For NCHAR (UCS-4), convert to UTF-8 before matching - char *strUtf8 = strVal; - int32_t strUtf8Len = strLen; + char *strUtf8 = strVal; + int32_t strUtf8Len = strLen; bool needFreeUtf8 = false; if (isNchar) { if (convNcharToVarchar(strVal, &strUtf8, strLen, &strUtf8Len, pInput[0].charsetCxt) != 0) { - colDataSetNULL(pOutputData, i); - continue; + terrno = TSDB_CODE_SCALAR_CONVERT_ERROR; + return terrno; } needFreeUtf8 = true; } From c4a561953f2b93fdfd12ddc9235adb55b2a00936 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Tue, 21 Apr 2026 02:46:14 -0700 Subject: [PATCH 06/48] Update source/libs/function/src/builtins.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/function/src/builtins.c | 47 +++++++++++++++++------------ 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index 0797b6e76c1a..d741d2773373 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1113,29 +1113,38 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 } // Validate the regex pattern compiles as POSIX ERE. 
- // For NCHAR patterns datum.p holds the UCS-4 vardata (populated before function - // translate is called); convert it to UTF-8 to match the runtime path in - // regexpExtractFunction. For VARCHAR patterns literal is already UTF-8. + // For prepared-statement placeholders, literal may contain the placeholder + // token (for example "?") instead of the bound pattern. Prefer the + // materialized datum when available, and otherwise defer validation to + // runtime for placeholders. For NCHAR patterns datum.p holds UCS-4 vardata; + // convert it to UTF-8 to match the runtime path in regexpExtractFunction. SValueNode* pPatVal = (SValueNode*)pPatNode; { - const char *regPattern = pPatVal->literal; - char *utf8Pat = NULL; + const char* regPattern = NULL; + char* utf8Pat = NULL; bool freeUtf8Pat = false; - - if (pPatVal->node.resType.type == TSDB_DATA_TYPE_NCHAR && pPatVal->datum.p != NULL) { - int32_t ncharBytes = varDataLen(pPatVal->datum.p); - utf8Pat = taosMemoryCalloc(ncharBytes + 1, 1); - if (utf8Pat == NULL) return terrno; - int32_t utf8Len = taosUcs4ToMbs((TdUcs4*)varDataVal(pPatVal->datum.p), ncharBytes, - utf8Pat, pPatVal->charsetCxt); - if (utf8Len < 0) { - taosMemoryFree(utf8Pat); - return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR, - "regexp_extract: failed to convert NCHAR pattern to UTF-8"); + bool deferValidation = (pPatVal->placeholderNo != 0 && pPatVal->datum.p == NULL); + + if (!deferValidation) { + if (pPatVal->node.resType.type == TSDB_DATA_TYPE_NCHAR && pPatVal->datum.p != NULL) { + int32_t ncharBytes = varDataLen(pPatVal->datum.p); + utf8Pat = taosMemoryCalloc(ncharBytes + 1, 1); + if (utf8Pat == NULL) return terrno; + int32_t utf8Len = taosUcs4ToMbs((TdUcs4*)varDataVal(pPatVal->datum.p), ncharBytes, + utf8Pat, pPatVal->charsetCxt); + if (utf8Len < 0) { + taosMemoryFree(utf8Pat); + return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR, + "regexp_extract: failed to convert NCHAR pattern to UTF-8"); + 
} + utf8Pat[utf8Len] = '\0'; + regPattern = utf8Pat; + freeUtf8Pat = true; + } else if (pPatVal->datum.p != NULL) { + regPattern = (const char*)varDataVal(pPatVal->datum.p); + } else { + regPattern = pPatVal->literal; } - utf8Pat[utf8Len] = '\0'; - regPattern = utf8Pat; - freeUtf8Pat = true; } if (regPattern != NULL) { From 6a151e29bc57b96c7e4cece5bdf8a154397d4308 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Tue, 21 Apr 2026 02:55:55 -0700 Subject: [PATCH 07/48] Apply suggestions from copilot code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/scalar/src/sclfunc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 15d3388c8f21..43108ea2f47f 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1947,8 +1947,8 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar needFreeNt = true; if (strNt == NULL) { if (needFreeUtf8) taosMemoryFree(strUtf8); - colDataSetNULL(pOutputData, i); - continue; + code = terrno; + break; } } (void)memcpy(strNt, strUtf8, strUtf8Len); @@ -1978,9 +1978,12 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar // Convert matched UTF-8 bytes back to NCHAR (UCS-4) char *matchedNchar = NULL; int32_t matchedNcharLen = 0; - if (convVarcharToNchar(strNt + matchStart, &matchedNchar, matchLen, &matchedNcharLen, - pInput[0].charsetCxt) != 0) { - colDataSetNULL(pOutputData, i); + int32_t convCode = + convVarcharToNchar(strNt + matchStart, &matchedNchar, matchLen, &matchedNcharLen, pInput[0].charsetCxt); + if (convCode != 0) { + code = convCode; + terrno = code; + if (matchedNchar != NULL) taosMemoryFree(matchedNchar); } else { *(VarDataLenT *)outBuf = matchedNcharLen; (void)memcpy(outBuf + VARSTR_HEADER_SIZE, matchedNchar, matchedNcharLen); From 
c7a78f41bf5ce9c168709721aec437135611ca1d Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Tue, 21 Apr 2026 03:09:27 -0700 Subject: [PATCH 08/48] fix group idx range checking with prepared statements --- source/libs/function/src/builtins.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index d741d2773373..c9ee9f55d74f 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1173,9 +1173,13 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 if (!IS_INTEGER_TYPE(pIdxVal->node.resType.type)) { return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: group_idx must be an integer"); } - int64_t groupIdx = taosStr2Int64(pIdxVal->literal, NULL, 10); - if (groupIdx < 0 || groupIdx > 512) { - return invaildFuncParaValueErrMsg(pErrBuf, len, "regexp_extract: group_idx must be between 0 and 512"); + // Skip range validation for prepared-statement placeholders — the bound value + // is not yet known; the runtime check in regexpExtractFunction applies instead. 
+ if (pIdxVal->placeholderNo == 0) { + int64_t groupIdx = taosStr2Int64(pIdxVal->literal, NULL, 10); + if (groupIdx < 0 || groupIdx > 512) { + return invaildFuncParaValueErrMsg(pErrBuf, len, "regexp_extract: group_idx must be between 0 and 512"); + } } } From a41023d3fe81c7dfcbc4218b949fb27d8789d971 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Tue, 21 Apr 2026 03:37:51 -0700 Subject: [PATCH 09/48] manage mem buffer outside of loop --- source/libs/scalar/src/sclfunc.c | 58 +++++++++++++++----------------- 1 file changed, 27 insertions(+), 31 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 43108ea2f47f..9f7081901362 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1916,6 +1916,10 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar int32_t strType = GET_PARAM_TYPE(&pInput[0]); bool isNchar = (strType == TSDB_DATA_TYPE_NCHAR); + // Null-termination buffer shared across rows — grown via realloc only when needed + char *strNt = NULL; + int32_t strNtCap = 0; + for (int32_t i = 0; i < numOfRows; i++) { if (colDataIsNull_s(pStrData, i)) { colDataSetNULL(pOutputData, i); @@ -1932,42 +1936,35 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar bool needFreeUtf8 = false; if (isNchar) { if (convNcharToVarchar(strVal, &strUtf8, strLen, &strUtf8Len, pInput[0].charsetCxt) != 0) { - terrno = TSDB_CODE_SCALAR_CONVERT_ERROR; - return terrno; + code = TSDB_CODE_SCALAR_CONVERT_ERROR; + terrno = code; + break; } needFreeUtf8 = true; } - // Null-terminate the string for regexec - char ntBuf[1024]; - char *strNt = ntBuf; - bool needFreeNt = false; - if (strUtf8Len >= (int32_t)sizeof(ntBuf)) { - strNt = taosMemoryMalloc(strUtf8Len + 1); - needFreeNt = true; - if (strNt == NULL) { + // Grow the null-termination buffer only when the current row needs more space + if (strUtf8Len + 1 > strNtCap) { + char *tmp = taosMemoryRealloc(strNt, 
strUtf8Len + 1); + if (tmp == NULL) { if (needFreeUtf8) taosMemoryFree(strUtf8); code = terrno; break; } + strNt = tmp; + strNtCap = strUtf8Len + 1; } (void)memcpy(strNt, strUtf8, strUtf8Len); strNt[strUtf8Len] = '\0'; - int ret = regexec(regex, strNt, nmatch, pmatch, 0); - bool requestedGroupAvailable = - (groupIdx >= 0) && - ((size_t)groupIdx < nmatch) && - ((size_t)groupIdx <= regex->re_nsub); - - if (ret == REG_NOMATCH || (ret == 0 && (!requestedGroupAvailable || pmatch[groupIdx].rm_so == -1))) { - // no match, the requested capture group does not exist, or it did not participate + int ret = regexec(regex, strNt, nmatch, pmatch, 0); + if (ret == REG_NOMATCH || (ret == 0 && pmatch[groupIdx].rm_so == -1)) { + // no match, or the requested capture group did not participate colDataSetNULL(pOutputData, i); } else if (ret != 0) { // real regex execution error (e.g. REG_ESPACE) code = TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR; terrno = code; - if (needFreeNt) taosMemoryFree(strNt); if (needFreeUtf8) taosMemoryFree(strUtf8); break; } else { @@ -1978,19 +1975,18 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar // Convert matched UTF-8 bytes back to NCHAR (UCS-4) char *matchedNchar = NULL; int32_t matchedNcharLen = 0; - int32_t convCode = - convVarcharToNchar(strNt + matchStart, &matchedNchar, matchLen, &matchedNcharLen, pInput[0].charsetCxt); - if (convCode != 0) { - code = convCode; + code = convVarcharToNchar(strNt + matchStart, &matchedNchar, matchLen, &matchedNcharLen, + pInput[0].charsetCxt); + if (code != TSDB_CODE_SUCCESS) { terrno = code; - if (matchedNchar != NULL) taosMemoryFree(matchedNchar); - } else { - *(VarDataLenT *)outBuf = matchedNcharLen; - (void)memcpy(outBuf + VARSTR_HEADER_SIZE, matchedNchar, matchedNcharLen); - taosMemoryFree(matchedNchar); - code = colDataSetVal(pOutputData, i, outBuf, false); - if (code != TSDB_CODE_SUCCESS) terrno = code; + if (needFreeUtf8) taosMemoryFree(strUtf8); + break; } + *(VarDataLenT 
*)outBuf = matchedNcharLen; + (void)memcpy(outBuf + VARSTR_HEADER_SIZE, matchedNchar, matchedNcharLen); + taosMemoryFree(matchedNchar); + code = colDataSetVal(pOutputData, i, outBuf, false); + if (code != TSDB_CODE_SUCCESS) terrno = code; } else { *(VarDataLenT *)outBuf = matchLen; (void)memcpy(outBuf + VARSTR_HEADER_SIZE, strNt + matchStart, matchLen); @@ -1999,11 +1995,11 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar } } - if (needFreeNt) taosMemoryFree(strNt); if (needFreeUtf8) taosMemoryFree(strUtf8); if (code != TSDB_CODE_SUCCESS) break; } + taosMemoryFree(strNt); taosMemoryFree(outBuf); taosMemoryFree(pmatch); _exit: From b38457890cd7ae0dda60b8d63a8f2f47cb5fa9f7 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Tue, 21 Apr 2026 04:01:04 -0700 Subject: [PATCH 10/48] use int64_t to validate group index arg instead of int32_t for potential overflow --- source/libs/scalar/src/sclfunc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 9f7081901362..213069d517f0 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1844,16 +1844,19 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar return TSDB_CODE_SUCCESS; } - // Get group_idx (default 1; param[2] is an optional integer constant) - int32_t groupIdx = 1; + // Get group_idx (default 1; param[2] is an optional integer constant). + // Read into int64_t first to avoid silent truncation/wrap for BIGINT/UBIGINT + // placeholder values before the range check, then cast after validation. 
+ int64_t groupIdxRaw = 1; if (inputNum == 3 && !IS_NULL_TYPE(GET_PARAM_TYPE(&pInput[2])) && !colDataIsNull_s(pInput[2].columnData, 0)) { - GET_TYPED_DATA(groupIdx, int32_t, GET_PARAM_TYPE(&pInput[2]), + GET_TYPED_DATA(groupIdxRaw, int64_t, GET_PARAM_TYPE(&pInput[2]), colDataGetData(pInput[2].columnData, 0), typeGetTypeModFromColInfo(&pInput[2].columnData->info)); } - if (groupIdx < 0 || groupIdx > 512) { + if (groupIdxRaw < 0 || groupIdxRaw > 512) { return TSDB_CODE_FUNC_FUNTION_PARA_VALUE; } + int32_t groupIdx = (int32_t)groupIdxRaw; // Build null-terminated UTF-8 pattern string (pattern is a constant, always 1 row) char patBuf[512]; From 3505a52cffee721fb3415f9d03c084da43d27242 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Tue, 21 Apr 2026 22:34:54 -0700 Subject: [PATCH 11/48] fix(ext-win): fix external window compilation --- source/libs/executor/src/externalwindowoperator.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/executor/src/externalwindowoperator.c b/source/libs/executor/src/externalwindowoperator.c index af181549f93d..061c18255f68 100644 --- a/source/libs/executor/src/externalwindowoperator.c +++ b/source/libs/executor/src/externalwindowoperator.c @@ -2494,7 +2494,7 @@ static int32_t extWinApplyAggPostProjection(SOperatorInfo* pOperator, SExternalW SSDataBlock* pSlice = pExtW->pProjTmpBlock; TAOS_CHECK_EXIT(projectApplyFunctions(pExtW->projSupp.pExprInfo, pSlice, pSlice, pExtW->projSupp.pCtx, pExtW->projSupp.numOfExprs, NULL, - GET_STM_RTINFO(pOperator->pTaskInfo))); + GET_STM_RTINFO(pOperator->pTaskInfo), pOperator->pTaskInfo)); int32_t numOfCols = taosArrayGetSize(pBlock->pDataBlock); // TODO(perf): only copy back the slots actually written by projSupp, not all columns. 
From f40ccddfed6a565a4e2ca0799a719fdbf850ad49 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Wed, 22 Apr 2026 00:15:19 -0700 Subject: [PATCH 12/48] Update source/libs/executor/src/externalwindowoperator.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/executor/src/externalwindowoperator.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/source/libs/executor/src/externalwindowoperator.c b/source/libs/executor/src/externalwindowoperator.c index e8b979e7046a..79b433a80d05 100644 --- a/source/libs/executor/src/externalwindowoperator.c +++ b/source/libs/executor/src/externalwindowoperator.c @@ -2493,13 +2493,8 @@ static int32_t extWinApplyAggPostProjection(SOperatorInfo* pOperator, SExternalW SSDataBlock* pSlice = pExtW->pProjTmpBlock; TAOS_CHECK_EXIT(projectApplyFunctions(pExtW->projSupp.pExprInfo, pSlice, pSlice, pExtW->projSupp.pCtx, -<<<<<<< fix/6968250338 pExtW->projSupp.numOfExprs, NULL, GET_STM_RTINFO(pOperator->pTaskInfo), pOperator->pTaskInfo)); -======= - pExtW->projSupp.numOfExprs, NULL, GET_STM_RTINFO(pOperator->pTaskInfo), - pOperator->pTaskInfo)); ->>>>>>> 3.0 int32_t numOfCols = taosArrayGetSize(pBlock->pDataBlock); // TODO(perf): only copy back the slots actually written by projSupp, not all columns. 
From 3213b43d28d9ebd049af8c08c5254aedf4ff0744 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Wed, 22 Apr 2026 00:22:48 -0700 Subject: [PATCH 13/48] Update source/libs/scalar/src/sclfunc.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/scalar/src/sclfunc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 9466c93d9656..2584a8eb4f0f 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1938,8 +1938,8 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar int32_t strUtf8Len = strLen; bool needFreeUtf8 = false; if (isNchar) { - if (convNcharToVarchar(strVal, &strUtf8, strLen, &strUtf8Len, pInput[0].charsetCxt) != 0) { - code = TSDB_CODE_SCALAR_CONVERT_ERROR; + code = convNcharToVarchar(strVal, &strUtf8, strLen, &strUtf8Len, pInput[0].charsetCxt); + if (code != TSDB_CODE_SUCCESS) { terrno = code; break; } From 9e090bef532c88ece43a93b7162606b7d9ecd2df Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Wed, 22 Apr 2026 00:31:35 -0700 Subject: [PATCH 14/48] Update source/libs/scalar/src/sclfunc.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/scalar/src/sclfunc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 2584a8eb4f0f..b1fd18b658cd 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1899,13 +1899,16 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar goto _exit; } - // regmatch_t array: index 0 = whole match, 1..groupIdx = capture groups + // regmatch_t array: index 0 = whole match, 1..groupIdx = capture groups. 
+ // Initialize all entries to -1 so any submatch slots not written by regexec + // (for example when groupIdx exceeds regex->re_nsub) remain deterministic. int32_t nmatch = groupIdx + 1; regmatch_t *pmatch = taosMemoryMalloc(nmatch * sizeof(regmatch_t)); if (pmatch == NULL) { code = terrno; goto _exit; } + (void)memset(pmatch, 0xFF, nmatch * sizeof(regmatch_t)); // Output buffer: same byte width as the str column int32_t outBufLen = pStrData->info.bytes; From 6bdc547c534e032c95fb0c857d69d1f89675844f Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Wed, 22 Apr 2026 00:32:35 -0700 Subject: [PATCH 15/48] Update test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../11-Functions/01-Scalar/test_fun_sca_regexp_extract.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py index 0856a1df8402..e634fb819222 100644 --- a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py +++ b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py @@ -149,8 +149,8 @@ def _check_basic(self, dbname="db"): # ----------------------------------------------------------------- # §6 Table queries — per-row scalar behavior # ----------------------------------------------------------------- - # RXE-TBL-001: extract numeric code — only first row matches 'code=N' - # fully; verify row-by-row extraction + # RXE-TBL-001: extract numeric code — multiple rows match 'code=([0-9]+)'; + # verify row-by-row extraction for 42, 7, 0, and NULL propagation tdSql.query(f"SELECT REGEXP_EXTRACT(vc, 'code=([0-9]+)') FROM {dbname}.ct1 ORDER BY ts") tdSql.checkRows(4) tdSql.checkData(0, 0, '42') From b11c32417fe5ee8d48b877476e58889ce7964ab5 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Wed, 22 Apr 2026 01:45:23 
-0700 Subject: [PATCH 16/48] use cleanup block instead of direct return --- source/libs/function/src/builtins.c | 10 +++++++++- source/libs/scalar/src/sclfunc.c | 8 ++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index 9af4eb7f8ded..51593b4a9a5a 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1148,7 +1148,15 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 regPattern = utf8Pat; freeUtf8Pat = true; } else if (pPatVal->datum.p != NULL) { - regPattern = (const char*)varDataVal(pPatVal->datum.p); + // datum.p is a length-prefixed vardata buffer — not NUL-terminated. + // Build a NUL-terminated copy for regcomp(). + int32_t patBytes = varDataLen(pPatVal->datum.p); + utf8Pat = taosMemoryMalloc(patBytes + 1); + if (utf8Pat == NULL) return terrno; + (void)memcpy(utf8Pat, varDataVal(pPatVal->datum.p), patBytes); + utf8Pat[patBytes] = '\0'; + regPattern = utf8Pat; + freeUtf8Pat = true; } else { regPattern = pPatVal->literal; } diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index b1fd18b658cd..112c6b4d76ed 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1876,7 +1876,8 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar if (tmp == NULL) { taosMemoryFree(patStr); needFreePat = false; - return terrno; + code = terrno; + goto _exit; } patStr = tmp; patStr[patLen] = '\0'; @@ -1884,7 +1885,10 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar patLen = rawPatLen; if (patLen >= (int32_t)sizeof(patBuf)) { patStr = taosMemoryMalloc(patLen + 1); - if (patStr == NULL) return terrno; + if (patStr == NULL) { + code = terrno; + goto _exit; + } needFreePat = true; } (void)memcpy(patStr, rawPat, patLen); From d56a93e4b9d8d81bf22592793306789458e3696f Mon Sep 17 00:00:00 2001 
From: stephenkgu Date: Wed, 22 Apr 2026 02:17:10 -0700 Subject: [PATCH 17/48] use header extra buffer even though its length contains header already --- source/libs/scalar/src/sclfunc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 112c6b4d76ed..4abae062fa76 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1854,6 +1854,7 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar typeGetTypeModFromColInfo(&pInput[2].columnData->info)); } if (groupIdxRaw < 0 || groupIdxRaw > 512) { + pOutput->numOfRows = numOfRows; return TSDB_CODE_FUNC_FUNTION_PARA_VALUE; } int32_t groupIdx = (int32_t)groupIdxRaw; @@ -1867,7 +1868,8 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar char *rawPat = varDataVal(colDataGetData(pPatData, 0)); int32_t rawPatLen = varDataLen(colDataGetData(pPatData, 0)); if (GET_PARAM_TYPE(&pInput[1]) == TSDB_DATA_TYPE_NCHAR) { - SCL_ERR_RET(convNcharToVarchar(rawPat, &patStr, rawPatLen, &patLen, pInput[1].charsetCxt)); + code = convNcharToVarchar(rawPat, &patStr, rawPatLen, &patLen, pInput[1].charsetCxt); + if (code != TSDB_CODE_SUCCESS) goto _exit; needFreePat = true; // convNcharToVarchar allocates rawPatLen bytes (no +1 for NUL); when the // UTF-8 output fills the buffer entirely there is no room for a terminator. @@ -1914,8 +1916,10 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar } (void)memset(pmatch, 0xFF, nmatch * sizeof(regmatch_t)); - // Output buffer: same byte width as the str column - int32_t outBufLen = pStrData->info.bytes; + // Each output cell is a VarData value: VARSTR_HEADER_SIZE length prefix + data. + // Add VARSTR_HEADER_SIZE on top of info.bytes to ensure the header always fits + // regardless of whether the caller's info.bytes already includes it or not. 
+ int32_t outBufLen = pStrData->info.bytes + VARSTR_HEADER_SIZE; char *outBuf = taosMemoryMalloc(outBufLen); if (outBuf == NULL) { taosMemoryFree(pmatch); From 5878f498bc14494e3d4a6e9c21e98c917359e520 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Wed, 22 Apr 2026 02:35:26 -0700 Subject: [PATCH 18/48] Update source/libs/function/src/builtins.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/function/src/builtins.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index 51593b4a9a5a..770b15399abc 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -7554,14 +7554,14 @@ const SBuiltinFuncDefinition funcMgtBuiltins[] = { .inputParaInfo[0][1] = {.isLastParam = false, .startParam = 2, .endParam = 2, - .validDataType = FUNC_PARAM_SUPPORT_VARCHAR_TYPE | FUNC_PARAM_SUPPORT_NCHAR_TYPE, + .validDataType = FUNC_PARAM_SUPPORT_VARCHAR_TYPE | FUNC_PARAM_SUPPORT_NCHAR_TYPE | FUNC_PARAM_SUPPORT_NULL_TYPE, .validNodeType = FUNC_PARAM_SUPPORT_VALUE_NODE, .paramAttribute = FUNC_PARAM_NO_SPECIFIC_ATTRIBUTE, .valueRangeFlag = FUNC_PARAM_NO_SPECIFIC_VALUE,}, .inputParaInfo[0][2] = {.isLastParam = true, .startParam = 3, .endParam = 3, - .validDataType = FUNC_PARAM_SUPPORT_INTEGER_TYPE, + .validDataType = FUNC_PARAM_SUPPORT_INTEGER_TYPE | FUNC_PARAM_SUPPORT_NULL_TYPE, .validNodeType = FUNC_PARAM_SUPPORT_VALUE_NODE, .paramAttribute = FUNC_PARAM_NO_SPECIFIC_ATTRIBUTE, .valueRangeFlag = FUNC_PARAM_NO_SPECIFIC_VALUE,}, From a7b7194f1a2f7fd9bad0c75b9669e21fc5cb4e2b Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Wed, 22 Apr 2026 02:36:16 -0700 Subject: [PATCH 19/48] use macro for maximum of group index instead of hard-coded --- include/libs/scalar/scalar.h | 4 ++++ source/libs/function/src/builtins.c | 2 +- source/libs/scalar/src/sclfunc.c | 2 +- 3 files changed, 6 
insertions(+), 2 deletions(-) diff --git a/include/libs/scalar/scalar.h b/include/libs/scalar/scalar.h index f22355689b11..807351cc7d07 100644 --- a/include/libs/scalar/scalar.h +++ b/include/libs/scalar/scalar.h @@ -133,6 +133,10 @@ int32_t findInSetFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam * int32_t likeInSetFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput); int32_t regexpInSetFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput); int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput); + +// Maximum capture-group index accepted by regexp_extract() — shared between +// translate-time validation (builtins.c) and runtime validation (sclfunc.c). +#define REGEXP_EXTRACT_MAX_GROUP_IDX 512 int32_t generateTotpSecretFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput); int32_t generateTotpCodeFunction(SScalarParam *pInput, int32_t inputNum, SScalarParam *pOutput); diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index 51593b4a9a5a..acee35e61a94 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1192,7 +1192,7 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 // is not yet known; the runtime check in regexpExtractFunction applies instead. 
if (pIdxVal->placeholderNo == 0) { int64_t groupIdx = taosStr2Int64(pIdxVal->literal, NULL, 10); - if (groupIdx < 0 || groupIdx > 512) { + if (groupIdx < 0 || groupIdx > REGEXP_EXTRACT_MAX_GROUP_IDX) { return invaildFuncParaValueErrMsg(pErrBuf, len, "regexp_extract: group_idx must be between 0 and 512"); } } diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 4abae062fa76..0b16fcdb6e1d 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1853,7 +1853,7 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar colDataGetData(pInput[2].columnData, 0), typeGetTypeModFromColInfo(&pInput[2].columnData->info)); } - if (groupIdxRaw < 0 || groupIdxRaw > 512) { + if (groupIdxRaw < 0 || groupIdxRaw > REGEXP_EXTRACT_MAX_GROUP_IDX) { pOutput->numOfRows = numOfRows; return TSDB_CODE_FUNC_FUNTION_PARA_VALUE; } From 872601fa050833a3d6218c2458e36cf0b44a8f93 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Wed, 22 Apr 2026 02:47:51 -0700 Subject: [PATCH 20/48] Update test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../11-Functions/01-Scalar/test_fun_sca_regexp_extract.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py index e634fb819222..5371d82e41af 100644 --- a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py +++ b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py @@ -122,7 +122,12 @@ def _check_basic(self, dbname="db"): tdSql.checkRows(1) tdSql.checkData(0, 0, None) - # RXE-NULL-005: non-participating group in alternation → NULL + # RXE-NULL-005: explicit NULL group_idx → NULL + tdSql.query("SELECT REGEXP_EXTRACT('abc', '(b)', NULL)") + tdSql.checkRows(1) + 
tdSql.checkData(0, 0, None) + + # RXE-NULL-006: non-participating group in alternation → NULL # pattern '(a)|(b)' matches 'b' via group 2; group 1 did not participate tdSql.query("SELECT REGEXP_EXTRACT('b', '(a)|(b)', 1)") tdSql.checkRows(1) From 0189d824a8930e07e4ec49def4838c6732bce74a Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Wed, 22 Apr 2026 02:49:30 -0700 Subject: [PATCH 21/48] Update source/libs/function/src/builtins.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/function/src/builtins.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index ea39c34a7f92..18b9fab85532 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1178,22 +1178,26 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 if (freeUtf8Pat) taosMemoryFree(utf8Pat); } - // param[2]: group_idx (optional) must be a non-negative integer constant + // param[2]: group_idx (optional) must be a non-negative integer constant. + // NULL is also allowed by the builtin signature and should propagate like + // other scalar functions, so skip integer/range validation for NULL values. if (numOfParams == 3) { SNode* pIdxNode = nodesListGetNode(pFunc->pParameterList, 2); if (QUERY_NODE_VALUE != nodeType(pIdxNode)) { return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: group_idx must be a constant integer"); } SValueNode* pIdxVal = (SValueNode*)pIdxNode; - if (!IS_INTEGER_TYPE(pIdxVal->node.resType.type)) { - return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: group_idx must be an integer"); - } - // Skip range validation for prepared-statement placeholders — the bound value - // is not yet known; the runtime check in regexpExtractFunction applies instead. 
- if (pIdxVal->placeholderNo == 0) { - int64_t groupIdx = taosStr2Int64(pIdxVal->literal, NULL, 10); - if (groupIdx < 0 || groupIdx > REGEXP_EXTRACT_MAX_GROUP_IDX) { - return invaildFuncParaValueErrMsg(pErrBuf, len, "regexp_extract: group_idx must be between 0 and 512"); + if (TSDB_DATA_TYPE_NULL != pIdxVal->node.resType.type) { + if (!IS_INTEGER_TYPE(pIdxVal->node.resType.type)) { + return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: group_idx must be an integer"); + } + // Skip range validation for prepared-statement placeholders — the bound value + // is not yet known; the runtime check in regexpExtractFunction applies instead. + if (pIdxVal->placeholderNo == 0) { + int64_t groupIdx = taosStr2Int64(pIdxVal->literal, NULL, 10); + if (groupIdx < 0 || groupIdx > REGEXP_EXTRACT_MAX_GROUP_IDX) { + return invaildFuncParaValueErrMsg(pErrBuf, len, "regexp_extract: group_idx must be between 0 and 512"); + } } } } From 37f599a7bd65684c878101dc762aa4a84a82e861 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Wed, 22 Apr 2026 02:53:47 -0700 Subject: [PATCH 22/48] en & zh doc for this func --- .../14-reference/03-taos-sql/22-function.md | 41 +++++++++++++++++++ .../14-reference/03-taos-sql/22-function.md | 41 +++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/docs/en/14-reference/03-taos-sql/22-function.md b/docs/en/14-reference/03-taos-sql/22-function.md index 335b3b2f6873..a9186c794c8c 100644 --- a/docs/en/14-reference/03-taos-sql/22-function.md +++ b/docs/en/14-reference/03-taos-sql/22-function.md @@ -865,6 +865,47 @@ LTRIM(expr) **Applicable to**: Tables and supertables. +#### REGEXP_EXTRACT + +```sql +REGEXP_EXTRACT(expr, pattern [, group_idx]) +``` + +**Function Description**: Applies the POSIX extended regular expression `pattern` to `expr` and returns the substring matched by capture group `group_idx`. Returns NULL when there is no match or when `expr` or `pattern` is NULL. + +**Return Type**: Same as `expr` (VARCHAR or NCHAR). 
+ +**Applicable Data Types**: `expr`: VARCHAR, NCHAR. `pattern`: VARCHAR, NCHAR. + +**Nested Subquery Support**: Applicable to both inner and outer queries. + +**Applicable to**: Tables and supertables. + +**Usage**: + +- `group_idx` is a non-negative integer constant (default `1`). `0` returns the entire match; `1` returns the first capture group, `2` the second, and so on. The maximum value is 512. +- Returns NULL if `group_idx` exceeds the number of capture groups in `pattern`, or if the addressed group did not participate in the match. +- `pattern` must be a constant expression; it cannot reference a column. + +**Example**: + +```sql +taos> SELECT REGEXP_EXTRACT('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 1); + regexp_extract('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 1) | +======================================================================= + 2026 | + +taos> SELECT REGEXP_EXTRACT('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 0); + regexp_extract('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 0) | +======================================================================= + 2026-04-22 | + +taos> SELECT REGEXP_EXTRACT('no-digits-here', '[0-9]+', 1); + regexp_extract('no-digits-here', '[0-9]+', 1) | +=============================================== + NULL | +``` + #### REGEXP_IN_SET ```sql diff --git a/docs/zh/14-reference/03-taos-sql/22-function.md b/docs/zh/14-reference/03-taos-sql/22-function.md index 83872b2af85a..ebb9a8a257b5 100644 --- a/docs/zh/14-reference/03-taos-sql/22-function.md +++ b/docs/zh/14-reference/03-taos-sql/22-function.md @@ -1044,6 +1044,47 @@ taos> select position('d' in 'cba'); 0 | ``` +#### REGEXP_EXTRACT + +```sql +REGEXP_EXTRACT(expr, pattern [, group_idx]) +``` + +**功能说明**:对 `expr` 应用 POSIX 扩展正则表达式 `pattern`,返回第 `group_idx` 个捕获组匹配的子串。无匹配、`expr` 或 `pattern` 为 NULL 时返回 NULL。 + +**返回结果类型**:与 `expr` 相同(VARCHAR 或 NCHAR)。 + +**适用数据类型**:`expr`:VARCHAR、NCHAR;`pattern`:VARCHAR、NCHAR。 + +**嵌套子查询支持**:适用于内层查询和外层查询。 + +**适用于**:表和超级表。 + +**使用说明**: + +- `group_idx` 为非负整数常量,默认为 `1`。`0` 
返回整个匹配串,`1` 返回第一个捕获组,`2` 返回第二个,以此类推,最大值为 512。 +- 若 `group_idx` 超过 `pattern` 中的捕获组数量,或对应捕获组未参与匹配,返回 NULL。 +- `pattern` 必须为常量表达式,不可引用列。 + +**举例**: + +```sql +taos> SELECT REGEXP_EXTRACT('2026-04-22', '(\d{4})-(\d{2})-(\d{2})', 1); + regexp_extract('2026-04-22', '(\d{4})-(\d{2})-(\d{2})', 1) | +============================================================== + 2026 | + +taos> SELECT REGEXP_EXTRACT('2026-04-22', '(\d{4})-(\d{2})-(\d{2})', 0); + regexp_extract('2026-04-22', '(\d{4})-(\d{2})-(\d{2})', 0) | +============================================================== + 2026-04-22 | + +taos> SELECT REGEXP_EXTRACT('no-digits-here', '\d+', 1); + regexp_extract('no-digits-here', '\d+', 1) | +============================================ + NULL | +``` + #### REGEXP_IN_SET ```sql From 640f58055330907e8ed8ecf375f868764b235753 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Wed, 22 Apr 2026 02:57:14 -0700 Subject: [PATCH 23/48] Update source/libs/function/src/builtins.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/function/src/builtins.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index 18b9fab85532..26addad77ebd 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1180,15 +1180,19 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 // param[2]: group_idx (optional) must be a non-negative integer constant. // NULL is also allowed by the builtin signature and should propagate like - // other scalar functions, so skip integer/range validation for NULL values. + // other scalar functions, so accept NULL-typed value nodes here and rely + // on runtime to return a NULL result. 
if (numOfParams == 3) { SNode* pIdxNode = nodesListGetNode(pFunc->pParameterList, 2); if (QUERY_NODE_VALUE != nodeType(pIdxNode)) { return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: group_idx must be a constant integer"); } + SValueNode* pIdxVal = (SValueNode*)pIdxNode; - if (TSDB_DATA_TYPE_NULL != pIdxVal->node.resType.type) { - if (!IS_INTEGER_TYPE(pIdxVal->node.resType.type)) { + int32_t idxType = pIdxVal->node.resType.type; + + if (TSDB_DATA_TYPE_NULL != idxType) { + if (!IS_INTEGER_TYPE(idxType)) { return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: group_idx must be an integer"); } // Skip range validation for prepared-statement placeholders — the bound value From 816ccb57bc1197cb96ee72b7425c4f85c7ccd0ea Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Wed, 22 Apr 2026 02:59:51 -0700 Subject: [PATCH 24/48] fix undefined behavior with reg lib --- source/libs/scalar/src/sclfunc.c | 5 ++++- source/util/src/tcompare.c | 6 +++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 0b16fcdb6e1d..4521acc3ba62 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1868,6 +1868,7 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar char *rawPat = varDataVal(colDataGetData(pPatData, 0)); int32_t rawPatLen = varDataLen(colDataGetData(pPatData, 0)); if (GET_PARAM_TYPE(&pInput[1]) == TSDB_DATA_TYPE_NCHAR) { + patStr = NULL; // ensure convNcharToVarchar always mallocs a fresh heap buffer code = convNcharToVarchar(rawPat, &patStr, rawPatLen, &patLen, pInput[1].charsetCxt); if (code != TSDB_CODE_SUCCESS) goto _exit; needFreePat = true; @@ -1945,7 +1946,7 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar int32_t strLen = varDataLen(strRaw); // For NCHAR (UCS-4), convert to UTF-8 before matching - char *strUtf8 = strVal; + char *strUtf8 = NULL; // set NULL so 
convNcharToVarchar always mallocs a fresh heap buffer int32_t strUtf8Len = strLen; bool needFreeUtf8 = false; if (isNchar) { @@ -1955,6 +1956,8 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar break; } needFreeUtf8 = true; + } else { + strUtf8 = strVal; } // Grow the null-termination buffer only when the current row needs more space diff --git a/source/util/src/tcompare.c b/source/util/src/tcompare.c index 07328a5fb302..c7d47554d865 100644 --- a/source/util/src/tcompare.c +++ b/source/util/src/tcompare.c @@ -1426,7 +1426,7 @@ int32_t checkRegexPattern(const char *pPattern) { int32_t ret = regcomp(®ex, pPattern, cflags); if (ret != 0) { char msgbuf[256] = {0}; - (void)regerror(ret, ®ex, msgbuf, tListLen(msgbuf)); + (void)regerror(ret, NULL, msgbuf, tListLen(msgbuf)); uError("Failed to compile regex pattern %s. reason %s", pPattern, msgbuf); return TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR; } @@ -1450,7 +1450,7 @@ int32_t getRegComp(const char *pPattern, HashRegexPtr **regexRet) { int32_t ret = regcomp(&pUsingRegex->pRegex, pPattern, cflags); if (ret != 0) { char msgbuf[256] = {0}; - (void)regerror(ret, &pUsingRegex->pRegex, msgbuf, tListLen(msgbuf)); + (void)regerror(ret, NULL, msgbuf, tListLen(msgbuf)); uError("Failed to compile regex pattern %s. reason %s", pPattern, msgbuf); taosMemoryFree(pUsingRegex); return TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR; @@ -1516,7 +1516,7 @@ int32_t threadGetRegComp(regex_t **regex, const char *pPattern) { int32_t ret = regcomp(&gRegex, pPattern, cflags); if (ret != 0) { char msgbuf[256] = {0}; - (void)regerror(ret, &gRegex, msgbuf, tListLen(msgbuf)); + (void)regerror(ret, NULL, msgbuf, tListLen(msgbuf)); uError("Failed to compile regex pattern %s. 
reason %s", pPattern, msgbuf); taosMemoryFree(pOldPattern); pOldPattern = NULL; From 984b0533e4b2f04240648cf5f0721203fb574d8a Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Wed, 22 Apr 2026 19:31:17 -0700 Subject: [PATCH 25/48] move character converting buffer out of loop to dismiss allocating each row --- source/libs/function/src/builtins.c | 2 +- source/libs/scalar/src/sclfunc.c | 71 ++++++++++++++++------------- 2 files changed, 40 insertions(+), 33 deletions(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index 26addad77ebd..3dfb03fa4210 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1167,7 +1167,7 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 int ret = regcomp(&re, regPattern, REG_EXTENDED); if (ret != 0) { char msgbuf[256] = {0}; - (void)regerror(ret, &re, msgbuf, sizeof(msgbuf)); + (void)regerror(ret, NULL, msgbuf, sizeof(msgbuf)); // do not call regfree — regcomp failed, re contents are undefined (POSIX) if (freeUtf8Pat) taosMemoryFree(utf8Pat); return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR, diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 4521acc3ba62..88f0829ba987 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1847,8 +1847,14 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar // Get group_idx (default 1; param[2] is an optional integer constant). // Read into int64_t first to avoid silent truncation/wrap for BIGINT/UBIGINT // placeholder values before the range check, then cast after validation. + // An explicit SQL NULL group_idx propagates NULL to all output rows. 
int64_t groupIdxRaw = 1; - if (inputNum == 3 && !IS_NULL_TYPE(GET_PARAM_TYPE(&pInput[2])) && !colDataIsNull_s(pInput[2].columnData, 0)) { + if (inputNum == 3) { + if (IS_NULL_TYPE(GET_PARAM_TYPE(&pInput[2])) || colDataIsNull_s(pInput[2].columnData, 0)) { + colDataSetNNULL(pOutputData, 0, numOfRows); + pOutput->numOfRows = numOfRows; + return TSDB_CODE_SUCCESS; + } GET_TYPED_DATA(groupIdxRaw, int64_t, GET_PARAM_TYPE(&pInput[2]), colDataGetData(pInput[2].columnData, 0), typeGetTypeModFromColInfo(&pInput[2].columnData->info)); @@ -1860,7 +1866,7 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar int32_t groupIdx = (int32_t)groupIdxRaw; // Build null-terminated UTF-8 pattern string (pattern is a constant, always 1 row) - char patBuf[512]; + char patBuf[REGEXP_EXTRACT_MAX_GROUP_IDX]; char *patStr = patBuf; int32_t patLen = 0; bool needFreePat = false; @@ -1945,33 +1951,34 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar char *strVal = varDataVal(strRaw); int32_t strLen = varDataLen(strRaw); - // For NCHAR (UCS-4), convert to UTF-8 before matching - char *strUtf8 = NULL; // set NULL so convNcharToVarchar always mallocs a fresh heap buffer - int32_t strUtf8Len = strLen; - bool needFreeUtf8 = false; - if (isNchar) { - code = convNcharToVarchar(strVal, &strUtf8, strLen, &strUtf8Len, pInput[0].charsetCxt); - if (code != TSDB_CODE_SUCCESS) { - terrno = code; + // Grow the null-termination buffer only when the current row needs more space. + // For NCHAR: UTF-8 output is at most strLen bytes (UCS-4 byte count >= UTF-8 byte count), + // so strLen + 1 is a safe upper bound for both NCHAR and VARCHAR paths. 
+ if (strLen + 1 > strNtCap) { + char *tmp = taosMemoryRealloc(strNt, strLen + 1); + if (tmp == NULL) { + code = terrno; break; } - needFreeUtf8 = true; - } else { - strUtf8 = strVal; + strNt = tmp; + strNtCap = strLen + 1; } - // Grow the null-termination buffer only when the current row needs more space - if (strUtf8Len + 1 > strNtCap) { - char *tmp = taosMemoryRealloc(strNt, strUtf8Len + 1); - if (tmp == NULL) { - if (needFreeUtf8) taosMemoryFree(strUtf8); - code = terrno; + // Convert input into the NUL-terminated UTF-8 scratch buffer. + // For NCHAR: convert UCS-4 directly into strNt — avoids per-row malloc/free. + // For VARCHAR: data is already UTF-8, just copy it. + int32_t strUtf8Len; + if (isNchar) { + strUtf8Len = taosUcs4ToMbs((TdUcs4 *)strVal, strLen, strNt, pInput[0].charsetCxt); + if (strUtf8Len < 0) { + code = TSDB_CODE_SCALAR_CONVERT_ERROR; + terrno = code; break; } - strNt = tmp; - strNtCap = strUtf8Len + 1; + } else { + (void)memcpy(strNt, strVal, strLen); + strUtf8Len = strLen; } - (void)memcpy(strNt, strUtf8, strUtf8Len); strNt[strUtf8Len] = '\0'; int ret = regexec(regex, strNt, nmatch, pmatch, 0); @@ -1982,26 +1989,27 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar // real regex execution error (e.g. REG_ESPACE) code = TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR; terrno = code; - if (needFreeUtf8) taosMemoryFree(strUtf8); break; } else { int32_t matchStart = pmatch[groupIdx].rm_so; int32_t matchLen = pmatch[groupIdx].rm_eo - pmatch[groupIdx].rm_so; if (isNchar) { - // Convert matched UTF-8 bytes back to NCHAR (UCS-4) - char *matchedNchar = NULL; + // Convert matched UTF-8 bytes back to NCHAR (UCS-4) directly into outBuf + // to avoid a per-row malloc/free cycle. + // outBuf data capacity (outBufLen - VARSTR_HEADER_SIZE) >= N*TSDB_NCHAR_SIZE + // which is always >= matchedCodepoints*TSDB_NCHAR_SIZE. 
int32_t matchedNcharLen = 0; - code = convVarcharToNchar(strNt + matchStart, &matchedNchar, matchLen, &matchedNcharLen, - pInput[0].charsetCxt); - if (code != TSDB_CODE_SUCCESS) { + bool ok = taosMbsToUcs4(strNt + matchStart, matchLen, + (TdUcs4 *)(outBuf + VARSTR_HEADER_SIZE), + outBufLen - VARSTR_HEADER_SIZE, + &matchedNcharLen, pInput[0].charsetCxt); + if (!ok) { + code = TSDB_CODE_SCALAR_CONVERT_ERROR; terrno = code; - if (needFreeUtf8) taosMemoryFree(strUtf8); break; } *(VarDataLenT *)outBuf = matchedNcharLen; - (void)memcpy(outBuf + VARSTR_HEADER_SIZE, matchedNchar, matchedNcharLen); - taosMemoryFree(matchedNchar); code = colDataSetVal(pOutputData, i, outBuf, false); if (code != TSDB_CODE_SUCCESS) terrno = code; } else { @@ -2012,7 +2020,6 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar } } - if (needFreeUtf8) taosMemoryFree(strUtf8); if (code != TSDB_CODE_SUCCESS) break; } From ceeb5cb33a237295e9c0eac4cb0e5f7ace84f87c Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Wed, 22 Apr 2026 19:43:30 -0700 Subject: [PATCH 26/48] copy result data to dismiss dangling pointers --- source/libs/scalar/src/sclfunc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 88f0829ba987..c1a662064eef 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1866,7 +1866,7 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar int32_t groupIdx = (int32_t)groupIdxRaw; // Build null-terminated UTF-8 pattern string (pattern is a constant, always 1 row) - char patBuf[REGEXP_EXTRACT_MAX_GROUP_IDX]; + char patBuf[512]; char *patStr = patBuf; int32_t patLen = 0; bool needFreePat = false; @@ -2010,12 +2010,12 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar break; } *(VarDataLenT *)outBuf = matchedNcharLen; - code = colDataSetVal(pOutputData, i, outBuf, false); + code 
= colDataSetVal(pOutputData, i, outBuf, true); if (code != TSDB_CODE_SUCCESS) terrno = code; } else { *(VarDataLenT *)outBuf = matchLen; (void)memcpy(outBuf + VARSTR_HEADER_SIZE, strNt + matchStart, matchLen); - code = colDataSetVal(pOutputData, i, outBuf, false); + code = colDataSetVal(pOutputData, i, outBuf, true); if (code != TSDB_CODE_SUCCESS) terrno = code; } } From 21e6d42132fa1307f0eab683483f230bebd35532 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Wed, 22 Apr 2026 20:00:06 -0700 Subject: [PATCH 27/48] revert set data value with true, the 4th args is isNull, not isCopy --- source/libs/scalar/src/sclfunc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index c1a662064eef..dc77f9f24bc5 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -2010,12 +2010,12 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar break; } *(VarDataLenT *)outBuf = matchedNcharLen; - code = colDataSetVal(pOutputData, i, outBuf, true); + code = colDataSetVal(pOutputData, i, outBuf, false); if (code != TSDB_CODE_SUCCESS) terrno = code; } else { *(VarDataLenT *)outBuf = matchLen; (void)memcpy(outBuf + VARSTR_HEADER_SIZE, strNt + matchStart, matchLen); - code = colDataSetVal(pOutputData, i, outBuf, true); + code = colDataSetVal(pOutputData, i, outBuf, false); if (code != TSDB_CODE_SUCCESS) terrno = code; } } From 7a86c2948e328320abdad5c83381b1f536d1ca93 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Wed, 22 Apr 2026 20:17:10 -0700 Subject: [PATCH 28/48] fix zh, en doc examples and validate with test cases --- .../14-reference/03-taos-sql/22-function.md | 28 +++++++++---------- .../14-reference/03-taos-sql/22-function.md | 28 +++++++++---------- .../01-Scalar/test_fun_sca_regexp_extract.py | 21 ++++++++++++++ 3 files changed, 49 insertions(+), 28 deletions(-) diff --git a/docs/en/14-reference/03-taos-sql/22-function.md 
b/docs/en/14-reference/03-taos-sql/22-function.md index a9186c794c8c..cb7f9b5c401b 100644 --- a/docs/en/14-reference/03-taos-sql/22-function.md +++ b/docs/en/14-reference/03-taos-sql/22-function.md @@ -890,20 +890,20 @@ REGEXP_EXTRACT(expr, pattern [, group_idx]) **Example**: ```sql -taos> SELECT REGEXP_EXTRACT('2026-04-22', '(\d{4})-(\d{2})-(\d{2})', 1); - regexp_extract('2026-04-22', '(\d{4})-(\d{2})-(\d{2})', 1) | -============================================================== - 2026 | - -taos> SELECT REGEXP_EXTRACT('2026-04-22', '(\d{4})-(\d{2})-(\d{2})', 0); - regexp_extract('2026-04-22', '(\d{4})-(\d{2})-(\d{2})', 0) | -============================================================== - 2026-04-22 | - -taos> SELECT REGEXP_EXTRACT('no-digits-here', '\d+', 1); - regexp_extract('no-digits-here', '\d+', 1) | -============================================ - NULL | +taos> SELECT REGEXP_EXTRACT('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 1); + regexp_extract('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 1) | +======================================================================= + 2026 | + +taos> SELECT REGEXP_EXTRACT('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 0); + regexp_extract('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 0) | +======================================================================= + 2026-04-22 | + +taos> SELECT REGEXP_EXTRACT('no-digits-here', '[0-9]+', 1); + regexp_extract('no-digits-here', '[0-9]+', 1) | +=============================================== + NULL | ``` #### REGEXP_IN_SET diff --git a/docs/zh/14-reference/03-taos-sql/22-function.md b/docs/zh/14-reference/03-taos-sql/22-function.md index ebb9a8a257b5..5b6f3e0fcf34 100644 --- a/docs/zh/14-reference/03-taos-sql/22-function.md +++ b/docs/zh/14-reference/03-taos-sql/22-function.md @@ -1069,20 +1069,20 @@ REGEXP_EXTRACT(expr, pattern [, group_idx]) **举例**: ```sql -taos> SELECT REGEXP_EXTRACT('2026-04-22', '(\d{4})-(\d{2})-(\d{2})', 1); - regexp_extract('2026-04-22', 
'(\d{4})-(\d{2})-(\d{2})', 1) | -============================================================== - 2026 | - -taos> SELECT REGEXP_EXTRACT('2026-04-22', '(\d{4})-(\d{2})-(\d{2})', 0); - regexp_extract('2026-04-22', '(\d{4})-(\d{2})-(\d{2})', 0) | -============================================================== - 2026-04-22 | - -taos> SELECT REGEXP_EXTRACT('no-digits-here', '\d+', 1); - regexp_extract('no-digits-here', '\d+', 1) | -============================================ - NULL | +taos> SELECT REGEXP_EXTRACT('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 1); + regexp_extract('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 1) | +======================================================================= + 2026 | + +taos> SELECT REGEXP_EXTRACT('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 0); + regexp_extract('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 0) | +======================================================================= + 2026-04-22 | + +taos> SELECT REGEXP_EXTRACT('no-digits-here', '[0-9]+', 1); + regexp_extract('no-digits-here', '[0-9]+', 1) | +=============================================== + NULL | ``` #### REGEXP_IN_SET diff --git a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py index 5371d82e41af..972ff7e39bb4 100644 --- a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py +++ b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py @@ -269,9 +269,29 @@ def _check_error(self, dbname="db"): # RXE-ERR-006: invalid regex (unmatched parenthesis) tdSql.error("SELECT REGEXP_EXTRACT('abc', '(b', 1)") + def _check_doc_examples(self): + # ----------------------------------------------------------------- + # §12 Doc examples — verify the three queries from the user manual + # ----------------------------------------------------------------- + # RXE-DOC-001: date string, group 1 → year + tdSql.query("SELECT REGEXP_EXTRACT('2026-04-22', 
'([0-9]{4})-([0-9]{2})-([0-9]{2})', 1)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, '2026') + + # RXE-DOC-002: date string, group 0 → whole match + tdSql.query("SELECT REGEXP_EXTRACT('2026-04-22', '([0-9]{4})-([0-9]{2})-([0-9]{2})', 0)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, '2026-04-22') + + # RXE-DOC-003: no match → NULL + tdSql.query("SELECT REGEXP_EXTRACT('no-digits-here', '[0-9]+', 1)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, None) + def all_test(self, dbname="db"): self._check_basic(dbname) self._check_error(dbname) + self._check_doc_examples() def test_fun_sca_regexp_extract(self): """Fun: regexp_extract() @@ -287,6 +307,7 @@ def test_fun_sca_regexp_extract(self): 9. regexp_extract in subquery with GROUP BY 10. regexp_extract POSIX ERE features: character class, anchors, case sensitivity 11. regexp_extract invalid parameter error cases + 12. regexp_extract user-manual doc examples Since: v3.4.2.0 From 9ccae4017db6e7b3d3507fb75a6c13ae99b6cf1b Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Wed, 22 Apr 2026 20:28:25 -0700 Subject: [PATCH 29/48] Update source/libs/function/src/builtins.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/function/src/builtins.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index 3dfb03fa4210..c0fac477536a 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1200,7 +1200,8 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 if (pIdxVal->placeholderNo == 0) { int64_t groupIdx = taosStr2Int64(pIdxVal->literal, NULL, 10); if (groupIdx < 0 || groupIdx > REGEXP_EXTRACT_MAX_GROUP_IDX) { - return invaildFuncParaValueErrMsg(pErrBuf, len, "regexp_extract: group_idx must be between 0 and 512"); + return invaildFuncParaValueErrMsg( + pErrBuf, len, "regexp_extract: group_idx must 
be between 0 and %d", REGEXP_EXTRACT_MAX_GROUP_IDX); } } } From 7c471c36e0e7ba7eee84a79331f86c53447c1ff1 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Wed, 22 Apr 2026 21:47:48 -0700 Subject: [PATCH 30/48] pre-formats error message to fix compilation --- source/libs/function/src/builtins.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index c0fac477536a..1480cf19a903 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1200,8 +1200,11 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 if (pIdxVal->placeholderNo == 0) { int64_t groupIdx = taosStr2Int64(pIdxVal->literal, NULL, 10); if (groupIdx < 0 || groupIdx > REGEXP_EXTRACT_MAX_GROUP_IDX) { - return invaildFuncParaValueErrMsg( - pErrBuf, len, "regexp_extract: group_idx must be between 0 and %d", REGEXP_EXTRACT_MAX_GROUP_IDX); + char errmsg[64]; + (void)snprintf(errmsg, sizeof(errmsg), + "regexp_extract: group_idx must be between 0 and %d", + REGEXP_EXTRACT_MAX_GROUP_IDX); + return invaildFuncParaValueErrMsg(pErrBuf, len, errmsg); } } } From 5670d20b654d580581c550a3221cb17dc9c4d48f Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Thu, 23 Apr 2026 00:52:45 -0700 Subject: [PATCH 31/48] remove inaccurate comments --- source/libs/scalar/src/sclfunc.c | 2 -- .../cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index dc77f9f24bc5..be3f06ee4c3a 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1830,14 +1830,12 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar return TSDB_CODE_SUCCESS; } - // NULL-type str: all output rows are NULL if (IS_NULL_TYPE(GET_PARAM_TYPE(&pInput[0])) || IS_NULL_TYPE(GET_PARAM_TYPE(&pInput[1]))) { 
colDataSetNNULL(pOutputData, 0, numOfRows); pOutput->numOfRows = numOfRows; return TSDB_CODE_SUCCESS; } - // NULL pattern: all output rows are NULL if (colDataIsNull_s(pPatData, 0)) { colDataSetNNULL(pOutputData, 0, numOfRows); pOutput->numOfRows = numOfRows; diff --git a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py index 972ff7e39bb4..d23ba8cea15d 100644 --- a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py +++ b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py @@ -133,7 +133,7 @@ def _check_basic(self, dbname="db"): tdSql.checkRows(1) tdSql.checkData(0, 0, None) - # participating group 2 → 'b' + # RXE-NULL-007: participating group 2 returns matched content tdSql.query("SELECT REGEXP_EXTRACT('b', '(a)|(b)', 2)") tdSql.checkRows(1) tdSql.checkData(0, 0, 'b') From 388cfe0742961bc2f9f1e5a869ab8ff014f04b31 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Thu, 23 Apr 2026 01:03:48 -0700 Subject: [PATCH 32/48] Update docs/zh/14-reference/03-taos-sql/22-function.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../14-reference/03-taos-sql/22-function.md | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/docs/zh/14-reference/03-taos-sql/22-function.md b/docs/zh/14-reference/03-taos-sql/22-function.md index 5b6f3e0fcf34..2b8176fd7183 100644 --- a/docs/zh/14-reference/03-taos-sql/22-function.md +++ b/docs/zh/14-reference/03-taos-sql/22-function.md @@ -1062,7 +1062,7 @@ REGEXP_EXTRACT(expr, pattern [, group_idx]) **使用说明**: -- `group_idx` 为非负整数常量,默认为 `1`。`0` 返回整个匹配串,`1` 返回第一个捕获组,`2` 返回第二个,以此类推,最大值为 512。 +- `group_idx` 通常为非负整数常量,默认为 `1`。`0` 返回整个匹配串,`1` 返回第一个捕获组,`2` 返回第二个,以此类推,最大值为 512。若 `group_idx` 为 SQL `NULL`,则返回 `NULL`。 - 若 `group_idx` 超过 `pattern` 中的捕获组数量,或对应捕获组未参与匹配,返回 NULL。 - `pattern` 必须为常量表达式,不可引用列。 @@ -2410,11 +2410,11 @@ LAG(expr, offset[, 
default_val]) **使用说明**: - `offset` 必须为大于 0 的整数。 -- `default_val` 可选;当目标行不存在时返回该值,未指定时返回 `NULL`。 -- `default_val` 需要与 `expr` 类型兼容。 -- `LAG` 按输入结果集的行序计算;可以结合 `ORDER BY` 改变计算顺序。 -- 支持与 `_rowts`、`tbname`、标签列等一起查询,也支持在子查询和 `PARTITION BY` 场景中使用。 -- 与窗口一起使用时,`LAG` 仅在当前窗口内部按窗口内结果顺序计算,不会跨窗口继承上一窗口的状态。 +- `default_val` 可选;当目标行不存在时返回该值,未指定时返回 `NULL`。 +- `default_val` 需要与 `expr` 类型兼容。 +- `LAG` 按输入结果集的行序计算;可以结合 `ORDER BY` 改变计算顺序。 +- 支持与 `_rowts`、`tbname`、标签列等一起查询,也支持在子查询和 `PARTITION BY` 场景中使用。 +- 与窗口一起使用时,`LAG` 仅在当前窗口内部按窗口内结果顺序计算,不会跨窗口继承上一窗口的状态。 #### LEAD @@ -2433,11 +2433,11 @@ LEAD(expr, offset[, default_val]) **使用说明**: - `offset` 必须为大于 0 的整数。 -- `default_val` 可选;当目标行不存在时返回该值,未指定时返回 `NULL`。 -- `default_val` 需要与 `expr` 类型兼容。 -- `LEAD` 按输入结果集的行序计算;可以结合 `ORDER BY` 改变计算顺序。 -- 支持与 `_rowts`、`tbname`、标签列等一起查询,也支持在子查询和 `PARTITION BY` 场景中使用。 -- 与窗口一起使用时,`LEAD` 仅在当前窗口内部按窗口内结果顺序计算,不会跨窗口读取下一窗口的数据。 +- `default_val` 可选;当目标行不存在时返回该值,未指定时返回 `NULL`。 +- `default_val` 需要与 `expr` 类型兼容。 +- `LEAD` 按输入结果集的行序计算;可以结合 `ORDER BY` 改变计算顺序。 +- 支持与 `_rowts`、`tbname`、标签列等一起查询,也支持在子查询和 `PARTITION BY` 场景中使用。 +- 与窗口一起使用时,`LEAD` 仅在当前窗口内部按窗口内结果顺序计算,不会跨窗口读取下一窗口的数据。 #### MAX @@ -3196,11 +3196,11 @@ MAVG(expr, k) **适用于**:表和超级表。 -**使用说明**: - -- 不支持 +、-、*、/ 运算,如 mavg(col1, k1) + mavg(col2, k1); -- 只能与普通列,选择(Selection)、投影(Projection)函数一起使用,不能与聚合(Aggregation)函数一起使用; -- 与窗口一起使用时,`MAVG` 仅在当前窗口内部按样本顺序计算,不会跨窗口延续上一窗口的样本状态。 +**使用说明**: + +- 不支持 +、-、*、/ 运算,如 mavg(col1, k1) + mavg(col2, k1); +- 只能与普通列,选择(Selection)、投影(Projection)函数一起使用,不能与聚合(Aggregation)函数一起使用; +- 与窗口一起使用时,`MAVG` 仅在当前窗口内部按样本顺序计算,不会跨窗口延续上一窗口的样本状态。 #### STATECOUNT @@ -3223,9 +3223,9 @@ STATECOUNT(expr, oper, val) **适用于**:表和超级表。 -**使用说明**: - -- 与窗口一起使用时,`STATECOUNT` 仅统计当前窗口内部的连续记录,不会跨窗口累计。 +**使用说明**: + +- 与窗口一起使用时,`STATECOUNT` 仅统计当前窗口内部的连续记录,不会跨窗口累计。 #### STATEDURATION @@ -3249,9 +3249,9 @@ STATEDURATION(expr, oper, val, unit) **适用于**:表和超级表。 -**使用说明**: - -- 与窗口一起使用时,`STATEDURATION` 仅统计当前窗口内部满足条件的连续时长,不会跨窗口累计。 +**使用说明**: + +- 与窗口一起使用时,`STATEDURATION` 
仅统计当前窗口内部满足条件的连续时长,不会跨窗口累计。 ### 时间加权统计 From 7203c84155040c2329fbbe04fe0ea932dbea9526 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Thu, 23 Apr 2026 01:04:06 -0700 Subject: [PATCH 33/48] Update docs/en/14-reference/03-taos-sql/22-function.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/en/14-reference/03-taos-sql/22-function.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/en/14-reference/03-taos-sql/22-function.md b/docs/en/14-reference/03-taos-sql/22-function.md index cb7f9b5c401b..2a7b30a5ef0c 100644 --- a/docs/en/14-reference/03-taos-sql/22-function.md +++ b/docs/en/14-reference/03-taos-sql/22-function.md @@ -883,7 +883,9 @@ REGEXP_EXTRACT(expr, pattern [, group_idx]) **Usage**: -- `group_idx` is a non-negative integer constant (default `1`). `0` returns the entire match; `1` returns the first capture group, `2` the second, and so on. The maximum value is 512. +- If omitted, `group_idx` defaults to `1`. +- If provided as a non-`NULL` value, `group_idx` must be a non-negative integer constant. `0` returns the entire match; `1` returns the first capture group, `2` the second, and so on. The maximum value is 512. +- If `group_idx` is SQL `NULL`, the function returns `NULL`. - Returns NULL if `group_idx` exceeds the number of capture groups in `pattern`, or if the addressed group did not participate in the match. - `pattern` must be a constant expression; it cannot reference a column. 
From 068a98c416fc4087809b2a7f7eb2e8bc27c15d4a Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Thu, 23 Apr 2026 01:04:52 -0700 Subject: [PATCH 34/48] Update source/libs/scalar/src/sclfunc.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/scalar/src/sclfunc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index be3f06ee4c3a..92b9776b06a0 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1921,10 +1921,9 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar } (void)memset(pmatch, 0xFF, nmatch * sizeof(regmatch_t)); - // Each output cell is a VarData value: VARSTR_HEADER_SIZE length prefix + data. - // Add VARSTR_HEADER_SIZE on top of info.bytes to ensure the header always fits - // regardless of whether the caller's info.bytes already includes it or not. - int32_t outBufLen = pStrData->info.bytes + VARSTR_HEADER_SIZE; + // Each output cell is a VarData value, and for var-length types info.bytes + // already includes the VARSTR_HEADER_SIZE length prefix plus payload space. 
+ int32_t outBufLen = pStrData->info.bytes; char *outBuf = taosMemoryMalloc(outBufLen); if (outBuf == NULL) { taosMemoryFree(pmatch); From 9652c1ea3fd4eef6ce36781fa25c1c10258a3b97 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Thu, 23 Apr 2026 01:33:49 -0700 Subject: [PATCH 35/48] new test case for big group index --- .../11-Functions/01-Scalar/test_fun_sca_regexp_extract.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py index d23ba8cea15d..e99646730b08 100644 --- a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py +++ b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py @@ -269,6 +269,9 @@ def _check_error(self, dbname="db"): # RXE-ERR-006: invalid regex (unmatched parenthesis) tdSql.error("SELECT REGEXP_EXTRACT('abc', '(b', 1)") + # RXE-ERR-007: group_idx exceeds maximum (512) + tdSql.error("SELECT REGEXP_EXTRACT('abc', '(b)', 513)") + def _check_doc_examples(self): # ----------------------------------------------------------------- # §12 Doc examples — verify the three queries from the user manual @@ -306,7 +309,7 @@ def test_fun_sca_regexp_extract(self): 8. regexp_extract on NCHAR column (return type NCHAR) 9. regexp_extract in subquery with GROUP BY 10. regexp_extract POSIX ERE features: character class, anchors, case sensitivity - 11. regexp_extract invalid parameter error cases + 11. regexp_extract invalid parameter error cases (including group_idx > 512) 12. 
regexp_extract user-manual doc examples Since: v3.4.2.0 From 4e13f1dd3798904c27913b9deeab3d64c2e06e06 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Thu, 23 Apr 2026 02:18:27 -0700 Subject: [PATCH 36/48] new test case for null pattern --- .../11-Functions/01-Scalar/test_fun_sca_regexp_extract.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py index e99646730b08..cf690e22d9b9 100644 --- a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py +++ b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py @@ -138,6 +138,11 @@ def _check_basic(self, dbname="db"): tdSql.checkRows(1) tdSql.checkData(0, 0, 'b') + # RXE-NULL-008: pattern=NULL → NULL + tdSql.query("SELECT REGEXP_EXTRACT('abc', NULL)") + tdSql.checkRows(1) + tdSql.checkData(0, 0, None) + # ----------------------------------------------------------------- # §5 Empty string scenarios # ----------------------------------------------------------------- @@ -302,7 +307,7 @@ def test_fun_sca_regexp_extract(self): 1. regexp_extract default group_idx=1 returns first capture group 2. regexp_extract group_idx=0 returns whole match substring 3. regexp_extract with explicit group index (1, 2, out-of-range) - 4. regexp_extract NULL input and no-match return NULL + 4. regexp_extract NULL input (str, pattern, group_idx) and no-match return NULL 5. regexp_extract capture group matching empty string returns '' 6. regexp_extract on table columns with per-row scalar semantics 7. 
regexp_extract in WHERE clause for row filtering From 7b2b79d496afeb6d464b3b2bbbabf1708fbaa734 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Thu, 23 Apr 2026 02:32:06 -0700 Subject: [PATCH 37/48] Update source/libs/scalar/src/sclfunc.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/scalar/src/sclfunc.c | 36 +++++++++++++++++++------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 92b9776b06a0..81ddd25db8db 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1872,22 +1872,28 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar char *rawPat = varDataVal(colDataGetData(pPatData, 0)); int32_t rawPatLen = varDataLen(colDataGetData(pPatData, 0)); if (GET_PARAM_TYPE(&pInput[1]) == TSDB_DATA_TYPE_NCHAR) { - patStr = NULL; // ensure convNcharToVarchar always mallocs a fresh heap buffer - code = convNcharToVarchar(rawPat, &patStr, rawPatLen, &patLen, pInput[1].charsetCxt); - if (code != TSDB_CODE_SUCCESS) goto _exit; - needFreePat = true; - // convNcharToVarchar allocates rawPatLen bytes (no +1 for NUL); when the - // UTF-8 output fills the buffer entirely there is no room for a terminator. - // threadGetRegComp requires a NUL-terminated string — grow by one byte. 
- char *tmp = taosMemoryRealloc(patStr, patLen + 1); - if (tmp == NULL) { - taosMemoryFree(patStr); - needFreePat = false; - code = terrno; - goto _exit; + if (rawPatLen == 0) { + patLen = 0; + patStr = patBuf; + patStr[0] = '\0'; + } else { + patStr = NULL; // ensure convNcharToVarchar always mallocs a fresh heap buffer + code = convNcharToVarchar(rawPat, &patStr, rawPatLen, &patLen, pInput[1].charsetCxt); + if (code != TSDB_CODE_SUCCESS) goto _exit; + needFreePat = true; + // convNcharToVarchar allocates rawPatLen bytes (no +1 for NUL); when the + // UTF-8 output fills the buffer entirely there is no room for a terminator. + // threadGetRegComp requires a NUL-terminated string — grow by one byte. + char *tmp = taosMemoryRealloc(patStr, patLen + 1); + if (tmp == NULL) { + taosMemoryFree(patStr); + needFreePat = false; + code = terrno; + goto _exit; + } + patStr = tmp; + patStr[patLen] = '\0'; } - patStr = tmp; - patStr[patLen] = '\0'; } else { patLen = rawPatLen; if (patLen >= (int32_t)sizeof(patBuf)) { From 663e85bc6c69e6211dde7db2895d104cca764609 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Thu, 23 Apr 2026 02:44:33 -0700 Subject: [PATCH 38/48] Update docs/en/14-reference/03-taos-sql/22-function.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/en/14-reference/03-taos-sql/22-function.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/14-reference/03-taos-sql/22-function.md b/docs/en/14-reference/03-taos-sql/22-function.md index 2a7b30a5ef0c..0cd239d03ade 100644 --- a/docs/en/14-reference/03-taos-sql/22-function.md +++ b/docs/en/14-reference/03-taos-sql/22-function.md @@ -887,7 +887,7 @@ REGEXP_EXTRACT(expr, pattern [, group_idx]) - If provided as a non-`NULL` value, `group_idx` must be a non-negative integer constant. `0` returns the entire match; `1` returns the first capture group, `2` the second, and so on. The maximum value is 512. 
- If `group_idx` is SQL `NULL`, the function returns `NULL`. - Returns NULL if `group_idx` exceeds the number of capture groups in `pattern`, or if the addressed group did not participate in the match. -- `pattern` must be a constant expression; it cannot reference a column. +- `pattern` must be provided as a constant literal or parameter placeholder; it cannot reference a column or be computed from other expressions. **Example**: From d0728c92a67b1d1b96690f42b0f016c95323542a Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Thu, 23 Apr 2026 02:45:51 -0700 Subject: [PATCH 39/48] Update docs/zh/14-reference/03-taos-sql/22-function.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/zh/14-reference/03-taos-sql/22-function.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/14-reference/03-taos-sql/22-function.md b/docs/zh/14-reference/03-taos-sql/22-function.md index 2b8176fd7183..1fd1ccb7dc5d 100644 --- a/docs/zh/14-reference/03-taos-sql/22-function.md +++ b/docs/zh/14-reference/03-taos-sql/22-function.md @@ -1064,7 +1064,7 @@ REGEXP_EXTRACT(expr, pattern [, group_idx]) - `group_idx` 通常为非负整数常量,默认为 `1`。`0` 返回整个匹配串,`1` 返回第一个捕获组,`2` 返回第二个,以此类推,最大值为 512。若 `group_idx` 为 SQL `NULL`,则返回 `NULL`。 - 若 `group_idx` 超过 `pattern` 中的捕获组数量,或对应捕获组未参与匹配,返回 NULL。 -- `pattern` 必须为常量表达式,不可引用列。 +- `pattern` 必须为常量(字面量或预处理占位符),不可引用列;不支持 `concat('a','b')` 这类常量表达式。 **举例**: From be87866dcaf24bbc5e73f96b463c4026e6efd466 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Thu, 23 Apr 2026 02:47:09 -0700 Subject: [PATCH 40/48] Update test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py 
b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py index cf690e22d9b9..1616b40403df 100644 --- a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py +++ b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py @@ -5,6 +5,7 @@ class TestFunRegexpExtract: def setup_class(cls): + cls.replicaVar = 1 tdLog.debug(f"start to execute {__file__}") # ------------------------------------------------------------------ From 193997ac3c21dec486279cae0e1ecab00fd3d964 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Thu, 23 Apr 2026 02:48:23 -0700 Subject: [PATCH 41/48] Update source/libs/function/src/builtins.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/function/src/builtins.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index 1480cf19a903..ea03924f8e28 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1113,10 +1113,13 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 FUNC_ERR_RET(validateParam(pFunc, pErrBuf, len)); int32_t numOfParams = LIST_LENGTH(pFunc->pParameterList); - // param[1]: pattern must be a constant VALUE node + // param[1]: pattern must be a literal/parameter constant VALUE node. + // Constant expressions are not accepted here because regexp_extract + // currently validates only VALUE nodes. SNode* pPatNode = nodesListGetNode(pFunc->pParameterList, 1); if (QUERY_NODE_VALUE != nodeType(pPatNode)) { - return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: pattern must be a constant"); + return invaildFuncParaTypeErrMsg( + pErrBuf, len, "regexp_extract: pattern must be a literal or parameter constant"); } // Validate the regex pattern compiles as POSIX ERE. 
From b34c9a1c936832b6b2cfdd589a0d5b1dc8293b4c Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Thu, 23 Apr 2026 02:53:14 -0700 Subject: [PATCH 42/48] Update test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../01-Scalar/test_fun_sca_regexp_extract.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py index 1616b40403df..5eed7e1e4f7c 100644 --- a/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py +++ b/test/cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py @@ -327,18 +327,19 @@ def test_fun_sca_regexp_extract(self): History: - 2026-04-20 Stephen Created """ + dbname = "db" tdSql.prepare() tdLog.printNoPrefix("==========step1:create table") - self._create_tb() + self._create_tb(dbname) tdLog.printNoPrefix("==========step2:insert data") - self._insert_data() + self._insert_data(dbname) tdLog.printNoPrefix("==========step3:all check") - self.all_test() + self.all_test(dbname) - tdSql.execute("flush database db") + tdSql.execute(f"flush database {dbname}") tdLog.printNoPrefix("==========step4:after wal, all check again") - self.all_test() + self.all_test(dbname) From 6e4743f0a26dfdfe6577f5bf74e28a25ef7ec866 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Thu, 23 Apr 2026 02:59:43 -0700 Subject: [PATCH 43/48] log reg exec error message to make production debugging actionable --- source/libs/scalar/src/sclfunc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 81ddd25db8db..700f2f23fcf3 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1989,7 +1989,10 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum,
SScalarPar // no match, or the requested capture group did not participate colDataSetNULL(pOutputData, i); } else if (ret != 0) { - // real regex execution error (e.g. REG_ESPACE) + // real regex execution error — capture the reason for production debugging + char msgbuf[256] = {0}; + (void)regerror(ret, regex, msgbuf, sizeof(msgbuf)); + qDebug("REGEXP_EXTRACT: regexec failed for pattern '%s', reason: %s", patStr, msgbuf); code = TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR; terrno = code; break; From d8b04ac1393516712533a652af412cc07e18f503 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Thu, 23 Apr 2026 03:04:16 -0700 Subject: [PATCH 44/48] fix terrno --- source/libs/scalar/src/sclfunc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/libs/scalar/src/sclfunc.c b/source/libs/scalar/src/sclfunc.c index 700f2f23fcf3..ea14b3c06b65 100644 --- a/source/libs/scalar/src/sclfunc.c +++ b/source/libs/scalar/src/sclfunc.c @@ -1859,7 +1859,7 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar } if (groupIdxRaw < 0 || groupIdxRaw > REGEXP_EXTRACT_MAX_GROUP_IDX) { pOutput->numOfRows = numOfRows; - return TSDB_CODE_FUNC_FUNTION_PARA_VALUE; + SCL_ERR_RET(TSDB_CODE_FUNC_FUNTION_PARA_VALUE); } int32_t groupIdx = (int32_t)groupIdxRaw; @@ -1913,6 +1913,7 @@ int32_t regexpExtractFunction(SScalarParam *pInput, int32_t inputNum, SScalarPar regex_t *regex = NULL; code = threadGetRegComp(&regex, patStr); if (code != 0) { + terrno = code; goto _exit; } From 4b95695c6cf741a9b37f14c72c60edc86e1fb259 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Thu, 23 Apr 2026 17:52:10 -0700 Subject: [PATCH 45/48] fix regerror's null param --- source/libs/function/src/builtins.c | 4 ++-- source/util/src/tcompare.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index ea03924f8e28..1c6bbfafa5ab 100644 --- a/source/libs/function/src/builtins.c +++
b/source/libs/function/src/builtins.c @@ -1170,8 +1170,8 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 int ret = regcomp(&re, regPattern, REG_EXTENDED); if (ret != 0) { char msgbuf[256] = {0}; - (void)regerror(ret, NULL, msgbuf, sizeof(msgbuf)); - // do not call regfree — regcomp failed, re contents are undefined (POSIX) + (void)regerror(ret, &re, msgbuf, sizeof(msgbuf)); + // do not call regfree — regcomp failed, re is partially initialised (POSIX) if (freeUtf8Pat) taosMemoryFree(utf8Pat); return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR, "Invalid regex pattern for regexp_extract: %s", msgbuf); diff --git a/source/util/src/tcompare.c b/source/util/src/tcompare.c index c7d47554d865..07328a5fb302 100644 --- a/source/util/src/tcompare.c +++ b/source/util/src/tcompare.c @@ -1426,7 +1426,7 @@ int32_t checkRegexPattern(const char *pPattern) { int32_t ret = regcomp(&regex, pPattern, cflags); if (ret != 0) { char msgbuf[256] = {0}; - (void)regerror(ret, NULL, msgbuf, tListLen(msgbuf)); + (void)regerror(ret, &regex, msgbuf, tListLen(msgbuf)); uError("Failed to compile regex pattern %s. reason %s", pPattern, msgbuf); return TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR; } @@ -1450,7 +1450,7 @@ int32_t getRegComp(const char *pPattern, HashRegexPtr **regexRet) { int32_t ret = regcomp(&pUsingRegex->pRegex, pPattern, cflags); if (ret != 0) { char msgbuf[256] = {0}; - (void)regerror(ret, NULL, msgbuf, tListLen(msgbuf)); + (void)regerror(ret, &pUsingRegex->pRegex, msgbuf, tListLen(msgbuf)); uError("Failed to compile regex pattern %s.
reason %s", pPattern, msgbuf); taosMemoryFree(pUsingRegex); return TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR; @@ -1516,7 +1516,7 @@ int32_t threadGetRegComp(regex_t **regex, const char *pPattern) { int32_t ret = regcomp(&gRegex, pPattern, cflags); if (ret != 0) { char msgbuf[256] = {0}; - (void)regerror(ret, NULL, msgbuf, tListLen(msgbuf)); + (void)regerror(ret, &gRegex, msgbuf, tListLen(msgbuf)); uError("Failed to compile regex pattern %s. reason %s", pPattern, msgbuf); taosMemoryFree(pOldPattern); pOldPattern = NULL; From 3350aeafbb7c9d1a2f8dff0503487f2e991f5339 Mon Sep 17 00:00:00 2001 From: stephenkgu Date: Mon, 27 Apr 2026 02:17:54 -0700 Subject: [PATCH 46/48] run test suite with ci --- test/ci/cases.task | 1 + 1 file changed, 1 insertion(+) diff --git a/test/ci/cases.task b/test/ci/cases.task index 7b415ae1e8a3..a2217d5e0c01 100644 --- a/test/ci/cases.task +++ b/test/ci/cases.task @@ -455,6 +455,7 @@ ,,y,.,./ci/pytest.sh pytest cases/11-Functions/01-Scalar/test_fun_sca_to_iso8601.py ,,y,.,./ci/pytest.sh pytest cases/11-Functions/01-Scalar/test_fun_sca_to_timestamp.py ,,y,.,./ci/pytest.sh pytest cases/11-Functions/01-Scalar/test_fun_sca_to_unixtimestamp.py +,,y,.,./ci/pytest.sh pytest cases/11-Functions/01-Scalar/test_fun_sca_regexp_extract.py ,,y,.,./ci/pytest.sh pytest cases/11-Functions/01-Scalar/test_fun_sca_today.py ,,y,.,./ci/pytest.sh pytest cases/11-Functions/01-Scalar/test_fun_sca_upper.py ,,y,.,./ci/pytest.sh pytest cases/11-Functions/01-Scalar/test_fun_sca_cast_blob.py From 4db73dfb3aefda24a23581aaaaf093798af03600 Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Mon, 27 Apr 2026 02:43:05 -0700 Subject: [PATCH 47/48] Update source/libs/function/src/builtins.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/function/src/builtins.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c 
index 1c6bbfafa5ab..f32da9f2f4fd 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1201,7 +1201,7 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 // Skip range validation for prepared-statement placeholders — the bound value // is not yet known; the runtime check in regexpExtractFunction applies instead. if (pIdxVal->placeholderNo == 0) { - int64_t groupIdx = taosStr2Int64(pIdxVal->literal, NULL, 10); + int64_t groupIdx = pIdxVal->datum.i; if (groupIdx < 0 || groupIdx > REGEXP_EXTRACT_MAX_GROUP_IDX) { char errmsg[64]; (void)snprintf(errmsg, sizeof(errmsg), From d2acb171ce0305735c226050062c6e041ffe1a2c Mon Sep 17 00:00:00 2001 From: Minglei Jin <49711132+stephenkgu@users.noreply.github.com> Date: Mon, 27 Apr 2026 02:49:38 -0700 Subject: [PATCH 48/48] Update source/libs/function/src/builtins.c Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- source/libs/function/src/builtins.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/libs/function/src/builtins.c b/source/libs/function/src/builtins.c index f32da9f2f4fd..5e8e72de2060 100644 --- a/source/libs/function/src/builtins.c +++ b/source/libs/function/src/builtins.c @@ -1170,7 +1170,7 @@ static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32 int ret = regcomp(&re, regPattern, REG_EXTENDED); if (ret != 0) { char msgbuf[256] = {0}; - (void)regerror(ret, &re, msgbuf, sizeof(msgbuf)); + (void)regerror(ret, NULL, msgbuf, sizeof(msgbuf)); // do not call regfree — regcomp failed, re is partially initialised (POSIX) if (freeUtf8Pat) taosMemoryFree(utf8Pat); return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR,