-
Notifications
You must be signed in to change notification settings - Fork 5k
func/regexp_extract: new scalar func and test cases #35191
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 52 commits
3b37e33
6d2e73a
6548192
d770e45
dc985ca
c4a5619
6a151e2
c7a78f4
a41023d
b384578
20f4090
3505a52
40293fd
a0f68be
f40ccdd
3213b43
9e090be
6bdc547
b11c324
d56a93e
5878f49
a7b7194
99b7b3a
872601f
0189d82
37f599a
9cf7858
640f580
38ac272
816ccb5
984b053
ceeb5cb
21e6d42
7a86c29
9ccae40
7c471c3
5670d20
388cfe0
7203c84
068a98c
9652c1e
4e13f1d
7b2b79d
663e85b
d0728c9
be87866
193997a
b34c9a1
6e4743f
d8b04ac
4b95695
3350aea
4db73df
d2acb17
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1105,6 +1105,117 @@ static int32_t translateRand(SFunctionNode* pFunc, char* pErrBuf, int32_t len) { | |
| static int32_t translateSleep(SFunctionNode* pFunc, char* pErrBuf, int32_t len) { | ||
| FUNC_ERR_RET(validateParam(pFunc, pErrBuf, len)); | ||
| pFunc->node.resType = (SDataType){.bytes = tDataTypes[TSDB_DATA_TYPE_INT].bytes, .type = TSDB_DATA_TYPE_INT}; | ||
|
|
||
| return TSDB_CODE_SUCCESS; | ||
| } | ||
|
|
||
| static int32_t translateRegexpExtract(SFunctionNode* pFunc, char* pErrBuf, int32_t len) { | ||
| FUNC_ERR_RET(validateParam(pFunc, pErrBuf, len)); | ||
| int32_t numOfParams = LIST_LENGTH(pFunc->pParameterList); | ||
|
|
||
| // param[1]: pattern must be a literal/parameter constant VALUE node. | ||
| // Constant expressions are not accepted here because regexp_extract | ||
| // currently validates only VALUE nodes. | ||
| SNode* pPatNode = nodesListGetNode(pFunc->pParameterList, 1); | ||
| if (QUERY_NODE_VALUE != nodeType(pPatNode)) { | ||
| return invaildFuncParaTypeErrMsg( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
此处传入了完整描述字符串,语义重叠,格式混乱。同样的问题也出现在 group_idx 校验块的两处。建议改为直接调用 buildFuncErrMsg:
// pattern 错误
return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_FUNC_FUNTION_PARA_TYPE,
"regexp_extract: pattern must be a literal or parameter constant");
// group_idx 范围错误(去掉 snprintf 中间缓冲)
return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_FUNC_FUNTION_PARA_VALUE,
"regexp_extract: group_idx must be between 0 and %d",
REGEXP_EXTRACT_MAX_GROUP_IDX); |
||
| pErrBuf, len, "regexp_extract: pattern must be a literal or parameter constant"); | ||
| } | ||
|
|
||
| // Validate the regex pattern compiles as POSIX ERE. | ||
| // For prepared-statement placeholders, literal may contain the placeholder | ||
| // token (for example "?") instead of the bound pattern. Prefer the | ||
| // materialized datum when available, and otherwise defer validation to | ||
| // runtime for placeholders. For NCHAR patterns datum.p holds UCS-4 vardata; | ||
| // convert it to UTF-8 to match the runtime path in regexpExtractFunction. | ||
| SValueNode* pPatVal = (SValueNode*)pPatNode; | ||
| { | ||
| const char* regPattern = NULL; | ||
| char* utf8Pat = NULL; | ||
| bool freeUtf8Pat = false; | ||
| bool deferValidation = (pPatVal->placeholderNo != 0 && pPatVal->datum.p == NULL); | ||
|
|
||
| if (!deferValidation) { | ||
| if (pPatVal->node.resType.type == TSDB_DATA_TYPE_NCHAR && pPatVal->datum.p != NULL) { | ||
| int32_t ncharBytes = varDataLen(pPatVal->datum.p); | ||
| utf8Pat = taosMemoryCalloc(ncharBytes + 1, 1); | ||
|
stephenkgu marked this conversation as resolved.
|
||
| if (utf8Pat == NULL) return terrno; | ||
| int32_t utf8Len = taosUcs4ToMbs((TdUcs4*)varDataVal(pPatVal->datum.p), ncharBytes, | ||
| utf8Pat, pPatVal->charsetCxt); | ||
|
stephenkgu marked this conversation as resolved.
|
||
| if (utf8Len < 0) { | ||
| taosMemoryFree(utf8Pat); | ||
| return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR, | ||
| "regexp_extract: failed to convert NCHAR pattern to UTF-8"); | ||
| } | ||
| utf8Pat[utf8Len] = '\0'; | ||
| regPattern = utf8Pat; | ||
| freeUtf8Pat = true; | ||
| } else if (pPatVal->datum.p != NULL) { | ||
| // datum.p is a length-prefixed vardata buffer — not NUL-terminated. | ||
| // Build a NUL-terminated copy for regcomp(). | ||
| int32_t patBytes = varDataLen(pPatVal->datum.p); | ||
| utf8Pat = taosMemoryMalloc(patBytes + 1); | ||
| if (utf8Pat == NULL) return terrno; | ||
| (void)memcpy(utf8Pat, varDataVal(pPatVal->datum.p), patBytes); | ||
| utf8Pat[patBytes] = '\0'; | ||
| regPattern = utf8Pat; | ||
| freeUtf8Pat = true; | ||
| } else { | ||
| regPattern = pPatVal->literal; | ||
| } | ||
| } | ||
|
|
||
| if (regPattern != NULL) { | ||
| regex_t re; | ||
| int ret = regcomp(&re, regPattern, REG_EXTENDED); | ||
| if (ret != 0) { | ||
|
stephenkgu marked this conversation as resolved.
|
||
| char msgbuf[256] = {0}; | ||
| (void)regerror(ret, &re, msgbuf, sizeof(msgbuf)); | ||
|
stephenkgu marked this conversation as resolved.
Outdated
|
||
| // do not call regfree — regcomp failed, re is partially initialised (POSIX) | ||
| if (freeUtf8Pat) taosMemoryFree(utf8Pat); | ||
| return buildFuncErrMsg(pErrBuf, len, TSDB_CODE_PAR_REGULAR_EXPRESSION_ERROR, | ||
| "Invalid regex pattern for regexp_extract: %s", msgbuf); | ||
| } | ||
| regfree(&re); // only reached when regcomp succeeded | ||
|
stephenkgu marked this conversation as resolved.
|
||
| } | ||
| if (freeUtf8Pat) taosMemoryFree(utf8Pat); | ||
| } | ||
|
|
||
| // param[2]: group_idx (optional) must be a non-negative integer constant. | ||
| // NULL is also allowed by the builtin signature and should propagate like | ||
| // other scalar functions, so accept NULL-typed value nodes here and rely | ||
| // on runtime to return a NULL result. | ||
| if (numOfParams == 3) { | ||
| SNode* pIdxNode = nodesListGetNode(pFunc->pParameterList, 2); | ||
| if (QUERY_NODE_VALUE != nodeType(pIdxNode)) { | ||
| return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: group_idx must be a constant integer"); | ||
| } | ||
|
|
||
| SValueNode* pIdxVal = (SValueNode*)pIdxNode; | ||
| int32_t idxType = pIdxVal->node.resType.type; | ||
|
|
||
| if (TSDB_DATA_TYPE_NULL != idxType) { | ||
| if (!IS_INTEGER_TYPE(idxType)) { | ||
| return invaildFuncParaTypeErrMsg(pErrBuf, len, "regexp_extract: group_idx must be an integer"); | ||
| } | ||
| // Skip range validation for prepared-statement placeholders — the bound value | ||
| // is not yet known; the runtime check in regexpExtractFunction applies instead. | ||
| if (pIdxVal->placeholderNo == 0) { | ||
| int64_t groupIdx = taosStr2Int64(pIdxVal->literal, NULL, 10); | ||
|
stephenkgu marked this conversation as resolved.
Outdated
|
||
| if (groupIdx < 0 || groupIdx > REGEXP_EXTRACT_MAX_GROUP_IDX) { | ||
| char errmsg[64]; | ||
| (void)snprintf(errmsg, sizeof(errmsg), | ||
| "regexp_extract: group_idx must be between 0 and %d", | ||
| REGEXP_EXTRACT_MAX_GROUP_IDX); | ||
| return invaildFuncParaValueErrMsg(pErrBuf, len, errmsg); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Return type matches str (param[0]): same VARCHAR/NCHAR type and byte width | ||
| pFunc->node.resType = *getSDataTypeFromNode(nodesListGetNode(pFunc->pParameterList, 0)); | ||
|
|
||
| return TSDB_CODE_SUCCESS; | ||
| } | ||
|
|
||
|
|
@@ -7441,6 +7552,41 @@ const SBuiltinFuncDefinition funcMgtBuiltins[] = { | |
| .sprocessFunc = sleepFunction, | ||
| .finalizeFunc = NULL | ||
| }, | ||
| { | ||
| .name = "regexp_extract", | ||
| .type = FUNCTION_TYPE_REGEXP_EXTRACT, | ||
| .classification = FUNC_MGT_SCALAR_FUNC | FUNC_MGT_STRING_FUNC, | ||
| .parameters = {.minParamNum = 2, | ||
| .maxParamNum = 3, | ||
| .paramInfoPattern = 1, | ||
| .inputParaInfo[0][0] = {.isLastParam = false, | ||
| .startParam = 1, | ||
| .endParam = 1, | ||
| .validDataType = FUNC_PARAM_SUPPORT_VARCHAR_TYPE | FUNC_PARAM_SUPPORT_NCHAR_TYPE | FUNC_PARAM_SUPPORT_NULL_TYPE, | ||
| .validNodeType = FUNC_PARAM_SUPPORT_EXPR_NODE, | ||
| .paramAttribute = FUNC_PARAM_NO_SPECIFIC_ATTRIBUTE, | ||
| .valueRangeFlag = FUNC_PARAM_NO_SPECIFIC_VALUE,}, | ||
| .inputParaInfo[0][1] = {.isLastParam = false, | ||
| .startParam = 2, | ||
| .endParam = 2, | ||
| .validDataType = FUNC_PARAM_SUPPORT_VARCHAR_TYPE | FUNC_PARAM_SUPPORT_NCHAR_TYPE | FUNC_PARAM_SUPPORT_NULL_TYPE, | ||
| .validNodeType = FUNC_PARAM_SUPPORT_VALUE_NODE, | ||
| .paramAttribute = FUNC_PARAM_NO_SPECIFIC_ATTRIBUTE, | ||
| .valueRangeFlag = FUNC_PARAM_NO_SPECIFIC_VALUE,}, | ||
| .inputParaInfo[0][2] = {.isLastParam = true, | ||
| .startParam = 3, | ||
| .endParam = 3, | ||
| .validDataType = FUNC_PARAM_SUPPORT_INTEGER_TYPE | FUNC_PARAM_SUPPORT_NULL_TYPE, | ||
| .validNodeType = FUNC_PARAM_SUPPORT_VALUE_NODE, | ||
| .paramAttribute = FUNC_PARAM_NO_SPECIFIC_ATTRIBUTE, | ||
| .valueRangeFlag = FUNC_PARAM_NO_SPECIFIC_VALUE,}, | ||
| .outputParaInfo = {.validDataType = FUNC_PARAM_SUPPORT_VARCHAR_TYPE | FUNC_PARAM_SUPPORT_NCHAR_TYPE}}, | ||
| .translateFunc = translateRegexpExtract, | ||
| .getEnvFunc = NULL, | ||
| .initFunc = NULL, | ||
| .sprocessFunc = regexpExtractFunction, | ||
| .finalizeFunc = NULL, | ||
| }, | ||
| }; | ||
| // clang-format on | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.