-
Notifications
You must be signed in to change notification settings - Fork 5k
Feat/336/wal-recover #35200
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: 3.3.6
Are you sure you want to change the base?
Feat/336/wal-recover #35200
Changes from 5 commits
50b5164
4eeeb7b
8be4d82
1f23a8a
8d8b4f5
dbf6f82
281ce7b
224b376
f77e404
4fb8017
5567cc6
0967b0b
93d55a5
c98c398
de6a4de
e970734
7fb598d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -303,6 +303,8 @@ const char* syncStr(ESyncState state); | |||
|
|
||||
| int32_t syncNodeGetConfig(int64_t rid, SSyncCfg* cfg); | ||||
|
|
||||
| int32_t syncNotifyWalTruncated(int32_t vgId, int64_t truncatedVer); | ||||
|
|
||||
|
Comment on lines
+306
to
+307
|
||||
| int32_t syncNotifyWalTruncated(int32_t vgId, int64_t truncatedVer); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -166,7 +166,7 @@ int32_t walInit(stopDnodeFn stopDnode); | |
| void walCleanUp(); | ||
|
|
||
| // handle open and ctl | ||
| SWal *walOpen(const char *path, SWalCfg *pCfg); | ||
| SWal *walOpen(const char *path, SWalCfg *pCfg, int32_t replica); | ||
|
xiao-77 marked this conversation as resolved.
|
||
| int32_t walAlter(SWal *, SWalCfg *pCfg); | ||
| int32_t walPersist(SWal *); | ||
| void walClose(SWal *); | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -357,6 +357,7 @@ bool tsStartUdfd = true; | |||||||
| int64_t tsWalFsyncDataSizeLimit = (100 * 1024 * 1024L); | ||||||||
| bool tsWalForceRepair = 0; | ||||||||
| bool tsWalDeleteOnCorruption = false; | ||||||||
| int32_t tsWalRecoveryPolicy = 0; // Default: refuse to start for single replica | ||||||||
|
|
||||||||
| // ttl | ||||||||
| bool tsTtlChangeOnWrite = false; // if true, ttl delete time changes on last write | ||||||||
|
|
@@ -982,6 +983,7 @@ static int32_t taosAddServerCfg(SConfig *pCfg) { | |||||||
| TAOS_CHECK_RETURN(cfgAddInt32(pCfg, "syncRoutineReportInterval", tsRoutineReportInterval, 5, 600, CFG_SCOPE_SERVER, CFG_DYN_SERVER,CFG_CATEGORY_LOCAL)); | ||||||||
| TAOS_CHECK_RETURN(cfgAddBool(pCfg, "syncLogHeartbeat", tsSyncLogHeartbeat, CFG_SCOPE_SERVER, CFG_DYN_SERVER,CFG_CATEGORY_LOCAL)); | ||||||||
| TAOS_CHECK_RETURN(cfgAddBool(pCfg, "walDeleteOnCorruption", tsWalDeleteOnCorruption, CFG_SCOPE_SERVER, CFG_DYN_NONE,CFG_CATEGORY_LOCAL)); | ||||||||
| TAOS_CHECK_RETURN(cfgAddInt32(pCfg, "walRecoveryPolicy", tsWalRecoveryPolicy, 0, 1, CFG_SCOPE_SERVER, CFG_DYN_NONE, CFG_CATEGORY_LOCAL)); | ||||||||
|
||||||||
| TAOS_CHECK_RETURN(cfgAddInt32(pCfg, "walRecoveryPolicy", tsWalRecoveryPolicy, 0, 1, CFG_SCOPE_SERVER, CFG_DYN_NONE, CFG_CATEGORY_LOCAL)); | |
| TAOS_CHECK_RETURN(cfgAddInt32(pCfg, "walRecoveryPolicy", tsWalRecoveryPolicy, 0, 1, CFG_SCOPE_SERVER, CFG_DYN_NONE, CFG_CATEGORY_LOCAL)); | |
| TAOS_CHECK_RETURN(cfgAddInt32(pCfg, "walDeleteOnCorruption", tsWalRecoveryPolicy, 0, 1, CFG_SCOPE_SERVER, CFG_DYN_NONE, CFG_CATEGORY_LOCAL)); |
| Original file line number | Diff line number | Diff line change | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -4090,3 +4090,8 @@ bool syncNodeCanChange(SSyncNode* pSyncNode) { | |||||||||||
| return true; | ||||||||||||
| } | ||||||||||||
| #endif | ||||||||||||
|
|
||||||||||||
| int32_t syncNotifyWalTruncated(int32_t vgId, int64_t truncatedVer) { | ||||||||||||
| sInfo("vgId:%d, notified sync module: WAL truncated to ver:%" PRId64, vgId, truncatedVer); | ||||||||||||
| return TSDB_CODE_SUCCESS; | ||||||||||||
| } | ||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The implementation of
|
||||||||||||
| int32_t syncNotifyWalTruncated(int32_t vgId, int64_t truncatedVer) { | |
| sInfo("vgId:%d, notified sync module: WAL truncated to ver:%" PRId64, vgId, truncatedVer); | |
| return TSDB_CODE_SUCCESS; | |
| } |
Copilot
AI
Apr 28, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
syncNotifyWalTruncated() is added as a public/internal API but is not called anywhere in this PR, and currently only logs a message without informing any SSyncNode state. This doesn’t match the PR description (“Call notification after three-replica recovery / Enable Raft to detect and sync missing logs”). Either wire this into the WAL truncation/recovery path (and have it trigger whatever raft re-sync behavior is required) or keep it internal until there’s an actual caller/behavior.
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -20,6 +20,8 @@ | |||||
| #include "tutil.h" | ||||||
| #include "walInt.h" | ||||||
|
|
||||||
| extern int32_t syncNotifyWalTruncated(int32_t vgId, int64_t truncatedVer); | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||||||
|
|
||||||
| bool FORCE_INLINE walLogExist(SWal* pWal, int64_t ver) { | ||||||
| return !walIsEmpty(pWal) && walGetFirstVer(pWal) <= ver && walGetLastVer(pWal) >= ver; | ||||||
| } | ||||||
|
|
@@ -517,7 +519,7 @@ void walRegfree(regex_t* ptr) { | |||||
| regfree(ptr); | ||||||
| } | ||||||
|
|
||||||
| int32_t walCheckAndRepairMeta(SWal* pWal) { | ||||||
| int32_t walCheckAndRepairMeta(SWal* pWal, int32_t replica) { | ||||||
| // load log files, get first/snapshot/last version info | ||||||
| int32_t code = 0; | ||||||
| int32_t lino = 0; | ||||||
|
|
@@ -526,6 +528,8 @@ int32_t walCheckAndRepairMeta(SWal* pWal) { | |||||
| regex_t logRegPattern, idxRegPattern; | ||||||
| TdDirPtr pDir = NULL; | ||||||
| SArray* actualLog = NULL; | ||||||
| bool walTruncated = false; | ||||||
| int64_t truncatedVer = -1; | ||||||
|
||||||
| bool walTruncated = false; | |
| int64_t truncatedVer = -1; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Copilot
AI
Apr 22, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The new "auto-recovery enabled for replica=3" path is inconsistent: when replica==3 and WAL scan fails, the code only recovers if tsWalDeleteOnCorruption is enabled; otherwise it falls through to _exit with an error code, causing walOpen() to fail despite logging that auto-recovery is enabled. Either perform the recovery action unconditionally for replica==3 (still preserving the corrupted dir), or make the log/error handling reflect the actual required config and set a deterministic return code.
| if (shouldRecover && tsWalDeleteOnCorruption) { | |
| if (replica == 3 || (shouldRecover && tsWalDeleteOnCorruption)) { |
Copilot
AI
Apr 22, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When recovery triggers TAOS_RETURN(walRenameCorruptedDir(pWal)), the function returns before reaching the later syncNotifyWalTruncated() block, so the sync module is never notified in the main recovery path. If notification is required for replica=3 recovery, move the notification before the early return (or avoid returning early).
| TAOS_RETURN(walRenameCorruptedDir(pWal)); | |
| code = walRenameCorruptedDir(pWal); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are two issues here:
- The notification to the sync module (intended at line 680) is unreachable because this block either returns via
TAOS_RETURNor jumps to_exitbefore reaching the end of the function. - For multi-replica setups, the log at line 620 claims auto-recovery is enabled, but the actual recovery is still gated by
tsWalDeleteOnCorruption(which defaults to false). If recovery is intended to be automatic for clusters, this guard should be adjusted.
if (shouldRecover && (tsWalDeleteOnCorruption || replica > 1)) {
if (replica > 1) {
syncNotifyWalTruncated(pWal->cfg.vgId, truncatedVer);
}
TAOS_RETURN(walRenameCorruptedDir(pWal));
}
goto _exit;There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This block is unreachable when walTruncated is true. As noted in the previous comment, any path that detects corruption and sets walTruncated = true results in an early exit from the function. The notification logic should be moved into the recovery block where the actual truncation/renaming occurs.
Copilot
AI
Apr 22, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
walCheckAndRepairMeta() now calls syncNotifyWalTruncated(), but the wal static library does not link against sync (and sync already links against wal), so this introduces a circular/undefined-symbol risk (e.g. walTest links wal without sync). Replace this direct call with a decoupled mechanism such as a callback function pointer registered by sync at runtime, or a weak/no-op default implementation compiled into wal.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Header comment says
walRecoveryPolicy"only affects single replica", but the new logic checks it for allreplica != 3cases (so it would also affect replica=2 if that configuration exists). Please align the comment (and possibly the config semantics) with actual behavior to avoid misconfiguration.