Skip to content
Draft
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
public static final String DISK_CHECK_MIN_GAP_KEY = "hdds.datanode.disk.check.min.gap";
public static final String DISK_CHECK_TIMEOUT_KEY = "hdds.datanode.disk.check.timeout";
public static final String DISK_CHECK_SLIDING_WINDOW_TIMEOUT_KEY = "hdds.datanode.disk.check.sliding.window.timeout";
public static final String DISK_CHECK_RETRY_GAP_KEY = "hdds.datanode.disk.check.retry.gap";
public static final String DISK_CHECK_RETRY_ATTEMPTS = "hdds.datanode.disk.check.retry.attempts";

// Minimum space should be left on volume.
// Ex: If volume has 1000GB and minFreeSpace is configured as 10GB,
Expand Down Expand Up @@ -104,6 +106,9 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
static final Duration DISK_CHECK_SLIDING_WINDOW_TIMEOUT_DEFAULT =
Duration.ofMinutes(PERIODIC_DISK_CHECK_INTERVAL_MINUTES_DEFAULT).plus(DISK_CHECK_TIMEOUT_DEFAULT);

static final Duration DISK_CHECK_RETRY_GAP_DEFAULT = Duration.ofMinutes(1);
static final int DISK_CHECK_RETRY_ATTEMPTS_DEFAULT = 2;

static final boolean CONTAINER_SCHEMA_V3_ENABLED_DEFAULT = true;
static final long ROCKSDB_LOG_MAX_FILE_SIZE_BYTES_DEFAULT = 32 * 1024 * 1024;
static final int ROCKSDB_LOG_MAX_FILE_NUM_DEFAULT = 64;
Expand Down Expand Up @@ -373,6 +378,14 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
)
private boolean isDiskCheckEnabled = true;

@Config(key = "hdds.datanode.rocksdb.disk.check.io.test.enabled",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about we rename it to "hdds.datanode.disk.check.rocksdb.io.test.enabled", so that all the disk check property will share the "hdds.datanode.disk.check" prefix?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have moved the switch to its own PR in #10149.

I have updated it there.

defaultValue = "true",
type = ConfigType.BOOLEAN,
tags = {DATANODE},
description = "The configuration to enable or disable RocksDb disk IO checks."
)
private boolean isRocksDbDiskCheckEnabled = true;

@Config(key = "hdds.datanode.disk.check.io.failures.tolerated",
defaultValue = "1",
type = ConfigType.INT,
Expand Down Expand Up @@ -430,6 +443,25 @@ public class DatanodeConfiguration extends ReconfigurableConfig {
)
private Duration diskCheckSlidingWindowTimeout = DISK_CHECK_SLIDING_WINDOW_TIMEOUT_DEFAULT;

@Config(key = DISK_CHECK_RETRY_GAP_KEY,
defaultValue = "1m",
Comment thread
ptlrs marked this conversation as resolved.
Outdated
type = ConfigType.TIME,
tags = {DATANODE},
description = "Time to wait between retries of disk checks."
+ " To ignore transient issues, the RocksDb instance on a disk is validated multiple times before"
+ " declaring failure. This configuration defines the time to wait between the retry attempts."
+ " Unit could be defined with postfix (ns,ms,s,m,h,d)."
)
private Duration diskCheckRetryGap = DISK_CHECK_RETRY_GAP_DEFAULT;

@Config(key = DISK_CHECK_RETRY_ATTEMPTS,
defaultValue = "2",
type = ConfigType.INT,
tags = {DATANODE},
description = "Number of retry attempts for opening RocksDb before declaring failure."
)
private int diskCheckRetryAttempts = DISK_CHECK_RETRY_ATTEMPTS_DEFAULT;

@Config(key = "hdds.datanode.chunk.data.validation.check",
defaultValue = "false",
type = ConfigType.BOOLEAN,
Expand Down Expand Up @@ -709,6 +741,25 @@ public void validate() {
diskCheckSlidingWindowTimeout = defaultTimeout;
}

if (diskCheckRetryAttempts <= 0) {
LOG.warn("{} must be greater than zero and was set to {}. Defaulting to {}",
DISK_CHECK_RETRY_ATTEMPTS, diskCheckRetryAttempts, DISK_CHECK_RETRY_ATTEMPTS_DEFAULT);
diskCheckRetryAttempts = DISK_CHECK_RETRY_ATTEMPTS_DEFAULT;
}

if (diskCheckRetryGap.isNegative()) {
LOG.warn("{} must be greater than zero and was set to {}. Defaulting to {}",
DISK_CHECK_RETRY_GAP_KEY, diskCheckRetryGap, DISK_CHECK_RETRY_GAP_DEFAULT);
diskCheckRetryGap = DISK_CHECK_RETRY_GAP_DEFAULT;
}

if (diskCheckRetryGap.compareTo(diskCheckTimeout.dividedBy(diskCheckRetryAttempts)) > 0) {
LOG.warn("{} was set to {}. It must be less than {} / {} which is {}. Defaulting to {}",
DISK_CHECK_RETRY_GAP_KEY, diskCheckRetryGap, DISK_CHECK_TIMEOUT_KEY, diskCheckRetryAttempts,
diskCheckTimeout.dividedBy(diskCheckRetryAttempts), DISK_CHECK_RETRY_GAP_DEFAULT);
diskCheckRetryGap = DISK_CHECK_RETRY_GAP_DEFAULT;
}

if (blockDeleteCommandWorkerInterval.isNegative()) {
LOG.warn(BLOCK_DELETE_COMMAND_WORKER_INTERVAL +
" must be greater than zero and was set to {}. Defaulting to {}",
Expand Down Expand Up @@ -924,10 +975,18 @@ public Duration getDiskCheckTimeout() {
return diskCheckTimeout;
}

public Duration getDiskCheckRetryGap() {
return diskCheckRetryGap;
}

public void setDiskCheckTimeout(Duration duration) {
diskCheckTimeout = duration;
}

public int getDiskCheckRetryAttempts() {
return diskCheckRetryAttempts;
}

public void setDiskCheckEnabled(boolean diskCheckEnabled) {
isDiskCheckEnabled = diskCheckEnabled;
}
Expand All @@ -936,6 +995,10 @@ public boolean isDiskCheckEnabled() {
return isDiskCheckEnabled;
}

public boolean isRocksDbDiskCheckEnabled() {
return isRocksDbDiskCheckEnabled;
}

public Duration getDiskCheckSlidingWindowTimeout() {
return diskCheckSlidingWindowTimeout;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import jakarta.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.time.Duration;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.ConcurrentSkipListSet;
Expand Down Expand Up @@ -306,20 +307,34 @@ public synchronized VolumeCheckResult check(@Nullable Boolean unused)

@VisibleForTesting
public VolumeCheckResult checkDbHealth(File dbFile) throws InterruptedException {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's add javadoc to this method to clarify that our goal is to check global files like CURRENT and MANIFEST, and that verifying the contents of all SST files/values in the DB is done by the container data scanner.

if (!getDiskCheckEnabled()) {
if (!(getDiskCheckEnabled() && getDatanodeConfig().isRocksDbDiskCheckEnabled())) {
return VolumeCheckResult.HEALTHY;
}

try (ManagedOptions managedOptions = new ManagedOptions();
ManagedRocksDB ignored = ManagedRocksDB.openReadOnly(managedOptions, dbFile.toString())) {
// Do nothing. Only check if rocksdb is accessible.
LOG.debug("Successfully opened the database at \"{}\" for HDDS volume {}.", dbFile, getStorageDir());
} catch (Exception e) {
if (Thread.currentThread().isInterrupted()) {
throw new InterruptedException("Check of database for volume " + this + " interrupted.");
// We attempt to open RocksDb twice to ignore any transient errors
// and to confirm that we actually cannot open RocksDb in readonly mode.
final int maxAttempts = getDatanodeConfig().getDiskCheckRetryAttempts();
final Duration maxRetryGap = getDatanodeConfig().getDiskCheckRetryGap();
for (int attempt = 0; attempt < maxAttempts; attempt++) {
try (ManagedOptions managedOptions = new ManagedOptions();
Comment thread
ptlrs marked this conversation as resolved.
Outdated
ManagedRocksDB ignored =
ManagedRocksDB.openAsSecondary(managedOptions, dbFile.toString(), getTmpDir().getPath())) {
// Do nothing. Only check if rocksdb is accessible.
LOG.debug("Successfully opened the database at \"{}\" for HDDS volume {}.", dbFile, getStorageDir());
Comment thread
ptlrs marked this conversation as resolved.
Outdated
break;
} catch (Exception e) {
Comment thread
ptlrs marked this conversation as resolved.
Outdated
if (Thread.currentThread().isInterrupted()) {
throw new InterruptedException("Check of database for volume " + this + " interrupted.");
}

if (attempt == maxAttempts - 1) {
Comment thread
ptlrs marked this conversation as resolved.
Outdated
LOG.error("Could not open Volume DB located at {}", dbFile, e);
getIoTestSlidingWindow().add();
} else {
LOG.warn("Could not open Volume DB located at {}", dbFile, e);
Thread.sleep(maxRetryGap.toMillis());
Comment thread
ptlrs marked this conversation as resolved.
Outdated
}
}
LOG.warn("Could not open Volume DB located at {}", dbFile, e);
getIoTestSlidingWindow().add();
}

if (getIoTestSlidingWindow().isExceeded()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ public static ManagedRocksDB openReadOnly(
);
}

public static ManagedRocksDB openAsSecondary(
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ptlrs thanks for all the research you've done on secondary instances. It would be great to put a summary as a javadoc above this method about how they work and what we can expect vs. readonly instances.

final ManagedOptions options,
final String dbPath,
final String secondaryDbLogFilePath)
throws RocksDBException {
return new ManagedRocksDB(RocksDB.openAsSecondary(options, dbPath, secondaryDbLogFilePath));
Comment thread
ptlrs marked this conversation as resolved.
}

public static ManagedRocksDB open(
final DBOptions options, final String path,
final List<ColumnFamilyDescriptor> columnFamilyDescriptors,
Expand Down