diff --git a/modules/data-streams/src/test/java/org/elasticsearch/datastreams/lifecycle/health/DataStreamLifecycleHealthIndicatorServiceTests.java b/modules/data-streams/src/test/java/org/elasticsearch/datastreams/lifecycle/health/DataStreamLifecycleHealthIndicatorServiceTests.java index 084e65caa8d81..1aec44b230adb 100644 --- a/modules/data-streams/src/test/java/org/elasticsearch/datastreams/lifecycle/health/DataStreamLifecycleHealthIndicatorServiceTests.java +++ b/modules/data-streams/src/test/java/org/elasticsearch/datastreams/lifecycle/health/DataStreamLifecycleHealthIndicatorServiceTests.java @@ -21,6 +21,7 @@ import org.elasticsearch.health.node.DslErrorInfo; import org.elasticsearch.health.node.HealthInfo; import org.elasticsearch.health.node.ProjectIndexName; +import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthInfo; import org.elasticsearch.test.ESTestCase; import org.junit.Before; @@ -167,6 +168,6 @@ public void testMultiProject() { } private HealthInfo constructHealthInfo(DataStreamLifecycleHealthInfo dslHealthInfo) { - return new HealthInfo(Map.of(), dslHealthInfo, Map.of()); + return new HealthInfo(Map.of(), dslHealthInfo, Map.of(), FileSettingsHealthInfo.INDETERMINATE); } } diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceIT.java index 05b33098c3bca..e3586f330a32e 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceIT.java @@ -23,6 +23,7 @@ import org.elasticsearch.health.node.DataStreamLifecycleHealthInfo; import org.elasticsearch.health.node.HealthInfo; import org.elasticsearch.indices.SystemIndices; +import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthInfo; import org.elasticsearch.test.ESIntegTestCase; import org.hamcrest.Matcher; @@ -144,7 +145,16 @@ public void clusterChanged(ClusterChangedEvent event) { states.add( new RoutingNodesAndHealth( event.state().getRoutingNodes(), - service.calculate(false, 1, new HealthInfo(Map.of(), DataStreamLifecycleHealthInfo.NO_DSL_ERRORS, Map.of())) + service.calculate( + false, + 1, + new HealthInfo( + Map.of(), + DataStreamLifecycleHealthInfo.NO_DSL_ERRORS, + Map.of(), + FileSettingsHealthInfo.INDETERMINATE + ) + ) ) ); } diff --git a/server/src/internalClusterTest/java/org/elasticsearch/reservedstate/service/FileSettingsServiceIT.java b/server/src/internalClusterTest/java/org/elasticsearch/reservedstate/service/FileSettingsServiceIT.java index afbc9df1098aa..6586bb03f36ba 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/reservedstate/service/FileSettingsServiceIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/reservedstate/service/FileSettingsServiceIT.java @@ -25,6 +25,8 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.core.Strings; import org.elasticsearch.core.Tuple; +import org.elasticsearch.health.GetHealthAction; +import org.elasticsearch.health.node.FetchHealthInfoCacheAction; import org.elasticsearch.reservedstate.action.ReservedClusterSettingsAction; import org.elasticsearch.test.ESIntegTestCase; import org.junit.Before; @@ -37,9 +39,12 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Stream; +import static org.elasticsearch.health.HealthStatus.YELLOW; import static org.elasticsearch.indices.recovery.RecoverySettings.INDICES_RECOVERY_MAX_BYTES_PER_SEC_SETTING; import static org.elasticsearch.node.Node.INITIAL_STATE_TIMEOUT_SETTING; +import static org.elasticsearch.test.NodeRoles.dataNode; import static org.elasticsearch.test.NodeRoles.dataOnlyNode; import static org.elasticsearch.test.NodeRoles.masterNode; import static org.hamcrest.Matchers.allOf; @@ -498,6 +503,82 @@ public void testSettingsAppliedOnMasterReElection() throws Exception { assertClusterStateSaveOK(savedClusterState.v1(), savedClusterState.v2(), "43mb"); } + public void testHealthIndicatorWithSingleNode() throws Exception { + internalCluster().setBootstrapMasterNodeIndex(0); + logger.info("--> start the node"); + String nodeName = internalCluster().startNode(); + FileSettingsService masterFileSettingsService = internalCluster().getInstance(FileSettingsService.class, nodeName); + assertBusy(() -> assertTrue(masterFileSettingsService.watching())); + + ensureStableCluster(1); + + testHealthIndicatorOnError(nodeName, nodeName); + } + + public void testHealthIndicatorWithSeparateHealthNode() throws Exception { + internalCluster().setBootstrapMasterNodeIndex(0); + logger.info("--> start a data node to act as the health node"); + String healthNode = internalCluster().startNode( + Settings.builder().put(dataOnlyNode()).put("discovery.initial_state_timeout", "1s") + ); + + logger.info("--> start master node"); + final String masterNode = internalCluster().startMasterOnlyNode( + Settings.builder().put(INITIAL_STATE_TIMEOUT_SETTING.getKey(), "0s").build() + ); + FileSettingsService masterFileSettingsService = internalCluster().getInstance(FileSettingsService.class, masterNode); + assertBusy(() -> assertTrue(masterFileSettingsService.watching())); + + ensureStableCluster(2); + + testHealthIndicatorOnError(masterNode, healthNode); + } + + /** + * {@code masterNode} and {@code healthNode} can be the same node. + */ + private void testHealthIndicatorOnError(String masterNode, String healthNode) throws Exception { + logger.info("--> ensure all is well before the error"); + assertBusy(() -> { + FetchHealthInfoCacheAction.Response healthNodeResponse = client().execute( + FetchHealthInfoCacheAction.INSTANCE, + new FetchHealthInfoCacheAction.Request() + ).get(); + assertEquals(0, healthNodeResponse.getHealthInfo().fileSettingsHealthInfo().failureStreak()); + }); + + logger.info("--> induce an error and wait for it to be processed"); + var savedClusterState = setupClusterStateListenerForError(masterNode); + writeJSONFile(masterNode, testErrorJSON, logger, versionCounter.incrementAndGet()); + boolean awaitSuccessful = savedClusterState.v1().await(20, TimeUnit.SECONDS); + assertTrue(awaitSuccessful); + + logger.info("--> ensure the health node also reports it"); + assertBusy(() -> { + FetchHealthInfoCacheAction.Response healthNodeResponse = client().execute( + FetchHealthInfoCacheAction.INSTANCE, + new FetchHealthInfoCacheAction.Request() + ).get(); + assertEquals( + "Cached info on health node should report one failure", + 1, + healthNodeResponse.getHealthInfo().fileSettingsHealthInfo().failureStreak() + ); + + for (var node : Stream.of(masterNode, healthNode).distinct().toList()) { + GetHealthAction.Response getHealthResponse = client(node).execute( + GetHealthAction.INSTANCE, + new GetHealthAction.Request(false, 123) + ).get(); + assertEquals( + "Health should be yellow on node " + node, + YELLOW, + getHealthResponse.findIndicator(FileSettingsService.FileSettingsHealthIndicatorService.NAME).status() + ); + } + }); + } + private void assertHasErrors(AtomicLong waitForMetadataVersion, String expectedError) { var errorMetadata = getErrorMetadata(waitForMetadataVersion); assertThat(errorMetadata, is(notNullValue())); diff --git a/server/src/main/java/org/elasticsearch/TransportVersions.java b/server/src/main/java/org/elasticsearch/TransportVersions.java index 8ba8240283e6b..940b307cc710c 100644 --- a/server/src/main/java/org/elasticsearch/TransportVersions.java +++ b/server/src/main/java/org/elasticsearch/TransportVersions.java @@ -241,6 +241,7 @@ static TransportVersion def(int id) { public static final TransportVersion ML_INFERENCE_SAGEMAKER = def(9_069_0_00); public static final TransportVersion WRITE_LOAD_INCLUDES_BUFFER_WRITES = def(9_070_00_0); public static final TransportVersion INTRODUCE_FAILURES_DEFAULT_RETENTION = def(9_071_0_00); + public static final TransportVersion FILE_SETTINGS_HEALTH_INFO = def(9_072_0_00); /* * STOP! READ THIS FIRST! No, really, diff --git a/server/src/main/java/org/elasticsearch/health/node/HealthInfo.java b/server/src/main/java/org/elasticsearch/health/node/HealthInfo.java index 03f9ac803bafe..005a2d12acff6 100644 --- a/server/src/main/java/org/elasticsearch/health/node/HealthInfo.java +++ b/server/src/main/java/org/elasticsearch/health/node/HealthInfo.java @@ -14,25 +14,35 @@ import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.core.Nullable; +import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthInfo; import java.io.IOException; import java.util.Map; +import static java.util.Objects.requireNonNull; import static org.elasticsearch.health.node.DataStreamLifecycleHealthInfo.NO_DSL_ERRORS; +import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthInfo.INDETERMINATE; /** * This class wraps all the data returned by the health node. - * @param diskInfoByNode A Map of node id to DiskHealthInfo for that node - * @param dslHealthInfo The data stream lifecycle health information + * + * @param diskInfoByNode A Map of node id to DiskHealthInfo for that node + * @param dslHealthInfo The data stream lifecycle health information * @param repositoriesInfoByNode A Map of node id to RepositoriesHealthInfo for that node + * @param fileSettingsHealthInfo The file-based settings health information */ public record HealthInfo( Map diskInfoByNode, @Nullable DataStreamLifecycleHealthInfo dslHealthInfo, - Map repositoriesInfoByNode + Map repositoriesInfoByNode, + FileSettingsHealthInfo fileSettingsHealthInfo ) implements Writeable { - public static final HealthInfo EMPTY_HEALTH_INFO = new HealthInfo(Map.of(), NO_DSL_ERRORS, Map.of()); + public static final HealthInfo EMPTY_HEALTH_INFO = new HealthInfo(Map.of(), NO_DSL_ERRORS, Map.of(), INDETERMINATE); + + public HealthInfo { + requireNonNull(fileSettingsHealthInfo); + } public HealthInfo(StreamInput input) throws IOException { this( @@ -40,7 +50,10 @@ public HealthInfo(StreamInput input) throws IOException { input.getTransportVersion().onOrAfter(TransportVersions.V_8_12_0) ? input.readOptionalWriteable(DataStreamLifecycleHealthInfo::new) : null, - input.getTransportVersion().onOrAfter(TransportVersions.V_8_13_0) ? input.readMap(RepositoriesHealthInfo::new) : Map.of() + input.getTransportVersion().onOrAfter(TransportVersions.V_8_13_0) ? input.readMap(RepositoriesHealthInfo::new) : Map.of(), + input.getTransportVersion().onOrAfter(TransportVersions.FILE_SETTINGS_HEALTH_INFO) + ? input.readOptionalWriteable(FileSettingsHealthInfo::new) + : INDETERMINATE ); } @@ -53,5 +66,8 @@ public void writeTo(StreamOutput output) throws IOException { if (output.getTransportVersion().onOrAfter(TransportVersions.V_8_13_0)) { output.writeMap(repositoriesInfoByNode, StreamOutput::writeWriteable); } + if (output.getTransportVersion().onOrAfter(TransportVersions.FILE_SETTINGS_HEALTH_INFO)) { + output.writeOptionalWriteable(fileSettingsHealthInfo); + } } } diff --git a/server/src/main/java/org/elasticsearch/health/node/HealthInfoCache.java b/server/src/main/java/org/elasticsearch/health/node/HealthInfoCache.java index 2874e53608b82..6e1aa2db724b9 100644 --- a/server/src/main/java/org/elasticsearch/health/node/HealthInfoCache.java +++ b/server/src/main/java/org/elasticsearch/health/node/HealthInfoCache.java @@ -17,10 +17,13 @@ import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.core.Nullable; import org.elasticsearch.health.node.selection.HealthNode; +import org.elasticsearch.reservedstate.service.FileSettingsService; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthInfo.INDETERMINATE; + /** * Keeps track of several health statuses per node that can be used in health. */ @@ -31,6 +34,7 @@ public class HealthInfoCache implements ClusterStateListener { @Nullable private volatile DataStreamLifecycleHealthInfo dslHealthInfo = null; private volatile ConcurrentHashMap repositoriesInfoByNode = new ConcurrentHashMap<>(); + private volatile FileSettingsService.FileSettingsHealthInfo fileSettingsHealthInfo = INDETERMINATE; private HealthInfoCache() {} @@ -44,7 +48,8 @@ public void updateNodeHealth( String nodeId, @Nullable DiskHealthInfo diskHealthInfo, @Nullable DataStreamLifecycleHealthInfo latestDslHealthInfo, - @Nullable RepositoriesHealthInfo repositoriesHealthInfo + @Nullable RepositoriesHealthInfo repositoriesHealthInfo, + @Nullable FileSettingsService.FileSettingsHealthInfo fileSettingsHealthInfo ) { if (diskHealthInfo != null) { diskInfoByNode.put(nodeId, diskHealthInfo); @@ -55,6 +60,9 @@ public void updateNodeHealth( if (repositoriesHealthInfo != null) { repositoriesInfoByNode.put(nodeId, repositoriesHealthInfo); } + if (fileSettingsHealthInfo != null) { + this.fileSettingsHealthInfo = fileSettingsHealthInfo; + } } @Override @@ -77,6 +85,7 @@ public void clusterChanged(ClusterChangedEvent event) { diskInfoByNode = new ConcurrentHashMap<>(); dslHealthInfo = null; repositoriesInfoByNode = new ConcurrentHashMap<>(); + fileSettingsHealthInfo = INDETERMINATE; } } @@ -86,6 +95,6 @@ public void clusterChanged(ClusterChangedEvent event) { */ public HealthInfo getHealthInfo() { // A shallow copy is enough because the inner data is immutable. - return new HealthInfo(Map.copyOf(diskInfoByNode), dslHealthInfo, Map.copyOf(repositoriesInfoByNode)); + return new HealthInfo(Map.copyOf(diskInfoByNode), dslHealthInfo, Map.copyOf(repositoriesInfoByNode), fileSettingsHealthInfo); } } diff --git a/server/src/main/java/org/elasticsearch/health/node/UpdateHealthInfoCacheAction.java b/server/src/main/java/org/elasticsearch/health/node/UpdateHealthInfoCacheAction.java index 60151521d5629..2547eadc03409 100644 --- a/server/src/main/java/org/elasticsearch/health/node/UpdateHealthInfoCacheAction.java +++ b/server/src/main/java/org/elasticsearch/health/node/UpdateHealthInfoCacheAction.java @@ -23,6 +23,9 @@ import org.elasticsearch.health.node.action.HealthNodeRequest; import org.elasticsearch.health.node.action.TransportHealthNodeAction; import org.elasticsearch.injection.guice.Inject; +import org.elasticsearch.logging.LogManager; +import org.elasticsearch.logging.Logger; +import org.elasticsearch.reservedstate.service.FileSettingsService; import org.elasticsearch.tasks.Task; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.transport.TransportService; @@ -37,6 +40,7 @@ * regarding this node. */ public class UpdateHealthInfoCacheAction extends ActionType { + private static final Logger logger = LogManager.getLogger(UpdateHealthInfoCacheAction.class); public static class Request extends HealthNodeRequest { private final String nodeId; @@ -46,17 +50,21 @@ public static class Request extends HealthNodeRequest { private final DataStreamLifecycleHealthInfo dslHealthInfo; @Nullable private final RepositoriesHealthInfo repositoriesHealthInfo; + @Nullable + private final FileSettingsService.FileSettingsHealthInfo fileSettingsHealthInfo; public Request( String nodeId, DiskHealthInfo diskHealthInfo, DataStreamLifecycleHealthInfo dslHealthInfo, - RepositoriesHealthInfo repositoriesHealthInfo + RepositoriesHealthInfo repositoriesHealthInfo, + @Nullable FileSettingsService.FileSettingsHealthInfo fileSettingsHealthInfo ) { this.nodeId = nodeId; this.diskHealthInfo = diskHealthInfo; this.dslHealthInfo = dslHealthInfo; this.repositoriesHealthInfo = repositoriesHealthInfo; + this.fileSettingsHealthInfo = fileSettingsHealthInfo; } public Request(String nodeId, DataStreamLifecycleHealthInfo dslHealthInfo) { @@ -64,6 +72,15 @@ public Request(String nodeId, DataStreamLifecycleHealthInfo dslHealthInfo) { this.diskHealthInfo = null; this.repositoriesHealthInfo = null; this.dslHealthInfo = dslHealthInfo; + this.fileSettingsHealthInfo = null; + } + + public Request(String nodeId, FileSettingsService.FileSettingsHealthInfo info) { + this.nodeId = nodeId; + this.diskHealthInfo = null; + this.repositoriesHealthInfo = null; + this.dslHealthInfo = null; + this.fileSettingsHealthInfo = info; } public Request(StreamInput in) throws IOException { @@ -75,6 +92,9 @@ public Request(StreamInput in) throws IOException { this.repositoriesHealthInfo = in.getTransportVersion().onOrAfter(TransportVersions.V_8_13_0) ? in.readOptionalWriteable(RepositoriesHealthInfo::new) : null; + this.fileSettingsHealthInfo = in.getTransportVersion().onOrAfter(TransportVersions.FILE_SETTINGS_HEALTH_INFO) + ? in.readOptionalWriteable(FileSettingsService.FileSettingsHealthInfo::new) + : null; } else { // BWC for pre-8.12 the disk health info was mandatory. Evolving this request has proven tricky however we've made use of // waiting for all nodes to be on the {@link TransportVersions.HEALTH_INFO_ENRICHED_WITH_DSL_STATUS} transport version @@ -83,6 +103,7 @@ public Request(StreamInput in) throws IOException { this.diskHealthInfo = new DiskHealthInfo(in); this.dslHealthInfo = null; this.repositoriesHealthInfo = null; + this.fileSettingsHealthInfo = null; } } @@ -102,6 +123,11 @@ public RepositoriesHealthInfo getRepositoriesHealthInfo() { return repositoriesHealthInfo; } + @Nullable + public FileSettingsService.FileSettingsHealthInfo getFileSettingsHealthInfo() { + return fileSettingsHealthInfo; + } + @Override public ActionRequestValidationException validate() { return null; @@ -117,6 +143,9 @@ public void writeTo(StreamOutput out) throws IOException { if (out.getTransportVersion().onOrAfter(TransportVersions.V_8_13_0)) { out.writeOptionalWriteable(repositoriesHealthInfo); } + if (out.getTransportVersion().onOrAfter(TransportVersions.FILE_SETTINGS_HEALTH_INFO)) { + out.writeOptionalWriteable(fileSettingsHealthInfo); + } } else { // BWC for pre-8.12 the disk health info was mandatory. Evolving this request has proven tricky however we've made use of // waiting for all nodes to be on the {@link TransportVersions.V_8_12_0} transport version @@ -185,7 +214,7 @@ public Builder dslHealthInfo(DataStreamLifecycleHealthInfo dslHealthInfo) { } public Request build() { - return new Request(nodeId, diskHealthInfo, dslHealthInfo, repositoriesHealthInfo); + return new Request(nodeId, diskHealthInfo, dslHealthInfo, repositoriesHealthInfo, null); } } } @@ -228,13 +257,21 @@ protected void healthOperation( ClusterState clusterState, ActionListener listener ) { + logger.debug( + "Updating health info cache on node [{}][{}] from node [{}]", + clusterService.getNodeName(), + clusterService.localNode().getId(), + request.getNodeId() + ); nodeHealthOverview.updateNodeHealth( request.getNodeId(), request.getDiskHealthInfo(), request.getDslHealthInfo(), - request.getRepositoriesHealthInfo() + request.getRepositoriesHealthInfo(), + request.getFileSettingsHealthInfo() ); listener.onResponse(AcknowledgedResponse.of(true)); } } + } diff --git a/server/src/main/java/org/elasticsearch/node/NodeConstruction.java b/server/src/main/java/org/elasticsearch/node/NodeConstruction.java index 284fbd845b922..eb4f2c2543475 100644 --- a/server/src/main/java/org/elasticsearch/node/NodeConstruction.java +++ b/server/src/main/java/org/elasticsearch/node/NodeConstruction.java @@ -193,6 +193,7 @@ import org.elasticsearch.reservedstate.action.ReservedClusterSettingsAction; import org.elasticsearch.reservedstate.service.FileSettingsService; import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService; +import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthTracker; import org.elasticsearch.reservedstate.service.FileSettingsServiceProvider; import org.elasticsearch.rest.action.search.SearchResponseMetrics; import org.elasticsearch.script.ScriptModule; @@ -1111,11 +1112,12 @@ public Map queryFields() { actionModule.getReservedClusterStateService().installClusterStateHandler(new ReservedRepositoryAction(repositoriesService)); actionModule.getReservedClusterStateService().installProjectStateHandler(new ReservedPipelineAction()); - FileSettingsHealthIndicatorService fileSettingsHealthIndicatorService = new FileSettingsHealthIndicatorService(settings); + var fileSettingsHealthIndicatorPublisher = new FileSettingsService.FileSettingsHealthIndicatorPublisherImpl(clusterService, client); + var fileSettingsHealthTracker = new FileSettingsHealthTracker(settings, fileSettingsHealthIndicatorPublisher); FileSettingsService fileSettingsService = pluginsService.loadSingletonServiceProvider( FileSettingsServiceProvider.class, () -> FileSettingsService::new - ).construct(clusterService, actionModule.getReservedClusterStateService(), environment, fileSettingsHealthIndicatorService); + ).construct(clusterService, actionModule.getReservedClusterStateService(), environment, fileSettingsHealthTracker); RestoreService restoreService = new RestoreService( clusterService, @@ -1199,8 +1201,7 @@ public Map queryFields() { transportService, threadPool, telemetryProvider, - repositoriesService, - fileSettingsHealthIndicatorService + repositoriesService ) ); @@ -1372,8 +1373,7 @@ private Module loadDiagnosticServices( TransportService transportService, ThreadPool threadPool, TelemetryProvider telemetryProvider, - RepositoriesService repositoriesService, - FileSettingsHealthIndicatorService fileSettingsHealthIndicatorService + RepositoriesService repositoriesService ) { MasterHistoryService masterHistoryService = new MasterHistoryService(transportService, threadPool, clusterService); @@ -1389,7 +1389,7 @@ private Module loadDiagnosticServices( new RepositoryIntegrityHealthIndicatorService(clusterService), new DiskHealthIndicatorService(clusterService), new ShardsCapacityHealthIndicatorService(clusterService), - fileSettingsHealthIndicatorService + new FileSettingsHealthIndicatorService() ); var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class) .flatMap(plugin -> plugin.getHealthIndicatorServices().stream()); diff --git a/server/src/main/java/org/elasticsearch/reservedstate/service/FileSettingsHealthIndicatorPublisher.java b/server/src/main/java/org/elasticsearch/reservedstate/service/FileSettingsHealthIndicatorPublisher.java new file mode 100644 index 0000000000000..888445791f4b3 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/reservedstate/service/FileSettingsHealthIndicatorPublisher.java @@ -0,0 +1,21 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.reservedstate.service; + +import org.elasticsearch.action.ActionListener; +import org.elasticsearch.action.support.master.AcknowledgedResponse; +import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthTracker; + +/** + * Used by {@link FileSettingsHealthTracker} to send health info to the health node. + */ +public interface FileSettingsHealthIndicatorPublisher { + void publish(FileSettingsService.FileSettingsHealthInfo info, ActionListener actionListener); +} diff --git a/server/src/main/java/org/elasticsearch/reservedstate/service/FileSettingsService.java b/server/src/main/java/org/elasticsearch/reservedstate/service/FileSettingsService.java index 6f736fd0209c4..8de1979a1dbd8 100644 --- a/server/src/main/java/org/elasticsearch/reservedstate/service/FileSettingsService.java +++ b/server/src/main/java/org/elasticsearch/reservedstate/service/FileSettingsService.java @@ -11,17 +11,24 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.ActionResponse; import org.elasticsearch.action.support.PlainActionFuture; +import org.elasticsearch.action.support.master.AcknowledgedResponse; +import org.elasticsearch.client.internal.Client; import org.elasticsearch.cluster.ClusterState; import org.elasticsearch.cluster.ClusterStateListener; import org.elasticsearch.cluster.NotMasterException; import org.elasticsearch.cluster.coordination.FailedToCommitClusterStateException; import org.elasticsearch.cluster.metadata.Metadata; import org.elasticsearch.cluster.metadata.ReservedStateMetadata; +import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Strings; import org.elasticsearch.common.file.MasterNodeFileWatchingService; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.env.Environment; @@ -31,6 +38,8 @@ import org.elasticsearch.health.HealthIndicatorService; import org.elasticsearch.health.SimpleHealthIndicatorDetails; import org.elasticsearch.health.node.HealthInfo; +import org.elasticsearch.health.node.UpdateHealthInfoCacheAction; +import org.elasticsearch.health.node.selection.HealthNode; import org.elasticsearch.xcontent.XContentParseException; import org.elasticsearch.xcontent.XContentParser; import org.elasticsearch.xcontent.XContentParserConfiguration; @@ -75,7 +84,7 @@ public class FileSettingsService extends MasterNodeFileWatchingService implement private final Path watchedFile; private final ReservedClusterStateService stateService; - private final FileSettingsHealthIndicatorService healthIndicatorService; + private final FileSettingsHealthTracker healthIndicatorTracker; /** * Constructs the {@link FileSettingsService} @@ -83,19 +92,19 @@ public class FileSettingsService extends MasterNodeFileWatchingService implement * @param clusterService so we can register ourselves as a cluster state change listener * @param stateService an instance of the immutable cluster state controller, so we can perform the cluster state changes * @param environment we need the environment to pull the location of the config and operator directories - * @param healthIndicatorService tracks the success or failure of file-based settings + * @param healthIndicatorTracker tracks the success or failure of file-based settings operations */ @SuppressWarnings("this-escape") public FileSettingsService( ClusterService clusterService, ReservedClusterStateService stateService, Environment environment, - FileSettingsHealthIndicatorService healthIndicatorService + FileSettingsHealthTracker healthIndicatorTracker ) { super(clusterService, environment.configDir().toAbsolutePath().resolve(OPERATOR_DIRECTORY)); this.watchedFile = watchedFileDir().resolve(SETTINGS_FILE_NAME); this.stateService = stateService; - this.healthIndicatorService = healthIndicatorService; + this.healthIndicatorTracker = healthIndicatorTracker; } protected Logger logger() { @@ -106,10 +115,6 @@ public Path watchedFile() { return watchedFile; } - public FileSettingsHealthIndicatorService healthIndicatorService() { - return healthIndicatorService; - } - /** * Used by snapshot restore service {@link org.elasticsearch.snapshots.RestoreService} to prepare the reserved * state of the snapshot for the current cluster. @@ -143,14 +148,14 @@ public void handleSnapshotRestore(ClusterState clusterState, Metadata.Builder md @Override protected void doStart() { - healthIndicatorService.startOccurred(); + healthIndicatorTracker.startOccurred(); super.doStart(); } @Override protected void doStop() { super.doStop(); - healthIndicatorService.stopOccurred(); + healthIndicatorTracker.stopOccurred(); } /** @@ -193,7 +198,7 @@ protected void processFile(Path file, boolean startup) throws IOException, Execu logger().debug("Received notification for unknown file {}", file); } else { logger().info("processing path [{}] for [{}]{}", watchedFile, NAMESPACE, startup ? " on service start" : ""); - healthIndicatorService.changeOccurred(); + healthIndicatorTracker.changeOccurred(); processFileChanges(startup ? HIGHER_OR_SAME_VERSION : HIGHER_VERSION_ONLY); } } @@ -211,12 +216,17 @@ private void processFileChanges(ReservedStateVersionCheck versionCheck) throws I } protected void completeProcessing(Exception e, PlainActionFuture completion) { - if (e != null) { - healthIndicatorService.failureOccurred(e.toString()); - completion.onFailure(e); - } else { - completion.onResponse(null); - healthIndicatorService.successOccurred(); + try { + if (e != null) { + healthIndicatorTracker.failureOccurred(e.toString()); + completion.onFailure(e); + } else { + completion.onResponse(null); + healthIndicatorTracker.successOccurred(); + } + } finally { + logger().debug("Publishing to health node"); + healthIndicatorTracker.publish(); } } @@ -247,6 +257,52 @@ protected void processInitialFilesMissing() throws ExecutionException, Interrupt completion.get(); } + public record FileSettingsHealthInfo(boolean isActive, long changeCount, long failureStreak, String mostRecentFailure) + implements + Writeable { + + /** + * Indicates that no conclusions can be drawn about the health status. + */ + public static final FileSettingsHealthInfo INDETERMINATE = new FileSettingsHealthInfo(false, 0L, 0, null); + + /** + * Indicates that the health info system is active and no changes have occurred yet, so all is well. + */ + public static final FileSettingsHealthInfo INITIAL_ACTIVE = new FileSettingsHealthInfo(true, 0L, 0, null); + + public FileSettingsHealthInfo(StreamInput in) throws IOException { + this(in.readBoolean(), in.readVLong(), in.readVLong(), in.readOptionalString()); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeBoolean(isActive); + out.writeVLong(changeCount); + out.writeVLong(failureStreak); + out.writeOptionalString(mostRecentFailure); + } + + public FileSettingsHealthInfo inactive() { + return new FileSettingsHealthInfo(false, changeCount, failureStreak, mostRecentFailure); + } + + public FileSettingsHealthInfo changed() { + return new FileSettingsHealthInfo(isActive, changeCount + 1, failureStreak, mostRecentFailure); + } + + public FileSettingsHealthInfo successful() { + return new FileSettingsHealthInfo(isActive, changeCount, 0, null); + } + + public FileSettingsHealthInfo failed(String failureDescription) { + return new FileSettingsHealthInfo(isActive, changeCount, failureStreak + 1, failureDescription); + } + } + + /** + * Stateless service that maps a {@link FileSettingsHealthInfo} to a {@link HealthIndicatorResult}. + */ public static class FileSettingsHealthIndicatorService implements HealthIndicatorService { static final String NAME = "file_settings"; static final String INACTIVE_SYMPTOM = "File-based settings are inactive"; @@ -264,6 +320,43 @@ public static class FileSettingsHealthIndicatorService implements HealthIndicato ) ); + @Override + public String name() { + return NAME; + } + + @Override + public synchronized HealthIndicatorResult calculate(boolean verbose, int maxAffectedResourcesCount, HealthInfo healthInfo) { + return calculate(healthInfo.fileSettingsHealthInfo()); + } + + public HealthIndicatorResult calculate(FileSettingsHealthInfo info) { + if (info.isActive() == false) { + return createIndicator(GREEN, INACTIVE_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()); + } + if (0 == info.changeCount()) { + return createIndicator(GREEN, NO_CHANGES_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()); + } + if (0 == info.failureStreak()) { + return createIndicator(GREEN, SUCCESS_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()); + } else { + return createIndicator( + YELLOW, + FAILURE_SYMPTOM, + new SimpleHealthIndicatorDetails( + Map.of("failure_streak", info.failureStreak(), "most_recent_failure", info.mostRecentFailure()) + ), + STALE_SETTINGS_IMPACT, + List.of() + ); + } + } + } + + /** + * Houses the current {@link FileSettingsHealthInfo} and provides a means to publish it to the health node. + */ + public static class FileSettingsHealthTracker { /** * We want a length limit so we don't blow past the indexing limit in the case of a long description string. * This is an {@code OperatorDynamic} setting so that if the truncation hampers troubleshooting efforts, @@ -278,37 +371,36 @@ public static class FileSettingsHealthIndicatorService implements HealthIndicato ); private final Settings settings; - private boolean isActive = false; - private long changeCount = 0; - private long failureStreak = 0; - private String mostRecentFailure = null; + private final FileSettingsHealthIndicatorPublisher publisher; + private FileSettingsHealthInfo currentInfo = FileSettingsHealthInfo.INDETERMINATE; - public FileSettingsHealthIndicatorService(Settings settings) { + public FileSettingsHealthTracker(Settings settings, FileSettingsHealthIndicatorPublisher publisher) { this.settings = settings; + this.publisher = publisher; + } + + public FileSettingsHealthInfo getCurrentInfo() { + return currentInfo; } public synchronized void startOccurred() { - isActive = true; - failureStreak = 0; + currentInfo = FileSettingsHealthInfo.INITIAL_ACTIVE; } public synchronized void stopOccurred() { - isActive = false; - mostRecentFailure = null; + currentInfo = currentInfo.inactive(); } public synchronized void changeOccurred() { - ++changeCount; + currentInfo = currentInfo.changed(); } public synchronized void successOccurred() { - failureStreak = 0; - mostRecentFailure = null; + currentInfo = currentInfo.successful(); } public synchronized void failureOccurred(String description) { - ++failureStreak; - mostRecentFailure = limitLength(description); + currentInfo = currentInfo.failed(limitLength(description)); } private String limitLength(String description) { @@ -320,28 +412,43 @@ private String limitLength(String description) { } } - @Override - public String name() { - return NAME; + /** + * Sends the current health info to the health node. + */ + public void publish() { + publisher.publish( + currentInfo, + ActionListener.wrap( + r -> logger.debug("Successfully published health indicator"), + e -> logger.warn("Failed to publish health indicator", e) + ) + ); } + } - @Override - public synchronized HealthIndicatorResult calculate(boolean verbose, int maxAffectedResourcesCount, HealthInfo healthInfo) { - if (isActive == false) { - return createIndicator(GREEN, INACTIVE_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()); - } - if (0 == changeCount) { - return createIndicator(GREEN, NO_CHANGES_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()); - } - if (0 == failureStreak) { - return createIndicator(GREEN, SUCCESS_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()); + public static class FileSettingsHealthIndicatorPublisherImpl implements FileSettingsHealthIndicatorPublisher { + private final ClusterService clusterService; + private final Client client; + + public FileSettingsHealthIndicatorPublisherImpl(ClusterService clusterService, Client client) { + this.clusterService = clusterService; + this.client = client; + } + + public void publish(FileSettingsHealthInfo info, ActionListener actionListener) { + DiscoveryNode currentHealthNode = HealthNode.findHealthNode(clusterService.state()); + if (currentHealthNode == null) { + logger.debug( + "Unable to report file settings health because there is no health node in the cluster;" + + " will retry next time file settings health changes." + ); } else { - return createIndicator( - YELLOW, - FAILURE_SYMPTOM, - new SimpleHealthIndicatorDetails(Map.of("failure_streak", failureStreak, "most_recent_failure", mostRecentFailure)), - STALE_SETTINGS_IMPACT, - List.of() + logger.debug("Publishing file settings health indicators: [{}]", info); + String localNode = clusterService.localNode().getId(); + client.execute( + UpdateHealthInfoCacheAction.INSTANCE, + new UpdateHealthInfoCacheAction.Request(localNode, info), + actionListener ); } } diff --git a/server/src/main/java/org/elasticsearch/reservedstate/service/FileSettingsServiceProvider.java b/server/src/main/java/org/elasticsearch/reservedstate/service/FileSettingsServiceProvider.java index 99cc4ffa7e2c6..b6a149f59a5a3 100644 --- a/server/src/main/java/org/elasticsearch/reservedstate/service/FileSettingsServiceProvider.java +++ b/server/src/main/java/org/elasticsearch/reservedstate/service/FileSettingsServiceProvider.java @@ -17,6 +17,6 @@ FileSettingsService construct( ClusterService clusterService, ReservedClusterStateService stateService, Environment environment, - FileSettingsService.FileSettingsHealthIndicatorService healthIndicatorService + FileSettingsService.FileSettingsHealthTracker healthTracker ); } diff --git a/server/src/test/java/org/elasticsearch/health/HealthServiceTests.java b/server/src/test/java/org/elasticsearch/health/HealthServiceTests.java index c10e8a485b130..492a5e0434fb3 100644 --- a/server/src/test/java/org/elasticsearch/health/HealthServiceTests.java +++ b/server/src/test/java/org/elasticsearch/health/HealthServiceTests.java @@ -16,6 +16,7 @@ import org.elasticsearch.health.node.DataStreamLifecycleHealthInfo; import org.elasticsearch.health.node.FetchHealthInfoCacheAction; import org.elasticsearch.health.node.HealthInfo; +import org.elasticsearch.reservedstate.service.FileSettingsService; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.threadpool.TestThreadPool; import org.elasticsearch.threadpool.ThreadPool; @@ -251,7 +252,12 @@ public void testThatIndicatorsGetHealthInfoData() throws Exception { var shardsAvailable = new HealthIndicatorResult("shards_availability", GREEN, null, null, null, null); var diskHealthInfoMap = randomMap(1, 1, () -> tuple(randomAlphaOfLength(10), randomDiskHealthInfo())); var repoHealthInfoMap = randomMap(1, 1, () -> tuple(randomAlphaOfLength(10), randomRepoHealthInfo())); - HealthInfo healthInfo = new HealthInfo(diskHealthInfoMap, DataStreamLifecycleHealthInfo.NO_DSL_ERRORS, repoHealthInfoMap); + HealthInfo healthInfo = new HealthInfo( + diskHealthInfoMap, + DataStreamLifecycleHealthInfo.NO_DSL_ERRORS, + repoHealthInfoMap, + FileSettingsService.FileSettingsHealthInfo.INDETERMINATE + ); var service = new HealthService( // The preflight indicator does not get data because the data is not fetched until after the preflight check diff --git a/server/src/test/java/org/elasticsearch/health/node/DiskHealthIndicatorServiceTests.java b/server/src/test/java/org/elasticsearch/health/node/DiskHealthIndicatorServiceTests.java index b77f970d9e785..597cfaad380d1 100644 --- a/server/src/test/java/org/elasticsearch/health/node/DiskHealthIndicatorServiceTests.java +++ b/server/src/test/java/org/elasticsearch/health/node/DiskHealthIndicatorServiceTests.java @@ -36,6 +36,7 @@ import org.elasticsearch.health.ImpactArea; import org.elasticsearch.index.IndexVersion; import org.elasticsearch.index.shard.ShardId; +import org.elasticsearch.reservedstate.service.FileSettingsService; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.xcontent.ToXContent; import org.elasticsearch.xcontent.XContentBuilder; @@ -274,7 +275,12 @@ public void testRedNoBlockedIndicesAndRedAllRoleNodes() throws IOException { diskInfoByNode.put(discoveryNode.getId(), new DiskHealthInfo(HealthStatus.GREEN)); } } - HealthInfo healthInfo = new HealthInfo(diskInfoByNode, DataStreamLifecycleHealthInfo.NO_DSL_ERRORS, Map.of()); + HealthInfo healthInfo = new HealthInfo( + diskInfoByNode, + DataStreamLifecycleHealthInfo.NO_DSL_ERRORS, + Map.of(), + FileSettingsService.FileSettingsHealthInfo.INDETERMINATE + ); HealthIndicatorResult result = diskHealthIndicatorService.calculate(true, healthInfo); assertThat(result.status(), equalTo(HealthStatus.RED)); @@ -1050,7 +1056,12 @@ private HealthInfo createHealthInfo(List healthInfoConfigs) { diskInfoByNode.put(node.getId(), diskHealthInfo); } } - return new HealthInfo(diskInfoByNode, DataStreamLifecycleHealthInfo.NO_DSL_ERRORS, Map.of()); + return new HealthInfo( + diskInfoByNode, + DataStreamLifecycleHealthInfo.NO_DSL_ERRORS, + Map.of(), + FileSettingsService.FileSettingsHealthInfo.INDETERMINATE + ); } private static ClusterService createClusterService(Collection nodes, boolean withBlockedIndex) { diff --git a/server/src/test/java/org/elasticsearch/health/node/FetchHealthInfoCacheActionTests.java b/server/src/test/java/org/elasticsearch/health/node/FetchHealthInfoCacheActionTests.java index 6334264597437..308e69baf7bc4 100644 --- a/server/src/test/java/org/elasticsearch/health/node/FetchHealthInfoCacheActionTests.java +++ b/server/src/test/java/org/elasticsearch/health/node/FetchHealthInfoCacheActionTests.java @@ -18,6 +18,7 @@ import org.elasticsearch.cluster.node.DiscoveryNodeUtils; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.health.HealthStatus; +import org.elasticsearch.reservedstate.service.FileSettingsService; import org.elasticsearch.test.ESTestCase; import org.elasticsearch.test.EqualsHashCodeTestUtils; import org.elasticsearch.test.transport.CapturingTransport; @@ -130,7 +131,8 @@ private HealthInfoCache getTestHealthInfoCache() { nodeId, new DiskHealthInfo(randomFrom(HealthStatus.values()), randomFrom(DiskHealthInfo.Cause.values())), randomDslHealthInfo(), - randomRepoHealthInfo() + randomRepoHealthInfo(), + FileSettingsService.FileSettingsHealthInfo.INDETERMINATE ); } return healthInfoCache; diff --git a/server/src/test/java/org/elasticsearch/health/node/HealthInfoCacheTests.java b/server/src/test/java/org/elasticsearch/health/node/HealthInfoCacheTests.java index da28af0dd2825..b2ec1a6b92e53 100644 --- a/server/src/test/java/org/elasticsearch/health/node/HealthInfoCacheTests.java +++ b/server/src/test/java/org/elasticsearch/health/node/HealthInfoCacheTests.java @@ -17,6 +17,7 @@ import org.elasticsearch.cluster.node.DiscoveryNodeUtils; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.health.HealthStatus; +import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthInfo; import org.elasticsearch.test.ESTestCase; import java.util.Map; @@ -49,12 +50,12 @@ public void testAddHealthInfo() { HealthInfoCache healthInfoCache = HealthInfoCache.create(clusterService); DataStreamLifecycleHealthInfo latestDslHealthInfo = randomDslHealthInfo(); var repoHealthInfo = randomRepoHealthInfo(); - healthInfoCache.updateNodeHealth(node1.getId(), GREEN, latestDslHealthInfo, repoHealthInfo); - healthInfoCache.updateNodeHealth(node2.getId(), RED, null, null); + healthInfoCache.updateNodeHealth(node1.getId(), GREEN, latestDslHealthInfo, repoHealthInfo, FileSettingsHealthInfo.INDETERMINATE); + healthInfoCache.updateNodeHealth(node2.getId(), RED, null, null, FileSettingsHealthInfo.INDETERMINATE); Map diskHealthInfo = healthInfoCache.getHealthInfo().diskInfoByNode(); // Ensure that HealthInfoCache#getHealthInfo() returns a copy of the health info. - healthInfoCache.updateNodeHealth(node1.getId(), RED, null, null); + healthInfoCache.updateNodeHealth(node1.getId(), RED, null, null, FileSettingsHealthInfo.INDETERMINATE); assertThat(diskHealthInfo.get(node1.getId()), equalTo(GREEN)); assertThat(diskHealthInfo.get(node2.getId()), equalTo(RED)); @@ -64,10 +65,10 @@ public void testAddHealthInfo() { public void testRemoveNodeFromTheCluster() { HealthInfoCache healthInfoCache = HealthInfoCache.create(clusterService); - healthInfoCache.updateNodeHealth(node1.getId(), GREEN, null, null); + healthInfoCache.updateNodeHealth(node1.getId(), GREEN, null, null, FileSettingsHealthInfo.INDETERMINATE); DataStreamLifecycleHealthInfo latestDslHealthInfo = randomDslHealthInfo(); var repoHealthInfo = randomRepoHealthInfo(); - healthInfoCache.updateNodeHealth(node2.getId(), RED, latestDslHealthInfo, repoHealthInfo); + healthInfoCache.updateNodeHealth(node2.getId(), RED, latestDslHealthInfo, repoHealthInfo, FileSettingsHealthInfo.INDETERMINATE); ClusterState previous = ClusterStateCreationUtils.state(node1, node1, node1, allNodes); ClusterState current = ClusterStateCreationUtils.state(node1, node1, node1, new DiscoveryNode[] { node1 }); @@ -83,8 +84,14 @@ public void testRemoveNodeFromTheCluster() { public void testNotAHealthNode() { HealthInfoCache healthInfoCache = HealthInfoCache.create(clusterService); - healthInfoCache.updateNodeHealth(node1.getId(), GREEN, randomDslHealthInfo(), randomRepoHealthInfo()); - healthInfoCache.updateNodeHealth(node2.getId(), RED, null, null); + healthInfoCache.updateNodeHealth( + node1.getId(), + GREEN, + randomDslHealthInfo(), + randomRepoHealthInfo(), + FileSettingsHealthInfo.INDETERMINATE + ); + healthInfoCache.updateNodeHealth(node2.getId(), RED, null, null, FileSettingsHealthInfo.INDETERMINATE); ClusterState previous = ClusterStateCreationUtils.state(node1, node1, node1, allNodes); ClusterState current = ClusterStateCreationUtils.state(node1, node1, node2, allNodes); diff --git a/server/src/test/java/org/elasticsearch/health/node/HealthInfoTests.java b/server/src/test/java/org/elasticsearch/health/node/HealthInfoTests.java index 604e94d187022..c109909a5dbd9 100644 --- a/server/src/test/java/org/elasticsearch/health/node/HealthInfoTests.java +++ b/server/src/test/java/org/elasticsearch/health/node/HealthInfoTests.java @@ -11,7 +11,9 @@ import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.health.HealthStatus; +import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthInfo; import org.elasticsearch.test.AbstractWireSerializingTestCase; +import org.elasticsearch.test.ESTestCase; import java.util.HashMap; import java.util.Map; @@ -29,7 +31,12 @@ protected Writeable.Reader instanceReader() { protected HealthInfo createTestInstance() { var diskInfoByNode = randomMap(0, 10, () -> tuple(randomAlphaOfLength(10), randomDiskHealthInfo())); var repositoriesInfoByNode = randomMap(0, 10, () -> tuple(randomAlphaOfLength(10), randomRepoHealthInfo())); - return new HealthInfo(diskInfoByNode, randomBoolean() ? randomDslHealthInfo() : null, repositoriesInfoByNode); + return new HealthInfo( + diskInfoByNode, + randomBoolean() ? randomDslHealthInfo() : null, + repositoriesInfoByNode, + randomBoolean() ? FileSettingsHealthInfo.INDETERMINATE : mutateFileSettingsHealthInfo(FileSettingsHealthInfo.INDETERMINATE) + ); } @Override @@ -41,7 +48,8 @@ public static HealthInfo mutateHealthInfo(HealthInfo originalHealthInfo) { var diskHealth = originalHealthInfo.diskInfoByNode(); var dslHealth = originalHealthInfo.dslHealthInfo(); var repoHealth = originalHealthInfo.repositoriesInfoByNode(); - switch (randomInt(2)) { + var fsHealth = originalHealthInfo.fileSettingsHealthInfo(); + switch (randomInt(3)) { case 0 -> diskHealth = mutateMap( originalHealthInfo.diskInfoByNode(), () -> randomAlphaOfLength(10), @@ -53,8 +61,9 @@ public static HealthInfo mutateHealthInfo(HealthInfo originalHealthInfo) { () -> randomAlphaOfLength(10), HealthInfoTests::randomRepoHealthInfo ); + case 3 -> fsHealth = mutateFileSettingsHealthInfo(fsHealth); } - return new HealthInfo(diskHealth, dslHealth, repoHealth); + return new HealthInfo(diskHealth, dslHealth, repoHealth, fsHealth); } public static DiskHealthInfo randomDiskHealthInfo() { @@ -74,6 +83,18 @@ public static RepositoriesHealthInfo randomRepoHealthInfo() { return new RepositoriesHealthInfo(randomList(5, () -> randomAlphaOfLength(10)), randomList(5, () -> randomAlphaOfLength(10))); } + private static FileSettingsHealthInfo mutateFileSettingsHealthInfo(FileSettingsHealthInfo original) { + long changeCount = randomValueOtherThan(original.changeCount(), ESTestCase::randomNonNegativeLong); + long failureStreak = randomLongBetween(0, changeCount); + String mostRecentFailure; + if (failureStreak == 0) { + mostRecentFailure = null; + } else { + mostRecentFailure = "Random failure #" + randomIntBetween(1000, 9999); + } + return new FileSettingsHealthInfo(true, changeCount, failureStreak, mostRecentFailure); + } + /** * Mutates a {@link Map} by either adding, updating, or removing an entry. */ diff --git a/server/src/test/java/org/elasticsearch/reservedstate/service/FileSettingsHealthIndicatorServiceTests.java b/server/src/test/java/org/elasticsearch/reservedstate/service/FileSettingsHealthIndicatorServiceTests.java index e973073efb184..bed69769862c5 100644 --- a/server/src/test/java/org/elasticsearch/reservedstate/service/FileSettingsHealthIndicatorServiceTests.java +++ b/server/src/test/java/org/elasticsearch/reservedstate/service/FileSettingsHealthIndicatorServiceTests.java @@ -9,11 +9,11 @@ package org.elasticsearch.reservedstate.service; -import org.elasticsearch.common.settings.Settings; import org.elasticsearch.health.HealthIndicatorDetails; import org.elasticsearch.health.HealthIndicatorResult; import org.elasticsearch.health.SimpleHealthIndicatorDetails; import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService; +import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthInfo; import org.elasticsearch.test.ESTestCase; import org.junit.Before; @@ -22,109 +22,62 @@ import static org.elasticsearch.health.HealthStatus.GREEN; import static org.elasticsearch.health.HealthStatus.YELLOW; -import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService.DESCRIPTION_LENGTH_LIMIT_KEY; import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService.FAILURE_SYMPTOM; import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService.INACTIVE_SYMPTOM; +import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService.NAME; import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService.NO_CHANGES_SYMPTOM; import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService.STALE_SETTINGS_IMPACT; import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService.SUCCESS_SYMPTOM; +import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthInfo.INDETERMINATE; +import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthInfo.INITIAL_ACTIVE; /** - * Here, we test {@link FileSettingsHealthIndicatorService} in isolation; - * we do not test that {@link FileSettingsService} uses it correctly. + * Tests that {@link FileSettingsHealthIndicatorService} produces the right {@link HealthIndicatorResult} + * from a given {@link FileSettingsHealthInfo}. */ public class FileSettingsHealthIndicatorServiceTests extends ESTestCase { - FileSettingsHealthIndicatorService healthIndicatorService; @Before public void initialize() { - healthIndicatorService = new FileSettingsHealthIndicatorService(Settings.EMPTY); + healthIndicatorService = new FileSettingsHealthIndicatorService(); } - public void testInitiallyGreen() {} - - public void testStartAndStop() { - assertEquals( - new HealthIndicatorResult("file_settings", GREEN, INACTIVE_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()), - healthIndicatorService.calculate(false, null) - ); - healthIndicatorService.startOccurred(); - assertEquals( - new HealthIndicatorResult("file_settings", GREEN, NO_CHANGES_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()), - healthIndicatorService.calculate(false, null) - ); - healthIndicatorService.stopOccurred(); + public void testGreenWhenInactive() { + var expected = new HealthIndicatorResult(NAME, GREEN, INACTIVE_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()); + assertEquals(expected, healthIndicatorService.calculate(INDETERMINATE)); + assertEquals(expected, healthIndicatorService.calculate(new FileSettingsHealthInfo(false, 0, 0, null))); assertEquals( - new HealthIndicatorResult("file_settings", GREEN, INACTIVE_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()), - healthIndicatorService.calculate(false, null) + "Inactive should be GREEN regardless of other fields", + expected, + healthIndicatorService.calculate(new FileSettingsHealthInfo(false, 123, 123, "test")) ); } - public void testGreenYellowYellowGreen() { - healthIndicatorService.startOccurred(); - healthIndicatorService.changeOccurred(); - // This is a strange case: a change occurred, but neither success nor failure have been reported yet. - // While the change is still in progress, we don't change the status. - assertEquals( - new HealthIndicatorResult("file_settings", GREEN, SUCCESS_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()), - healthIndicatorService.calculate(false, null) - ); - - healthIndicatorService.failureOccurred("whoopsie 1"); - assertEquals( - new HealthIndicatorResult( - "file_settings", - YELLOW, - FAILURE_SYMPTOM, - new SimpleHealthIndicatorDetails(Map.of("failure_streak", 1L, "most_recent_failure", "whoopsie 1")), - STALE_SETTINGS_IMPACT, - List.of() - ), - healthIndicatorService.calculate(false, null) - ); - - healthIndicatorService.failureOccurred("whoopsie #2"); - assertEquals( - new HealthIndicatorResult( - "file_settings", - YELLOW, - FAILURE_SYMPTOM, - new SimpleHealthIndicatorDetails(Map.of("failure_streak", 2L, "most_recent_failure", "whoopsie #2")), - STALE_SETTINGS_IMPACT, - List.of() - ), - healthIndicatorService.calculate(false, null) - ); + public void testNoChangesYet() { + var expected = new HealthIndicatorResult(NAME, GREEN, NO_CHANGES_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()); + assertEquals(expected, healthIndicatorService.calculate(INITIAL_ACTIVE)); + assertEquals(expected, healthIndicatorService.calculate(new FileSettingsHealthInfo(true, 0, 0, null))); + } - healthIndicatorService.successOccurred(); - assertEquals( - new HealthIndicatorResult("file_settings", GREEN, SUCCESS_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()), - healthIndicatorService.calculate(false, null) - ); + public void testSuccess() { + var expected = new HealthIndicatorResult(NAME, GREEN, SUCCESS_SYMPTOM, HealthIndicatorDetails.EMPTY, List.of(), List.of()); + assertEquals(expected, healthIndicatorService.calculate(INITIAL_ACTIVE.changed())); + assertEquals(expected, healthIndicatorService.calculate(INITIAL_ACTIVE.changed().successful())); + assertEquals(expected, healthIndicatorService.calculate(INITIAL_ACTIVE.changed().failed("whoops").successful())); } - public void testDescriptionIsTruncated() { - checkTruncatedDescription(9, "123456789", "123456789"); - checkTruncatedDescription(8, "123456789", "1234567…"); - checkTruncatedDescription(1, "12", "…"); + public void testFailure() { + var info = INITIAL_ACTIVE.changed().failed("whoops"); + assertEquals(yellow(1, "whoops"), healthIndicatorService.calculate(info)); + info = info.failed("whoops again"); + assertEquals(yellow(2, "whoops again"), healthIndicatorService.calculate(info)); + info = info.successful().failed("uh oh"); + assertEquals(yellow(1, "uh oh"), healthIndicatorService.calculate(info)); } - private void checkTruncatedDescription(int lengthLimit, String description, String expectedTruncatedDescription) { - var service = new FileSettingsHealthIndicatorService(Settings.builder().put(DESCRIPTION_LENGTH_LIMIT_KEY, lengthLimit).build()); - service.startOccurred(); - service.changeOccurred(); - service.failureOccurred(description); - assertEquals( - new HealthIndicatorResult( - "file_settings", - YELLOW, - FAILURE_SYMPTOM, - new SimpleHealthIndicatorDetails(Map.of("failure_streak", 1L, "most_recent_failure", expectedTruncatedDescription)), - STALE_SETTINGS_IMPACT, - List.of() - ), - service.calculate(false, null) - ); + private HealthIndicatorResult yellow(long failureStreak, String mostRecentFailure) { + var details = new SimpleHealthIndicatorDetails(Map.of("failure_streak", failureStreak, "most_recent_failure", mostRecentFailure)); + return new HealthIndicatorResult(NAME, YELLOW, FAILURE_SYMPTOM, details, STALE_SETTINGS_IMPACT, List.of()); } } diff --git a/server/src/test/java/org/elasticsearch/reservedstate/service/FileSettingsHealthTrackerTests.java b/server/src/test/java/org/elasticsearch/reservedstate/service/FileSettingsHealthTrackerTests.java new file mode 100644 index 0000000000000..72b9f58b0c4a0 --- /dev/null +++ b/server/src/test/java/org/elasticsearch/reservedstate/service/FileSettingsHealthTrackerTests.java @@ -0,0 +1,77 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". + */ + +package org.elasticsearch.reservedstate.service; + +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthInfo; +import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthTracker; +import org.elasticsearch.test.ESTestCase; +import org.junit.Before; + +import static org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthTracker.DESCRIPTION_LENGTH_LIMIT_KEY; +import static org.elasticsearch.reservedstate.service.FileSettingsServiceTests.NOOP_PUBLISHER; + +/** + * Here, we test {@link FileSettingsHealthTracker} in isolation; + * we do not test that {@link FileSettingsService} uses it correctly. + */ +public class FileSettingsHealthTrackerTests extends ESTestCase { + + FileSettingsHealthTracker healthTracker; + + @Before + public void initialize() { + healthTracker = new FileSettingsHealthTracker(Settings.EMPTY, NOOP_PUBLISHER); + } + + public void testStartAndStop() { + assertFalse(healthTracker.getCurrentInfo().isActive()); + healthTracker.startOccurred(); + assertEquals(new FileSettingsHealthInfo(true, 0, 0, null), healthTracker.getCurrentInfo()); + healthTracker.stopOccurred(); + assertFalse(healthTracker.getCurrentInfo().isActive()); + } + + public void testGoodBadGood() { + healthTracker.startOccurred(); + healthTracker.changeOccurred(); + // This is an unusual state: we've reported a change, but not yet reported whether it passed or failed + assertEquals(new FileSettingsHealthInfo(true, 1, 0, null), healthTracker.getCurrentInfo()); + + healthTracker.failureOccurred("whoopsie 1"); + assertEquals(new FileSettingsHealthInfo(true, 1, 1, "whoopsie 1"), healthTracker.getCurrentInfo()); + + healthTracker.changeOccurred(); + healthTracker.failureOccurred("whoopsie 2"); + assertEquals(new FileSettingsHealthInfo(true, 2, 2, "whoopsie 2"), healthTracker.getCurrentInfo()); + + healthTracker.changeOccurred(); + healthTracker.successOccurred(); + assertEquals(new FileSettingsHealthInfo(true, 3, 0, null), healthTracker.getCurrentInfo()); + } + + public void testDescriptionIsTruncated() { + checkTruncatedDescription(9, "123456789", "123456789"); + checkTruncatedDescription(8, "123456789", "1234567…"); + checkTruncatedDescription(1, "12", "…"); + } + + private void checkTruncatedDescription(int lengthLimit, String description, String expectedTruncatedDescription) { + var tracker = new FileSettingsHealthTracker( + Settings.builder().put(DESCRIPTION_LENGTH_LIMIT_KEY, lengthLimit).build(), + NOOP_PUBLISHER + ); + tracker.startOccurred(); + tracker.changeOccurred(); + tracker.failureOccurred(description); + assertEquals(new FileSettingsHealthInfo(true, 1, 1, expectedTruncatedDescription), tracker.getCurrentInfo()); + } + +} diff --git a/server/src/test/java/org/elasticsearch/reservedstate/service/FileSettingsServiceTests.java b/server/src/test/java/org/elasticsearch/reservedstate/service/FileSettingsServiceTests.java index 68436ddea0276..868d70ea6928a 100644 --- a/server/src/test/java/org/elasticsearch/reservedstate/service/FileSettingsServiceTests.java +++ b/server/src/test/java/org/elasticsearch/reservedstate/service/FileSettingsServiceTests.java @@ -33,8 +33,9 @@ import org.elasticsearch.core.TimeValue; import org.elasticsearch.env.BuildVersion; import org.elasticsearch.env.Environment; +import org.elasticsearch.health.HealthIndicatorResult; import org.elasticsearch.reservedstate.action.ReservedClusterSettingsAction; -import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthIndicatorService; +import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthTracker; import org.elasticsearch.tasks.TaskManager; import org.elasticsearch.test.ClusterServiceUtils; import org.elasticsearch.test.ESTestCase; @@ -93,9 +94,14 @@ public class FileSettingsServiceTests extends ESTestCase { private ReservedClusterStateService controller; private ThreadPool threadpool; private FileSettingsService fileSettingsService; - private FileSettingsHealthIndicatorService healthIndicatorService; + private FileSettingsHealthTracker healthIndicatorTracker; private Path watchedFile; + /** + * We're not testing health info publication here. + */ + public static final FileSettingsHealthIndicatorPublisher NOOP_PUBLISHER = (f, a) -> {}; + @Before public void setUp() throws Exception { super.setUp(); @@ -141,8 +147,8 @@ public void setUp() throws Exception { List.of() ) ); - healthIndicatorService = spy(new FileSettingsHealthIndicatorService(Settings.EMPTY)); - fileSettingsService = spy(new FileSettingsService(clusterService, controller, env, healthIndicatorService)); + healthIndicatorTracker = spy(new FileSettingsHealthTracker(Settings.EMPTY, NOOP_PUBLISHER)); + fileSettingsService = spy(new FileSettingsService(clusterService, controller, env, healthIndicatorTracker)); watchedFile = fileSettingsService.watchedFile(); } @@ -174,8 +180,8 @@ public void testStartStop() { assertTrue(fileSettingsService.watching()); fileSettingsService.stop(); assertFalse(fileSettingsService.watching()); - verify(healthIndicatorService, times(1)).startOccurred(); - verify(healthIndicatorService, times(1)).stopOccurred(); + verify(healthIndicatorTracker, times(1)).startOccurred(); + verify(healthIndicatorTracker, times(1)).stopOccurred(); } public void testOperatorDirName() { @@ -228,9 +234,9 @@ public void testInitialFileError() throws Exception { verify(fileSettingsService, times(1)).processFile(eq(watchedFile), eq(true)); verify(controller, times(1)).process(any(), any(XContentParser.class), eq(ReservedStateVersionCheck.HIGHER_OR_SAME_VERSION), any()); - assertEquals(YELLOW, healthIndicatorService.calculate(false, null).status()); - verify(healthIndicatorService, times(1)).changeOccurred(); - verify(healthIndicatorService, times(1)).failureOccurred(argThat(s -> s.startsWith(IllegalStateException.class.getName()))); + assertEquals(YELLOW, currentHealthIndicatorResult().status()); + verify(healthIndicatorTracker, times(1)).changeOccurred(); + verify(healthIndicatorTracker, times(1)).failureOccurred(argThat(s -> s.startsWith(IllegalStateException.class.getName()))); } @SuppressWarnings("unchecked") @@ -263,9 +269,9 @@ public void testInitialFileWorks() throws Exception { verify(fileSettingsService, times(1)).processFile(eq(watchedFile), eq(true)); verify(controller, times(1)).process(any(), any(XContentParser.class), eq(ReservedStateVersionCheck.HIGHER_OR_SAME_VERSION), any()); - assertEquals(GREEN, healthIndicatorService.calculate(false, null).status()); - verify(healthIndicatorService, times(1)).changeOccurred(); - verify(healthIndicatorService, times(1)).successOccurred(); + assertEquals(GREEN, currentHealthIndicatorResult().status()); + verify(healthIndicatorTracker, times(1)).changeOccurred(); + verify(healthIndicatorTracker, times(1)).successOccurred(); } @SuppressWarnings("unchecked") @@ -318,9 +324,9 @@ public void testProcessFileChanges() throws Exception { verify(fileSettingsService, times(1)).processFile(eq(watchedFile), eq(false)); verify(controller, times(1)).process(any(), any(XContentParser.class), eq(ReservedStateVersionCheck.HIGHER_VERSION_ONLY), any()); - assertEquals(GREEN, healthIndicatorService.calculate(false, null).status()); - verify(healthIndicatorService, times(2)).changeOccurred(); - verify(healthIndicatorService, times(2)).successOccurred(); + assertEquals(GREEN, currentHealthIndicatorResult().status()); + verify(healthIndicatorTracker, times(2)).changeOccurred(); + verify(healthIndicatorTracker, times(2)).successOccurred(); } public void testInvalidJSON() throws Exception { @@ -362,8 +368,8 @@ public void testInvalidJSON() throws Exception { argThat(e -> unwrapException(e) instanceof XContentParseException) ); - assertEquals(YELLOW, healthIndicatorService.calculate(false, null).status()); - verify(healthIndicatorService, Mockito.atLeast(1)).failureOccurred(contains(XContentParseException.class.getName())); + assertEquals(YELLOW, currentHealthIndicatorResult().status()); + verify(healthIndicatorTracker, Mockito.atLeast(1)).failureOccurred(contains(XContentParseException.class.getName())); } /** @@ -429,8 +435,8 @@ public void testStopWorksInMiddleOfProcessing() throws Exception { fileSettingsService.close(); // When the service is stopped, the health indicator should be green - assertEquals(GREEN, healthIndicatorService.calculate(false, null).status()); - verify(healthIndicatorService).stopOccurred(); + assertEquals(GREEN, currentHealthIndicatorResult().status()); + verify(healthIndicatorTracker).stopOccurred(); // let the deadlocked thread end, so we can cleanly exit the test deadThreadLatch.countDown(); @@ -505,4 +511,8 @@ private static void longAwait(CountDownLatch latch) { } } + private HealthIndicatorResult currentHealthIndicatorResult() { + return new FileSettingsService.FileSettingsHealthIndicatorService().calculate(healthIndicatorTracker.getCurrentInfo()); + } + } diff --git a/server/src/test/java/org/elasticsearch/snapshots/RepositoryIntegrityHealthIndicatorServiceTests.java b/server/src/test/java/org/elasticsearch/snapshots/RepositoryIntegrityHealthIndicatorServiceTests.java index cadd9d5196f69..05d316ee8420e 100644 --- a/server/src/test/java/org/elasticsearch/snapshots/RepositoryIntegrityHealthIndicatorServiceTests.java +++ b/server/src/test/java/org/elasticsearch/snapshots/RepositoryIntegrityHealthIndicatorServiceTests.java @@ -27,6 +27,7 @@ import org.elasticsearch.health.SimpleHealthIndicatorDetails; import org.elasticsearch.health.node.HealthInfo; import org.elasticsearch.health.node.RepositoriesHealthInfo; +import org.elasticsearch.reservedstate.service.FileSettingsService.FileSettingsHealthInfo; import org.elasticsearch.test.ESTestCase; import org.junit.Before; import org.mockito.Mockito; @@ -77,7 +78,8 @@ public void setUp() throws Exception { node2.getId(), new RepositoriesHealthInfo(List.of(), List.of()) ) - ) + ), + FileSettingsHealthInfo.INDETERMINATE ); featureService = Mockito.mock(FeatureService.class); @@ -255,7 +257,7 @@ public void testIsUnknownWhenNoHealthInfoIsAvailable() { var service = createRepositoryIntegrityHealthIndicatorService(clusterState); assertThat( - service.calculate(true, new HealthInfo(Map.of(), null, Map.of())), + service.calculate(true, new HealthInfo(Map.of(), null, Map.of(), FileSettingsHealthInfo.INDETERMINATE)), equalTo( new HealthIndicatorResult( NAME, @@ -290,7 +292,12 @@ public void testLimitNumberOfAffectedResources() { invalidRepos.add("invalid-repo-" + i); repoHealthInfo.put("node-" + i, new RepositoriesHealthInfo(List.of("unknown-repo-" + i), List.of("invalid-repo-" + i))); }); - healthInfo = new HealthInfo(healthInfo.diskInfoByNode(), healthInfo.dslHealthInfo(), repoHealthInfo); + healthInfo = new HealthInfo( + healthInfo.diskInfoByNode(), + healthInfo.dslHealthInfo(), + repoHealthInfo, + FileSettingsHealthInfo.INDETERMINATE + ); assertThat( service.calculate(true, 10, healthInfo).diagnosisList(),