Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.iotdb.confignode.it.partition;

import org.apache.iotdb.it.env.EnvFactory;
import org.apache.iotdb.it.framework.IoTDBTestRunner;
import org.apache.iotdb.itbase.category.ClusterIT;
import org.apache.iotdb.itbase.category.LocalStandaloneIT;

import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.sql.Connection;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import static org.apache.iotdb.consensus.ConsensusFactory.RATIS_CONSENSUS;

/**
 * Integration test for submitting the DataPartitionTableIntegrityCheckProcedure through the
 * {@code REPAIR DATA PARTITION TABLE} statement.
 *
 * <p>Every test runs against a fresh environment created by {@code initClusterEnvironment(1, 1)}
 * with RATIS consensus for all region types and a data replication factor of 1.
 */
@RunWith(IoTDBTestRunner.class)
@Category({LocalStandaloneIT.class, ClusterIT.class})
public class DataPartitionTableIntegrityCheckProcedureIT {
  private static final Logger LOGGER =
      LoggerFactory.getLogger(DataPartitionTableIntegrityCheckProcedureIT.class);

  /** Configures the consensus protocols and starts the test environment. */
  @Before
  public void setUp() {
    EnvFactory.getEnv()
        .getConfig()
        .getCommonConfig()
        .setConfigNodeConsensusProtocolClass(RATIS_CONSENSUS)
        .setSchemaRegionConsensusProtocolClass(RATIS_CONSENSUS)
        .setDataRegionConsensusProtocolClass(RATIS_CONSENSUS)
        .setDataReplicationFactor(1);
    EnvFactory.getEnv().initClusterEnvironment(1, 1);
  }

  /** Tears the test environment down after each test. */
  @After
  public void tearDown() throws Exception {
    EnvFactory.getEnv().cleanClusterEnvironment();
  }

  /**
   * Fires {@code REPAIR DATA PARTITION TABLE} from several threads at the same moment and expects
   * exactly one statement to succeed while all others fail.
   *
   * @throws InterruptedException if the test thread is interrupted while waiting for the workers
   */
  @Test
  public void testConcurrentSubmitDataPartitionTableIntegrityCheckProcedure()
      throws InterruptedException {
    final int threadCount = 10;
    final CountDownLatch startLatch = new CountDownLatch(1);
    final CountDownLatch finishLatch = new CountDownLatch(threadCount);
    final ExecutorService executor = Executors.newFixedThreadPool(threadCount);

    final AtomicInteger successCount = new AtomicInteger(0);
    final AtomicInteger failCount = new AtomicInteger(0);
    final List<String> failureMessages = Collections.synchronizedList(new ArrayList<>());

    try {
      // Concurrently submit the DataPartitionTableIntegrityCheckProcedure
      for (int i = 0; i < threadCount; i++) {
        final int threadId = i;
        executor.submit(
            () -> {
              try {
                // All workers block here so that the statements are issued at the same time.
                startLatch.await();

                try (final Connection connection = EnvFactory.getEnv().getConnection();
                    final Statement stmt = connection.createStatement()) {
                  stmt.execute("REPAIR DATA PARTITION TABLE");
                  successCount.incrementAndGet();
                  LOGGER.info("Thread {} submitted integrity check successfully", threadId);
                }
              } catch (final SQLException e) {
                failCount.incrementAndGet();
                failureMessages.add("Thread " + threadId + " failed: " + e.getMessage());
                LOGGER.info(
                    "Thread {} failed to submit integrity check: {}", threadId, e.getMessage());
              } catch (final InterruptedException e) {
                // Keep the interrupt visible to the pool instead of silently swallowing it.
                Thread.currentThread().interrupt();
                failCount.incrementAndGet();
                failureMessages.add(
                    "Thread " + threadId + " failed unexpectedly: " + e.getMessage());
                LOGGER.error("Thread {} unexpected error: {}", threadId, e.getMessage(), e);
              } catch (final Exception e) {
                failCount.incrementAndGet();
                failureMessages.add(
                    "Thread " + threadId + " failed unexpectedly: " + e.getMessage());
                LOGGER.error("Thread {} unexpected error: {}", threadId, e.getMessage(), e);
              } finally {
                finishLatch.countDown();
              }
            });
      }

      startLatch.countDown();

      final boolean completed = finishLatch.await(60, TimeUnit.SECONDS);
      Assert.assertTrue("Not all threads completed within timeout", completed);

      executor.shutdown();
      Assert.assertTrue(
          "Executor did not terminate", executor.awaitTermination(10, TimeUnit.SECONDS));
    } finally {
      // Guarantees the worker threads are released even when an assertion above fails;
      // this is a no-op once the pool has already terminated.
      executor.shutdownNow();
    }

    LOGGER.info("Success count: {}, Fail count: {}", successCount.get(), failCount.get());
    LOGGER.info("Failure messages: {}", failureMessages);

    Assert.assertEquals(
        "Only one procedure should be submitted successfully", 1, successCount.get());
    Assert.assertEquals(
        "The other concurrent submissions should be rejected", threadCount - 1, failCount.get());
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ utilityStatement
| showQueries | showDiskUsage | showCurrentTimestamp | killQuery | grantWatermarkEmbedding
| revokeWatermarkEmbedding | loadConfiguration | loadTimeseries | loadFile
| removeFile | unloadFile | setSqlDialect | showCurrentSqlDialect | showCurrentUser
| repairDataPartitionTable
;

/**
Expand Down Expand Up @@ -1238,6 +1239,11 @@ stopRepairData
: STOP REPAIR DATA (ON (LOCAL | CLUSTER))?
;

// Repair Data Partition Table
// Syntax: REPAIR DATA PARTITION TABLE (takes no arguments or options).
// Listed as an alternative of utilityStatement.
repairDataPartitionTable
: REPAIR DATA PARTITION TABLE
;

// Explain
explain
: EXPLAIN (ANALYZE VERBOSE?)? selectStatement?
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -319,8 +319,6 @@ public class ConfigNodeConfig {

private long forceWalPeriodForConfigNodeSimpleInMs = 100;

private long partitionTableRecoverWaitAllDnUpTimeoutInMs = 60000;

/** Creates a configuration object; all fields keep the defaults from their declarations. */
public ConfigNodeConfig() {
// empty constructor
}
Expand Down Expand Up @@ -1288,13 +1286,4 @@ public long getFailureDetectorPhiAcceptablePauseInMs() {
/**
 * Sets the value of {@code failureDetectorPhiAcceptablePauseInMs}.
 *
 * @param failureDetectorPhiAcceptablePauseInMs the new value, stored without validation
 */
public void setFailureDetectorPhiAcceptablePauseInMs(long failureDetectorPhiAcceptablePauseInMs) {
this.failureDetectorPhiAcceptablePauseInMs = failureDetectorPhiAcceptablePauseInMs;
}

/**
 * Returns the current value of {@code partitionTableRecoverWaitAllDnUpTimeoutInMs}.
 *
 * @return the stored value (the field is initialized to 60000)
 */
public long getPartitionTableRecoverWaitAllDnUpTimeoutInMs() {
return partitionTableRecoverWaitAllDnUpTimeoutInMs;
}

/**
 * Sets the value of {@code partitionTableRecoverWaitAllDnUpTimeoutInMs}.
 *
 * @param partitionTableRecoverWaitAllDnUpTimeoutInMs the new value; no range check is done here
 */
public void setPartitionTableRecoverWaitAllDnUpTimeoutInMs(
long partitionTableRecoverWaitAllDnUpTimeoutInMs) {
this.partitionTableRecoverWaitAllDnUpTimeoutInMs = partitionTableRecoverWaitAllDnUpTimeoutInMs;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -322,23 +322,6 @@ private void loadProperties(TrimProperties properties) throws BadNodeUrlExceptio
"failure_detector_phi_acceptable_pause_in_ms",
String.valueOf(conf.getFailureDetectorPhiAcceptablePauseInMs()))));

long partitionTableRecoverWaitAllDnUpTimeoutInMs =
Long.parseLong(
properties.getProperty(
"partition_table_recover_wait_all_dn_up_timeout_ms",
String.valueOf(conf.getPartitionTableRecoverWaitAllDnUpTimeoutInMs())));
if (partitionTableRecoverWaitAllDnUpTimeoutInMs <= 0) {
LOGGER.warn(
"partition_table_recover_wait_all_dn_up_timeout_ms should be greater than 0, "
+ "but current value is {}, ignore that and use the default value {}",
partitionTableRecoverWaitAllDnUpTimeoutInMs,
conf.getPartitionTableRecoverWaitAllDnUpTimeoutInMs());
partitionTableRecoverWaitAllDnUpTimeoutInMs =
conf.getPartitionTableRecoverWaitAllDnUpTimeoutInMs();
}
conf.setPartitionTableRecoverWaitAllDnUpTimeoutInMs(
partitionTableRecoverWaitAllDnUpTimeoutInMs);
Comment thread
CRZbulabula marked this conversation as resolved.

String leaderDistributionPolicy =
properties.getProperty("leader_distribution_policy", conf.getLeaderDistributionPolicy());
if (AbstractLeaderBalancer.GREEDY_POLICY.equals(leaderDistributionPolicy)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1155,6 +1155,16 @@ public TDataPartitionTableResp getOrCreateDataPartition(
return resp;
}

/**
 * Forwards a data partition table integrity check request to the partition manager.
 *
 * @return the status produced by {@code confirmLeader()} when it is not {@code SUCCESS_STATUS};
 *     otherwise the status returned by the partition manager
 */
@Override
public TSStatus dataPartitionTableIntegrityCheck() {
  final TSStatus leaderStatus = confirmLeader();
  final boolean leaderConfirmed =
      leaderStatus.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode();
  // Hand back the leader-confirmation status untouched when the confirmation did not succeed.
  return leaderConfirmed ? partitionManager.dataPartitionTableIntegrityCheck() : leaderStatus;
}

private void printNewCreatedDataPartition(
GetOrCreateDataPartitionPlan getOrCreateDataPartitionPlan, TDataPartitionTableResp resp) {
final String lineSeparator = System.lineSeparator();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,8 @@ TSchemaNodeManagementResp getNodePathsPartition(
TDataPartitionTableResp getOrCreateDataPartition(
GetOrCreateDataPartitionPlan getOrCreateDataPartitionPlan);

/**
 * Requests an integrity check of the data partition table.
 *
 * @return the status describing the outcome of the request
 */
TSStatus dataPartitionTableIntegrityCheck();

/**
* Get AuditLogger.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@
import org.apache.iotdb.confignode.procedure.impl.node.RemoveAINodeProcedure;
import org.apache.iotdb.confignode.procedure.impl.node.RemoveConfigNodeProcedure;
import org.apache.iotdb.confignode.procedure.impl.node.RemoveDataNodesProcedure;
import org.apache.iotdb.confignode.procedure.impl.partition.DataPartitionTableIntegrityCheckProcedure;
import org.apache.iotdb.confignode.procedure.impl.pipe.plugin.CreatePipePluginProcedure;
import org.apache.iotdb.confignode.procedure.impl.pipe.plugin.DropPipePluginProcedure;
import org.apache.iotdb.confignode.procedure.impl.pipe.runtime.PipeHandleLeaderChangeProcedure;
Expand Down Expand Up @@ -1376,16 +1375,6 @@ public TSStatus createRegionGroups(
}
}

/**
 * Used to repair the lost data partition table.
 *
 * <p>Submits a new {@link DataPartitionTableIntegrityCheckProcedure} while holding this object's
 * monitor and then waits for it via {@code waitingProcedureFinished}.
 *
 * @return the status returned by {@code waitingProcedureFinished}
 */
public TSStatus dataPartitionTableIntegrityCheck() {
  final DataPartitionTableIntegrityCheckProcedure integrityCheck =
      new DataPartitionTableIntegrityCheckProcedure();
  synchronized (this) {
    executor.submitProcedure(integrityCheck);
  }
  // NOTE(review): 86400000 is presumably a timeout in milliseconds (24 hours) - confirm
  return waitingProcedureFinished(integrityCheck, 86400000);
}

/**
* Generate {@link CreateTriggerProcedure} and wait until it finished.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
import org.apache.iotdb.confignode.persistence.partition.maintainer.RegionDeleteTask;
import org.apache.iotdb.confignode.persistence.partition.maintainer.RegionMaintainTask;
import org.apache.iotdb.confignode.persistence.partition.maintainer.RegionMaintainType;
import org.apache.iotdb.confignode.procedure.impl.partition.DataPartitionTableIntegrityCheckProcedure;
import org.apache.iotdb.confignode.rpc.thrift.TCountTimeSlotListReq;
import org.apache.iotdb.confignode.rpc.thrift.TGetRegionIdReq;
import org.apache.iotdb.confignode.rpc.thrift.TGetSeriesSlotListReq;
Expand Down Expand Up @@ -152,6 +153,9 @@ public class PartitionManager {
private final ScheduledExecutorService regionMaintainer;
private Future<?> currentRegionMaintainerFuture;

private final AtomicBoolean dataPartitionTableIntegrityCheckProcedureRunning =
new AtomicBoolean(false);

public PartitionManager(IManager configManager, PartitionInfo partitionInfo) {
this.configManager = configManager;
this.partitionInfo = partitionInfo;
Expand Down Expand Up @@ -511,6 +515,29 @@ public DataPartitionResp getOrCreateDataPartition(final GetOrCreateDataPartition
return resp;
}

/** Used to repair the lost data partition table */
public TSStatus dataPartitionTableIntegrityCheck() {
if (configManager
.getProcedureManager()
.isExistUnfinishedProcedure(DataPartitionTableIntegrityCheckProcedure.class)
|| !dataPartitionTableIntegrityCheckProcedureRunning.compareAndSet(false, true)) {
Comment thread
CRZbulabula marked this conversation as resolved.
return RpcUtils.getStatus(
TSStatusCode.OVERLAP_WITH_EXISTING_TASK,
"DataPartitionTableIntegrityCheckProcedure is already submitted.");
}

synchronized (this) {
DataPartitionTableIntegrityCheckProcedure procedure =
new DataPartitionTableIntegrityCheckProcedure();
getProcedureManager().getExecutor().submitProcedure(procedure);
}
return new TSStatus(TSStatusCode.SUCCESS_STATUS.getStatusCode());
}

/**
 * Clears the flag that {@code dataPartitionTableIntegrityCheck()} sets before submitting a
 * procedure, so that the flag no longer causes new requests to be rejected.
 */
public void markDataPartitionTableIntegrityCheckProcedureFinished() {
dataPartitionTableIntegrityCheckProcedureRunning.set(false);
}

private TSStatus consensusWritePartitionResult(ConfigPhysicalPlan plan) {
TSStatus status = getConsensusManager().confirmLeader();
if (status.getCode() != TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,15 @@ public DataPartitionTableIntegrityCheckProcedure() {
super();
}

/**
 * Runs the superclass handling and then notifies the partition manager that this procedure is
 * finished. The notification is sent regardless of the value of {@code success}.
 */
@Override
protected void updateMetricsOnFinish(
final ConfigNodeProcedureEnv env, final long runtime, final boolean success) {
super.updateMetricsOnFinish(env, runtime, success);
// Resets the partition manager's "running" marker for this procedure type.
env.getConfigManager()
.getPartitionManager()
.markDataPartitionTableIntegrityCheckProcedureFinished();
}

@Override
protected Flow executeFromState(
final ConfigNodeProcedureEnv env, final DataPartitionTableIntegrityCheckProcedureState state)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -624,6 +624,11 @@ public TDataPartitionTableResp getOrCreateDataPartitionTable(TDataPartitionReq r
return configManager.getOrCreateDataPartition(getOrCreateDataPartitionReq);
}

/** Delegates the integrity check request to {@code configManager} and returns its status. */
@Override
public TSStatus dataPartitionTableIntegrityCheck() {
return configManager.dataPartitionTableIntegrityCheck();
}

@Override
public TSStatus operatePermission(final TAuthorizerReq req) {
ConfigPhysicalPlanType configPhysicalPlanType =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -700,6 +700,12 @@ public TDataPartitionTableResp getOrCreateDataPartitionTable(TDataPartitionReq r
resp -> !updateConfigNodeLeader(resp.status));
}

/**
 * Sends the integrity check request through {@code executeRemoteCallWithRetry}.
 *
 * @return the status returned by the remote call
 * @throws TException if the call fails
 */
@Override
public TSStatus dataPartitionTableIntegrityCheck() throws TException {
// The lambda reads the `client` field each time it is invoked.
// NOTE(review): presumably this matters if `client` is replaced between retries - confirm.
return executeRemoteCallWithRetry(
() -> client.dataPartitionTableIntegrityCheck(), status -> !updateConfigNodeLeader(status));
}

@Override
public TSStatus operatePermission(TAuthorizerReq req) throws TException {
return executeRemoteCallWithRetry(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@
import org.apache.iotdb.db.queryengine.plan.execution.config.sys.KillQueryTask;
import org.apache.iotdb.db.queryengine.plan.execution.config.sys.LoadConfigurationTask;
import org.apache.iotdb.db.queryengine.plan.execution.config.sys.MergeTask;
import org.apache.iotdb.db.queryengine.plan.execution.config.sys.RepairDataPartitionTableTask;
import org.apache.iotdb.db.queryengine.plan.execution.config.sys.SetConfigurationTask;
import org.apache.iotdb.db.queryengine.plan.execution.config.sys.SetSystemStatusTask;
import org.apache.iotdb.db.queryengine.plan.execution.config.sys.ShowConfigurationTask;
Expand Down Expand Up @@ -213,6 +214,7 @@
import org.apache.iotdb.db.queryengine.plan.statement.sys.KillQueryStatement;
import org.apache.iotdb.db.queryengine.plan.statement.sys.LoadConfigurationStatement;
import org.apache.iotdb.db.queryengine.plan.statement.sys.MergeStatement;
import org.apache.iotdb.db.queryengine.plan.statement.sys.RepairDataPartitionTable;
import org.apache.iotdb.db.queryengine.plan.statement.sys.SetConfigurationStatement;
import org.apache.iotdb.db.queryengine.plan.statement.sys.SetSqlDialectStatement;
import org.apache.iotdb.db.queryengine.plan.statement.sys.SetSystemStatusStatement;
Expand Down Expand Up @@ -384,6 +386,12 @@ public IConfigTask visitStartRepairData(
return new StartRepairDataTask(startRepairDataStatement);
}

/**
 * Creates the config task for a {@link RepairDataPartitionTable} statement. Neither the
 * statement nor the query context is consulted when building the task.
 */
@Override
public IConfigTask visitRepairDataPartitionTable(
RepairDataPartitionTable repairDataPartitionTable, MPPQueryContext context) {
return new RepairDataPartitionTableTask();
}

@Override
public IConfigTask visitStopRepairData(
StopRepairDataStatement stopRepairDataStatement, MPPQueryContext context) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1457,6 +1457,27 @@ public SettableFuture<ConfigTaskResult> stopRepairData(boolean onCluster) {
return future;
}

/**
 * Asks the ConfigNode to trigger the DataPartitionTableIntegrityCheckProcedure.
 *
 * @return a future that is already completed when this method returns: with a {@code
 *     SUCCESS_STATUS} result when the ConfigNode answered with a success status, with the
 *     client/transport exception when the request could not be sent, or with an {@link
 *     IoTDBException} wrapping the returned status otherwise
 */
@Override
public SettableFuture<ConfigTaskResult> repairDataPartitionTable() {
  final SettableFuture<ConfigTaskResult> future = SettableFuture.create();
  final TSStatus tsStatus;

  try (ConfigNodeClient client =
      CONFIG_NODE_CLIENT_MANAGER.borrowClient(ConfigNodeInfo.CONFIG_REGION_ID)) {
    // Send request to ConfigNode to trigger DataPartitionTableIntegrityCheckProcedure
    tsStatus = client.dataPartitionTableIntegrityCheck();
  } catch (ClientManagerException | TException e) {
    // The future is complete at this point; stop here instead of falling through and trying
    // to complete it a second time with an exception built from an empty status.
    future.setException(e);
    return future;
  }

  if (tsStatus.getCode() == TSStatusCode.SUCCESS_STATUS.getStatusCode()) {
    future.set(new ConfigTaskResult(TSStatusCode.SUCCESS_STATUS));
  } else {
    future.setException(new IoTDBException(tsStatus));
  }
  return future;
}

@Override
public SettableFuture<ConfigTaskResult> loadConfiguration(boolean onCluster) {
SettableFuture<ConfigTaskResult> future = SettableFuture.create();
Expand Down
Loading