腾讯真题:Redis集群扩容方案
面试重要程度:⭐⭐⭐⭐⭐
真题来源:腾讯2024社招技术面试
考察重点:Redis集群架构、数据迁移、高可用设计
预计阅读时间:45分钟
真题背景
面试官: "我们的Redis集群目前有6个节点(3主3从),存储了约500GB数据,QPS达到10万。由于业务快速增长,需要扩容到12个节点(6主6从)。请详细设计扩容方案,包括数据迁移策略、服务可用性保证、回滚预案等。另外,如果在扩容过程中发现某个节点出现故障,应该如何处理?"
考察意图:
- Redis集群架构的深度理解
- 大规模数据迁移的工程实践能力
- 高可用系统设计思维
- 故障处理和应急响应能力
- 生产环境运维经验
🎯 现状分析与扩容规划
当前集群状态分析
集群拓扑:
# 当前集群状态
redis-cli --cluster info 127.0.0.1:7000

# 节点分布
Master1 (7000): slots 0-5460     (5461 slots)
Master2 (7001): slots 5461-10922 (5462 slots)
Master3 (7002): slots 10923-16383 (5461 slots)
Slave1  (7003): replicates Master1
Slave2  (7004): replicates Master2
Slave3  (7005): replicates Master3
性能指标分析:
/**
 * Samples performance metrics from every master node in the Redis cluster.
 *
 * NOTE(review): "total_commands_processed" from INFO is a cumulative counter,
 * not an instantaneous QPS value — confirm how NodeMetrics derives QPS from it.
 */
@Component
public class ClusterMonitor {

    @Autowired
    private RedisClusterConnection clusterConnection;

    /**
     * Builds a cluster-wide metrics snapshot by sampling each master node.
     * Replicas are skipped: only masters own slots and serve writes.
     *
     * @return aggregated metrics across all master nodes
     */
    public ClusterMetrics getClusterMetrics() {
        ClusterMetrics metrics = new ClusterMetrics();
        List<RedisClusterNode> masters = clusterConnection.clusterGetNodes()
                .stream()
                .filter(RedisClusterNode::isMaster)
                .collect(Collectors.toList());
        masters.forEach(master -> metrics.addNodeMetrics(getNodeMetrics(master)));
        return metrics;
    }

    /**
     * Samples one node: identity, INFO counters, and owned-slot count.
     */
    private NodeMetrics getNodeMetrics(RedisClusterNode node) {
        Properties info = clusterConnection.info(node);
        NodeMetrics metrics = new NodeMetrics();
        metrics.setNodeId(node.getId());
        metrics.setHost(node.getHost());
        metrics.setPort(node.getPort());
        // Memory usage — defaults to 0 when the INFO field is absent.
        metrics.setUsedMemory(Long.parseLong(info.getProperty("used_memory", "0")));
        metrics.setMaxMemory(Long.parseLong(info.getProperty("maxmemory", "0")));
        // Traffic counters.
        metrics.setCommandsProcessed(Long.parseLong(info.getProperty("total_commands_processed", "0")));
        metrics.setConnectedClients(Integer.parseInt(info.getProperty("connected_clients", "0")));
        // Slot ownership: sum the width of each contiguous slot range (inclusive bounds).
        Set<SlotRange> slotRanges = node.getSlotRange();
        int ownedSlots = 0;
        for (SlotRange range : slotRanges) {
            ownedSlots += range.getEnd() - range.getStart() + 1;
        }
        metrics.setSlotCount(ownedSlots);
        return metrics;
    }
}
/**
 * Aggregated, cluster-wide view of the per-node metrics samples.
 */
@Data
public class ClusterMetrics {
    private List<NodeMetrics> nodeMetrics = new ArrayList<>();
    private long totalMemoryUsed;   // sum of used_memory across all sampled nodes
    private long totalQPS;          // sum of per-node QPS samples
    private int totalSlots = 16384; // fixed size of the Redis Cluster slot space

    /**
     * Registers one node's sample and folds it into the cluster totals.
     */
    public void addNodeMetrics(NodeMetrics nodeMetrics) {
        this.nodeMetrics.add(nodeMetrics);
        totalMemoryUsed += nodeMetrics.getUsedMemory();
        totalQPS += nodeMetrics.getQps();
    }

    /**
     * Decides whether the cluster should be expanded.
     * Expansion is recommended when ANY single node crosses one of the
     * thresholds: memory usage ratio > 70%, QPS > 30k, or > 5000 clients.
     */
    public boolean needsExpansion() {
        for (NodeMetrics node : nodeMetrics) {
            boolean overMemory = node.getMemoryUsageRatio() > 0.7;
            boolean overQps = node.getQps() > 30000;
            boolean overConnections = node.getConnectedClients() > 5000;
            if (overMemory || overQps || overConnections) {
                return true;
            }
        }
        return false;
    }
}
扩容目标规划
扩容后集群架构:
/**
 * Produces the expansion plan: target topology, new node list,
 * the slot reallocation layout, and a rough migration-time estimate.
 */
@Component
public class ExpansionPlanner {

    /**
     * Builds the plan for moving from 3 to 6 master nodes (6 masters + 6 replicas).
     *
     * @param currentMetrics current cluster metrics, used to estimate migration time
     * @return fully populated plan (target counts, new nodes, slot layout, ETA)
     */
    public ExpansionPlan createExpansionPlan(ClusterMetrics currentMetrics) {
        ExpansionPlan plan = new ExpansionPlan();
        // Target topology: 6 masters + 6 replicas.
        plan.setTargetMasterCount(6);
        plan.setTargetSlaveCount(6);
        // Nodes to provision: 3 new masters and 3 new replicas.
        List<NodeConfig> newNodes = Arrays.asList(
            new NodeConfig("192.168.1.10", 7006, NodeType.MASTER),
            new NodeConfig("192.168.1.11", 7007, NodeType.MASTER),
            new NodeConfig("192.168.1.12", 7008, NodeType.MASTER),
            new NodeConfig("192.168.1.13", 7009, NodeType.SLAVE),
            new NodeConfig("192.168.1.14", 7010, NodeType.SLAVE),
            new NodeConfig("192.168.1.15", 7011, NodeType.SLAVE)
        );
        plan.setNewNodes(newNodes);
        plan.setSlotReallocation(calculateSlotReallocation());
        plan.setEstimatedMigrationTime(estimateMigrationTime(currentMetrics));
        return plan;
    }

    /**
     * Splits the 16384 hash slots as evenly as possible across 6 masters.
     *
     * BUG FIX: the previous version computed each range's start as
     * {@code i * slotsPerMaster}, ignoring the extra slot handed to the first
     * {@code remainder} masters, so adjacent ranges overlapped by one slot
     * (master0 ended at 2730 while master1 also started at 2730). The start
     * offset now accounts for the extras already assigned, yielding disjoint
     * ranges that exactly cover slots 0..16383.
     */
    private Map<String, SlotRange> calculateSlotReallocation() {
        Map<String, SlotRange> allocation = new HashMap<>();
        int masterCount = 6;
        int slotsPerMaster = 16384 / masterCount; // 2730
        int remainder = 16384 % masterCount;      // 4 — first 4 masters get one extra slot
        for (int i = 0; i < masterCount; i++) {
            // Shift the start by the number of extra slots already handed out.
            int startSlot = i * slotsPerMaster + Math.min(i, remainder);
            int endSlot = startSlot + slotsPerMaster - 1;
            if (i < remainder) {
                endSlot++; // this master takes one of the leftover slots
            }
            allocation.put("master" + i, new SlotRange(startSlot, endSlot));
        }
        return allocation;
    }

    /**
     * Rough ETA: data volume divided by the bandwidth reserved for migration
     * (25% of a ~1 GB/s link; the remaining 75% is left for live traffic).
     */
    private Duration estimateMigrationTime(ClusterMetrics metrics) {
        long totalDataSize = metrics.getTotalMemoryUsed();
        long networkBandwidth = 1000L * 1024 * 1024; // ~1 GB/s
        long migrationBandwidth = networkBandwidth / 4; // reserve 75% for business traffic
        long estimatedSeconds = totalDataSize / migrationBandwidth;
        return Duration.ofSeconds(estimatedSeconds);
    }
}
🚀 扩容实施方案
阶段一:环境准备
新节点部署:
#!/bin/bash
# Deploy and verify the six new Redis cluster nodes (ports 7006-7011).
# FIX: fail fast on errors/unset vars, quote all expansions, and define the
# port list once instead of duplicating it in every function.
set -euo pipefail

NEW_PORTS="7006 7007 7008 7009 7010 7011"

# 1. Write a cluster-enabled Redis config file for one node.
create_node_config() {
    local port=$1
    local node_dir="/opt/redis/node-${port}"
    mkdir -p "${node_dir}"
    cat > "${node_dir}/redis.conf" << EOF
port ${port}
cluster-enabled yes
cluster-config-file nodes-${port}.conf
cluster-node-timeout 5000
appendonly yes
appendfilename "appendonly-${port}.aof"
dir ${node_dir}
logfile ${node_dir}/redis-${port}.log
pidfile /var/run/redis_${port}.pid
# Memory settings
maxmemory 8gb
maxmemory-policy allkeys-lru
# Network settings
tcp-keepalive 300
timeout 0
# Persistence (RDB snapshot triggers)
save 900 1
save 300 10
save 60 10000
EOF
}

# 2. Generate configs and start each new node in the background.
start_new_nodes() {
    for port in ${NEW_PORTS}; do
        echo "Starting Redis node on port ${port}..."
        create_node_config "${port}"
        redis-server "/opt/redis/node-${port}/redis.conf" &
        sleep 2
    done
}

# 3. Ping every new node; abort the deployment if any does not answer PONG.
verify_nodes() {
    for port in ${NEW_PORTS}; do
        if redis-cli -p "${port}" ping | grep -q PONG; then
            echo "Node ${port}: OK"
        else
            echo "Node ${port}: FAILED"
            exit 1
        fi
    done
}

start_new_nodes
verify_nodes
环境检查清单:
/**
 * Pre-expansion safety checklist: verifies cluster health, node resources,
 * network connectivity, backups and monitoring before any migration starts.
 *
 * NOTE(review): `log` is referenced but not declared here — presumably Lombok's
 * @Slf4j is applied at the original file level; confirm before compiling.
 */
@Component
public class PreExpansionChecker {

    // FIX: checkClusterHealth() referenced this field but it was never declared
    // in this class (sibling components declare it the same way).
    @Autowired
    private RedisClusterConnection clusterConnection;

    /**
     * Runs every pre-expansion check and collects the results.
     *
     * @return per-check pass/fail results, keyed by check name
     */
    public CheckResult performPreExpansionCheck() {
        CheckResult result = new CheckResult();
        // 1. Cluster health
        result.addCheck("cluster_health", checkClusterHealth());
        // 2. Node resources (CPU / memory / disk)
        result.addCheck("node_resources", checkNodeResources());
        // 3. Network connectivity between old and new nodes
        result.addCheck("network_connectivity", checkNetworkConnectivity());
        // 4. Backup availability
        result.addCheck("backup_verification", checkBackupStatus());
        // 5. Monitoring system readiness
        result.addCheck("monitoring_system", checkMonitoringSystem());
        return result;
    }

    /**
     * Healthy means: no node carries the FAIL flag and cluster_state is "ok".
     * Any exception while probing the cluster is treated as a failed check.
     */
    private boolean checkClusterHealth() {
        try {
            // Reject expansion if any node is already marked as failed.
            Iterable<RedisClusterNode> nodes = clusterConnection.clusterGetNodes();
            for (RedisClusterNode node : nodes) {
                if (node.getFlags().contains(RedisClusterNode.Flag.FAIL)) {
                    log.error("Node {} is in FAIL state", node.getId());
                    return false;
                }
            }
            // cluster_state is "ok" only when all 16384 slots are served.
            Properties clusterInfo = clusterConnection.clusterGetClusterInfo();
            String clusterState = clusterInfo.getProperty("cluster_state");
            return "ok".equals(clusterState);
        } catch (Exception e) {
            log.error("Cluster health check failed", e);
            return false;
        }
    }

    private boolean checkNodeResources() {
        // TODO: check CPU, memory and disk headroom on every host.
        return true; // simplified implementation
    }

    private boolean checkNetworkConnectivity() {
        // TODO: verify old and new nodes can reach each other (cluster bus port too).
        return true; // simplified implementation
    }

    private boolean checkBackupStatus() {
        // TODO: verify the most recent backup exists and is restorable.
        return true; // simplified implementation
    }

    private boolean checkMonitoringSystem() {
        // TODO: ensure metrics collection and alerting are live before migration.
        return true; // simplified implementation
    }
}
阶段二:节点加入集群
添加新主节点:
/**
 * Executes the node-join phase of cluster expansion: introduces new master
 * and replica nodes via CLUSTER MEET, waits for handshakes, and wires up
 * replication.
 *
 * NOTE(review): NodeConfig.getNodeId() is used before/right after the node
 * joins — cluster node ids are normally assigned by Redis at startup, so
 * confirm NodeConfig is populated with the runtime id, not a placeholder.
 */
@Component
public class ClusterExpansionExecutor {

    @Autowired
    private RedisClusterConnection clusterConnection;

    /**
     * Joins new master nodes to the cluster one by one.
     *
     * @param masterNodes masters to add (host/port pairs)
     * @throws ExpansionException if any node fails to join or verify
     */
    public void addMasterNodes(List<NodeConfig> masterNodes) {
        for (NodeConfig nodeConfig : masterNodes) {
            try {
                log.info("Adding master node: {}:{}", nodeConfig.getHost(), nodeConfig.getPort());
                // 1. Introduce the node to the cluster.
                clusterConnection.clusterMeet(nodeConfig.getHost(), nodeConfig.getPort());
                // 2. Wait until the gossip handshake completes.
                waitForNodeHandshake(nodeConfig);
                // 3. Confirm the node is visible and healthy.
                verifyNodeStatus(nodeConfig);
                log.info("Master node {}:{} added successfully",
                    nodeConfig.getHost(), nodeConfig.getPort());
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    Thread.currentThread().interrupt(); // FIX: preserve interrupt status
                }
                log.error("Failed to add master node {}:{}",
                    nodeConfig.getHost(), nodeConfig.getPort(), e);
                throw new ExpansionException("Failed to add master node", e);
            }
        }
    }

    /**
     * Joins new replica nodes and attaches each to its assigned master.
     *
     * @param slaveNodes         replicas to add
     * @param masterSlaveMapping replica node id -> master node id
     * @throws ExpansionException if any node fails to join, has no mapped
     *                            master, or fails replication verification
     */
    public void addSlaveNodes(List<NodeConfig> slaveNodes, Map<String, String> masterSlaveMapping) {
        for (NodeConfig slaveConfig : slaveNodes) {
            try {
                log.info("Adding slave node: {}:{}", slaveConfig.getHost(), slaveConfig.getPort());
                // 1. Introduce the replica to the cluster.
                clusterConnection.clusterMeet(slaveConfig.getHost(), slaveConfig.getPort());
                // 2. Wait until the gossip handshake completes.
                waitForNodeHandshake(slaveConfig);
                // 3. Attach the replica to its master.
                String masterId = masterSlaveMapping.get(slaveConfig.getNodeId());
                // FIX: fail fast with a clear message instead of passing null onward.
                if (masterId == null) {
                    throw new ExpansionException(
                        "No master mapped for slave node: " + slaveConfig.getNodeId());
                }
                clusterConnection.clusterReplicate(slaveConfig.getNodeId(), masterId);
                // 4. Confirm the replication link.
                verifyReplicationStatus(slaveConfig, masterId);
                log.info("Slave node {}:{} added successfully",
                    slaveConfig.getHost(), slaveConfig.getPort());
            } catch (Exception e) {
                if (e instanceof InterruptedException) {
                    Thread.currentThread().interrupt(); // FIX: preserve interrupt status
                }
                log.error("Failed to add slave node {}:{}",
                    slaveConfig.getHost(), slaveConfig.getPort(), e);
                throw new ExpansionException("Failed to add slave node", e);
            }
        }
    }

    /**
     * Polls (1s interval, 30 attempts) until the node sheds its HANDSHAKE flag.
     *
     * @throws InterruptedException if the polling sleep is interrupted
     * @throws ExpansionException   on timeout
     */
    private void waitForNodeHandshake(NodeConfig nodeConfig) throws InterruptedException {
        int maxRetries = 30;
        int retryCount = 0;
        while (retryCount < maxRetries) {
            try {
                RedisClusterNode node = findNodeById(nodeConfig.getNodeId());
                if (node != null && !node.getFlags().contains(RedisClusterNode.Flag.HANDSHAKE)) {
                    return; // handshake complete
                }
            } catch (Exception e) {
                // Topology may be transiently inconsistent during the handshake; retry.
            }
            Thread.sleep(1000);
            retryCount++;
        }
        throw new ExpansionException("Node handshake timeout: " + nodeConfig.getNodeId());
    }

    /**
     * FIX: was referenced but never defined — scans the current topology for a node id.
     *
     * @return the matching node, or null if not (yet) visible
     */
    private RedisClusterNode findNodeById(String nodeId) {
        for (RedisClusterNode node : clusterConnection.clusterGetNodes()) {
            if (nodeId != null && nodeId.equals(node.getId())) {
                return node;
            }
        }
        return null;
    }

    /**
     * FIX: was referenced but never defined — verifies the node joined and is not failing.
     *
     * @throws ExpansionException if the node is missing or flagged FAIL
     */
    private void verifyNodeStatus(NodeConfig nodeConfig) {
        RedisClusterNode node = findNodeById(nodeConfig.getNodeId());
        if (node == null) {
            throw new ExpansionException("Node not found in cluster: " + nodeConfig.getNodeId());
        }
        if (node.getFlags().contains(RedisClusterNode.Flag.FAIL)) {
            throw new ExpansionException("Node is in FAIL state: " + nodeConfig.getNodeId());
        }
    }

    /**
     * FIX: was referenced but never defined — confirms the replica is attached to
     * the expected master. NOTE(review): assumes RedisClusterNode exposes
     * getMasterId() for replicas — confirm against the client API in use.
     *
     * @throws ExpansionException if the replica is missing or follows the wrong master
     */
    private void verifyReplicationStatus(NodeConfig slaveConfig, String masterId) {
        RedisClusterNode slave = findNodeById(slaveConfig.getNodeId());
        if (slave == null) {
            throw new ExpansionException(
                "Slave node not found in cluster: " + slaveConfig.getNodeId());
        }
        if (!masterId.equals(slave.getMasterId())) {
            throw new ExpansionException("Slave " + slaveConfig.getNodeId()
                + " is not replicating expected master " + masterId);
        }
    }
}
阶段三:槽位迁移
槽位迁移策略:
/**
* 槽位迁移管理器
*/
@Component
public class SlotMigrationManager {
@Autowired
private RedisClusterConnection clusterConnection;
/**
 * Migrates every slot range in the reallocation plan to its new owner node.
 *
 * @param reallocationPlan target node id -> slot range that node should own
 */
public void migrateSlots(Map<String, SlotRange> reallocationPlan) {
// Migrate in batches to limit the impact on live traffic.
int batchSize = 100; // migrate 100 slots per batch
for (Map.Entry<String, SlotRange> entry : reallocationPlan.entrySet()) {
String targetNodeId = entry.getKey();
SlotRange slotRange = entry.getValue();
// Per-range work is delegated to migrateSlotRange (defined below this view).
migrateSlotRange(targetNodeId, slotRange, batchSize);
}
}
private void migrateSlotRange(String targetNodeId, SlotRange slotRange, int batchSize) {
int startSlot = slotRange.getStart();
int endSlot = slotRange.getEn
剩余60%内容,订阅专栏后可继续查看/也可单篇购买
Java面试圣经 文章被收录于专栏
Java面试圣经,带你练透java圣经
滴滴公司福利 1695人发布