异常现象

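集群中某个节点(k8s-master1)的 etcd 无法启动,其他节点正常。从下面日志中的 panic(freepages: failed to get all reachable pages ... out of bounds)可以看出,该节点的 bbolt 数据文件已损坏。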

systemctl status etcd
● etcd.service - etcd
   Loaded: loaded (/etc/systemd/system/etcd.service; enabled; vendor preset: disabled)
   Active: activating (auto-restart) (Result: exit-code) since Thu 2025-09-18 09:59:47 CST; 7s ago
  Process: 7427 ExecStart=/usr/local/bin/etcd (code=exited, status=2)
 Main PID: 7427 (code=exited, status=2)

Sep 18 09:59:47 k8s-master1 systemd[1]: etcd.service: main process exited, code=exited, status=2/INVALIDARGUMENT
Sep 18 09:59:47 k8s-master1 systemd[1]: Failed to start etcd.
Sep 18 09:59:47 k8s-master1 systemd[1]: Unit etcd.service entered failed state.
Sep 18 09:59:47 k8s-master1 systemd[1]: etcd.service failed.
journalctl -xe
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_AUTO_COMPACTION_RETENTION=8
Sep 18 09:59:57 k8s-master1 etcd[7598]: [WARNING] Deprecated '--logger=capnslog' flag is set; use '--logger=zap' flag instead
Sep 18 09:59:57 k8s-master1 etcd[7598]: [WARNING] Deprecated '--logger=capnslog' flag is set; use '--logger=zap' flag instead
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_CERT_FILE=/etc/ssl/etcd/ssl/member-k8s-master1.pem
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_CLIENT_CERT_AUTH=true
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_DATA_DIR=/var/lib/etcd
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_ELECTION_TIMEOUT=5000
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_ENABLE_V2=true
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_HEARTBEAT_INTERVAL=250
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_INITIAL_ADVERTISE_PEER_URLS=https://192.168.127.11:2380
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_INITIAL_CLUSTER=etcd-k8s-master1=https://192.168.127.11:2380,etcd-k8s-master2=https://192.168.127.12:2380,etcd-k8s
Sep 18 09:59:57 k8s-master1 etcd[7598]: panic: freepages: failed to get all reachable pages (page 6657851291078908972: out of bounds: 22020)
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_INITIAL_CLUSTER_STATE=existing
Sep 18 09:59:57 k8s-master1 etcd[7598]: goroutine 96 [running]:
Sep 18 09:59:57 k8s-master1 etcd[7598]: go.etcd.io/bbolt.(*DB).freepages.func2(0xc000190240)
Sep 18 09:59:57 k8s-master1 etcd[7598]: /home/ANT.AMAZON.COM/leegyuho/go/pkg/mod/go.etcd.io/bbolt@v1.3.3/db.go:1003 +0xe5
Sep 18 09:59:57 k8s-master1 etcd[7598]: created by go.etcd.io/bbolt.(*DB).freepages
Sep 18 09:59:57 k8s-master1 etcd[7598]: /home/ANT.AMAZON.COM/leegyuho/go/pkg/mod/go.etcd.io/bbolt@v1.3.3/db.go:1001 +0x1b5
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_INITIAL_CLUSTER_TOKEN=k8s_etcd
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_KEY_FILE=/etc/ssl/etcd/ssl/member-k8s-master1-key.pem
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_LISTEN_CLIENT_URLS=https://192.168.127.11:2379,https://127.0.0.1:2379
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_LISTEN_PEER_URLS=https://192.168.127.11:2380
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_METRICS=basic
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_NAME=etcd-k8s-master1
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_PEER_CERT_FILE=/etc/ssl/etcd/ssl/member-k8s-master1.pem
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_PEER_CLIENT_CERT_AUTH=True
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_PEER_KEY_FILE=/etc/ssl/etcd/ssl/member-k8s-master1-key.pem
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_PEER_TRUSTED_CA_FILE=/etc/ssl/etcd/ssl/ca.pem
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_PROXY=off
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_SNAPSHOT_COUNT=10000
Sep 18 09:59:57 k8s-master1 etcd[7598]: recognized and used environment variable ETCD_TRUSTED_CA_FILE=/etc/ssl/etcd/ssl/ca.pem
Sep 18 09:59:57 k8s-master1 etcd[7598]: etcd Version: 3.4.13
Sep 18 09:59:57 k8s-master1 etcd[7598]: Git SHA: ae9734ed2
Sep 18 09:59:57 k8s-master1 etcd[7598]: Go Version: go1.12.17
Sep 18 09:59:57 k8s-master1 etcd[7598]: Go OS/Arch: linux/amd64
Sep 18 09:59:57 k8s-master1 etcd[7598]: setting maximum number of CPUs to 4, total number of available CPUs is 4
Sep 18 09:59:57 k8s-master1 etcd[7598]: the server is already initialized as member before, starting as etcd member...
Sep 18 09:59:57 k8s-master1 etcd[7598]: peerTLS: cert = /etc/ssl/etcd/ssl/member-k8s-master1.pem, key = /etc/ssl/etcd/ssl/member-k8s-master1-key.pem, trusted-ca = /etc/ssl/etcd/ssl/ca.pem, client-cert
Sep 18 09:59:57 k8s-master1 etcd[7598]: name = etcd-k8s-master1
Sep 18 09:59:57 k8s-master1 etcd[7598]: data dir = /var/lib/etcd
Sep 18 09:59:57 k8s-master1 etcd[7598]: member dir = /var/lib/etcd/member
Sep 18 09:59:57 k8s-master1 etcd[7598]: heartbeat = 250ms
Sep 18 09:59:57 k8s-master1 etcd[7598]: election = 5000ms
Sep 18 09:59:57 k8s-master1 etcd[7598]: snapshot count = 10000
Sep 18 09:59:57 k8s-master1 etcd[7598]: advertise client URLs = https://192.168.127.11:2379
Sep 18 09:59:57 k8s-master1 etcd[7598]: initial advertise peer URLs = https://192.168.127.11:2380
Sep 18 09:59:57 k8s-master1 etcd[7598]: initial cluster = 
Sep 18 09:59:57 k8s-master1 systemd[1]: etcd.service: main process exited, code=exited, status=2/INVALIDARGUMENT
Sep 18 09:59:57 k8s-master1 systemd[1]: Failed to start etcd.

解决方案

将损坏的数据目录移走备份(不直接删除),重建空的数据目录后启动 etcd,由该节点从集群其他正常节点重新同步数据

# Stop the unit first: systemd keeps auto-restarting the crashing etcd,
# and the data directory should not be in use while it is swapped out.
systemctl stop etcd

# Keep the corrupted data as a timestamped backup instead of deleting it.
mv /var/lib/etcd "/var/lib/etcd.bak.$(date +%F-%H%M)"

# Recreate an empty data directory with restrictive permissions.
mkdir -p /var/lib/etcd
# NOTE(review): assumes the etcd service runs as root - confirm against the
# User= setting of etcd.service before relying on this ownership.
chown -R root:root /var/lib/etcd
chmod 700 /var/lib/etcd

# NOTE(review): etcd's runtime-reconfiguration guide recommends removing a
# member that lost its data and re-adding it from a healthy node
# (etcdctl member remove / member add) before starting it with
# initial-cluster-state=existing - confirm whether that step is needed here.
systemctl start etcd
systemctl status etcd

etcd 备份脚本(backup.sh)

#!/bin/bash
#
# backup.sh - take an etcd v3 snapshot and rotate old backups.
#
# Saves a snapshot from $ENDPOINTS into a new timestamped directory under
# the backup root, writes the etcdctl output to snapshot.log next to it,
# then keeps only the newest $KEEP backup directories.
#
# Exit status: 0 on success; 1 if the backup directory cannot be created,
# the snapshot fails, or the backup root cannot be entered.

ETCDCTL_PATH='/usr/local/bin/etcdctl'
ENDPOINTS='https://k8s-master1:2379'
BACKUP_DIR="/root/backups/kube_etcd/etcd-$(date +%Y%m%d_%H%M%S)"

ETCDCTL_CERT="/etc/ssl/etcd/ssl/admin-k8s-master1.pem"
ETCDCTL_KEY="/etc/ssl/etcd/ssl/admin-k8s-master1-key.pem"
ETCDCTL_CA_FILE="/etc/ssl/etcd/ssl/ca.pem"

# Number of backup directories to retain.
KEEP=10

if ! mkdir -p "$BACKUP_DIR"; then
    echo "[$(date)] cannot create backup directory: $BACKUP_DIR" >&2
    exit 1
fi

export ETCDCTL_API=3
if "$ETCDCTL_PATH" --endpoints="$ENDPOINTS" snapshot save "$BACKUP_DIR/snapshot.db" \
   --cacert="$ETCDCTL_CA_FILE" \
   --cert="$ETCDCTL_CERT" \
   --key="$ETCDCTL_KEY" > "$BACKUP_DIR/snapshot.log" 2>&1; then
    echo "[$(date)] etcd snapshot saved: $BACKUP_DIR/snapshot.db"
else
    echo "[$(date)] etcd snapshot failed! See $BACKUP_DIR/snapshot.log"
    exit 1
fi

# Keep the newest $KEEP backups.
# Only directories named etcd-* are rotation candidates, so unrelated files
# placed in the backup root are never deleted. The timestamp in the name
# sorts chronologically, so glob order is oldest-first.
cd "$(dirname "$BACKUP_DIR")" || exit 1
shopt -s nullglob
backups=(etcd-*/)
excess=$(( ${#backups[@]} - KEEP ))
if (( excess > 0 )); then
    rm -rf -- "${backups[@]:0:excess}"
fi

etcd 恢复脚本(restore.sh)

#!/bin/bash
#
# restore.sh - restore this node's etcd data directory from a v3 snapshot.
#
# Usage: restore.sh <path to snapshot.db>
#
# Steps: validate the argument, restore the snapshot into a staging
# directory, stop etcd, move the current data directory aside as a
# timestamped backup, move the restored data into place, start etcd.
# The live data directory is not touched until the restore has succeeded.
#
# Exit status: 1 on bad usage, missing snapshot, or any failed step;
# otherwise the status of the final `systemctl status`.
#
# NOTE(review): ETCD_NAME, ETCD_INITIAL_CLUSTER and
# ETCD_INITIAL_CLUSTER_TOKEN below must agree with this node's real etcd
# configuration - confirm them before running.
# NOTE(review): etcd's disaster-recovery guide restores every member of a
# cluster from the same snapshot; confirm the procedure before restoring a
# single member of a running multi-node cluster.

ETCDCTL_PATH="/usr/local/bin/etcdctl"
SNAPSHOT_FILE="$1"   # path to the snapshot.db to restore from
RESTORE_DIR="/var/lib/etcd-restore"   # staging directory for the restore
NEW_ETCD_DATA_DIR="/var/lib/etcd"
ETCD_NAME="k8s-master1"   # name of this node (must match the systemd configuration)
ETCD_INITIAL_CLUSTER="k8s-master1=https://k8s-master1:2380,k8s-master2=https://k8s-master2:2380,k8s-master3=https://k8s-master3:2380"
ETCD_INITIAL_CLUSTER_TOKEN="etcd-cluster"
ETCD_INITIAL_ADVERTISE_PEER_URLS="https://k8s-master1:2380"

if [ -z "$SNAPSHOT_FILE" ]; then
    echo "用法: $0 <snapshot.db 路径>"
    exit 1
fi

# Make sure the snapshot file exists.
if [ ! -f "$SNAPSHOT_FILE" ]; then
    echo "错误: 找不到快照文件 $SNAPSHOT_FILE"
    exit 1
fi

# `snapshot restore` refuses to write into an existing data directory, so
# clear any staging directory left behind by an earlier, interrupted run.
if [ -e "$RESTORE_DIR" ]; then
    echo "===> 清理上次遗留的临时目录 $RESTORE_DIR ..."
    rm -rf -- "${RESTORE_DIR:?}"
fi

# Restore into the staging directory first; the live data is untouched and
# etcd keeps running if this step fails.
echo "===> 从 snapshot 恢复到 $RESTORE_DIR ..."
if ! ETCDCTL_API=3 "$ETCDCTL_PATH" snapshot restore "$SNAPSHOT_FILE" \
  --name "$ETCD_NAME" \
  --data-dir "$RESTORE_DIR" \
  --initial-cluster "$ETCD_INITIAL_CLUSTER" \
  --initial-cluster-token "$ETCD_INITIAL_CLUSTER_TOKEN" \
  --initial-advertise-peer-urls "$ETCD_INITIAL_ADVERTISE_PEER_URLS"; then
    echo "错误: snapshot 恢复失败,现有数据目录 $NEW_ETCD_DATA_DIR 未被改动" >&2
    rm -rf -- "${RESTORE_DIR:?}"
    exit 1
fi

echo "===> 停止 etcd 服务..."
if ! systemctl stop etcd; then
    echo "错误: 无法停止 etcd 服务" >&2
    exit 1
fi

# Move the old data aside instead of deleting it so it can be put back by
# hand if the restored data turns out to be unusable. The backup stays on
# disk until it is removed manually.
if [ -e "$NEW_ETCD_DATA_DIR" ]; then
    OLD_DATA_BACKUP="${NEW_ETCD_DATA_DIR}.bak.$(date +%Y%m%d_%H%M%S)"
    echo "===> 备份旧数据目录到 $OLD_DATA_BACKUP ..."
    if ! mv -- "$NEW_ETCD_DATA_DIR" "$OLD_DATA_BACKUP"; then
        echo "错误: 无法备份旧数据目录 $NEW_ETCD_DATA_DIR" >&2
        exit 1
    fi
fi

echo "===> 替换 etcd 数据目录..."
if ! mv -- "$RESTORE_DIR" "$NEW_ETCD_DATA_DIR"; then
    echo "错误: 无法将 $RESTORE_DIR 移动到 $NEW_ETCD_DATA_DIR" >&2
    exit 1
fi

echo "===> 启动 etcd 服务..."
systemctl start etcd

echo "===> 检查 etcd 状态..."
systemctl status etcd --no-pager