$ ETCDCTL_ENDPOINTS=https://10.x.45.252:2379 etcdctl snapshot save 0608-etcd.db
{"level":"info","ts":1623124958.5931418,"caller":"snapshot/v3_snapshot.go:119","msg":"create temporary db file","path":"0608-etcd.db.part"}
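Before shipping the snapshot anywhere, it can be worth a quick sanity check. A minimal sketch, assuming the file saved above and the same etcdctl 3.4 binary; it prints the snapshot's hash, revision, key count and size, which catches a truncated copy early:

$ etcdctl snapshot status 0608-etcd.db -w table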
$ cd /etc/kubernetes/rollbackcopy/currentVersion.latest
$ cluster-restore.sh .
...stopping kube-apiserver-pod.yml
...stopping kube-controller-manager-pod.yml
...stopping kube-scheduler-pod.yml
...stopping etcd-pod.yml
Waiting for container etcd to stop complete
Waiting for container etcdctl to stop ...................................complete
Waiting for container etcd-metrics to stop complete
Waiting for container kube-controller-manager to stop complete
Waiting for container kube-apiserver to stop .........................complete
Waiting for container kube-scheduler to stop complete
starting restore-etcd static pod
starting kube-apiserver-pod.yml static-pod-resource/kube-apiserver-pod-50/kube-apiserver-pod.yaml
starting kube-controller-manager-pod.yml static-pod-resource/kube-controller-manager-pod-7/kube-controller-manager-pod.yml
starting kube-scheduler-pod.yml static-pod-resource/kube-scheduler-pod-7/kube-scheduler-pod.yml
STATIC_POD_CONTAINERS=("etcd""etcdctl""etcd-metrics""kube-controller-manager""kube-apiserver""kube-scheduler") functionwait_for_containers_to_stop(){ local CONTAINERS=("$@") ctrID
for NAME in"${CONTAINERS[@]}"; do echo"Waiting for container ${NAME} to stop" ctrID="$(crictl ps --label io.kubernetes.container.name=${NAME} -q)" if [ -n "$ctrID" ];then crictl stop $ctrID fi done }
...
    command:
      - /bin/sh
      - -c
      - |
        #!/bin/sh
        set -euo pipefail
        ...
        if [ ! -z $(ls -A "/var/lib/etcd") ]; then
          echo "please delete the contents of data directory before restoring, running the restore script will do this for you"
          exit 1
        fi

        # check if we have backup file to be restored
        # if the file exist, check if it has not changed size in last 5 seconds
        if [ ! -f /var/lib/etcd-backup/snapshot.db ]; then
          echo "please make a copy of the snapshot db file, then move that copy to /var/lib/etcd-backup/snapshot.db"
          exit 1
        else
        ...
Reading the logic: at startup the data directory must be empty, and the backup db file must already exist under /var/lib/etcd-backup/. Checking the volume mounts, this directory maps to the same path on the host. So the plan is to take a backup inside the etcd container on the first master node first, then use that backup file to restore on the other nodes.
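A minimal sketch of that preparation on one of the other masters, assuming the snapshot from the first node has already been copied over as 0608-etcd.db; the target file name and the empty-data-directory requirement come from the restore script above, while /root/etcd-data-backup is just a hypothetical parking spot:

$ sudo mkdir -p /var/lib/etcd-backup /root/etcd-data-backup
$ sudo cp 0608-etcd.db /var/lib/etcd-backup/snapshot.db   # file name the restore script expects
$ sudo mv /var/lib/etcd/* /root/etcd-data-backup/         # leave /var/lib/etcd empty rather than deleting outright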
Looking carefully at the earlier output, the member list seen at each endpoint only contains that node itself. So the approach is to add the other members one by one from master1.
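One way to confirm that quickly, assuming the same ETCDCTL_* certificate environment as the commands above (a sketch, not part of the original session):

$ for ep in https://10.x.45.251:2379 https://10.x.45.252:2379 https://10.x.45.222:2379; do
    echo "== ${ep} =="
    ETCDCTL_ENDPOINTS=${ep} etcdctl member list || true
  done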
$ ETCDCTL_ENDPOINTS=https://10.x.45.251:2379 etcdctl member add master2.openshift4.example.com --peer-urls=https://10.x.45.252:2380
Member 3e27197aa4521ea0 added to cluster 1c2134e7d41c45b1
[root@master1 /]$ ETCDCTL_ENDPOINTS=https://10.x.45.251:2379 etcdctl member list
3e27197aa4521ea0, unstarted, , https://10.x.45.252:2380, , false
831fd1ef9bc83a2b, started, master1.openshift4.example.com, https://10.x.45.251:2380, https://10.x.45.251:2379, false
[root@master1 /]$ ETCDCTL_ENDPOINTS=https://10.x.45.251:2379 etcdctl member list
3e27197aa4521ea0, unstarted, , https://10.x.45.252:2380, , false
831fd1ef9bc83a2b, started, master1.openshift4.example.com, https://10.x.45.251:2380, https://10.x.45.251:2379, false
[root@master1 /]$ ETCDCTL_ENDPOINTS=https://10.x.45.251:2379 etcdctl member list
3e27197aa4521ea0, started, master2.openshift4.example.com, https://10.x.45.252:2380, https://10.x.45.252:2379, false
831fd1ef9bc83a2b, started, master1.openshift4.example.com, https://10.x.45.251:2380, https://10.x.45.251:2379, false
[root@master1 /]$ etcdctl endpoint status -w table
{"level":"warn","ts":"2021-06-08T09:26:16.669Z","caller":"clientv3/retry_interceptor.go:62","msg":"retrying of unary invoker failed","target":"passthrough:///https://10.x.45.222:2379","attempt":0,"error":"rpc error: code = DeadlineExceeded desc = latest balancer error: connection error: desc = \"transport: Error while dialing dial tcp 10.x.45.222:2379: connect: connection refused\""}
Failed to get the status of endpoint https://10.x.45.222:2379 (context deadline exceeded)
+----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+
|          ENDPOINT          |        ID        | VERSION | DB SIZE | IS LEADER | IS LEARNER | RAFT TERM | RAFT INDEX | RAFT APPLIED INDEX | ERRORS |
+----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+
|   https://10.x.45.251:2379 | 831fd1ef9bc83a2b |   3.4.9 |  724 MB |      true |      false |       632 |       1041 |               1041 |        |
|   https://10.x.45.252:2379 | 3e27197aa4521ea0 |   3.4.9 |  724 MB |     false |      false |       632 |       1041 |               1041 |        |
+----------------------------+------------------+---------+---------+-----------+------------+-----------+------------+--------------------+--------+
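Before adding the next member it can also be worth confirming both endpoints are actually serving, not just listed. A sketch using the standard etcdctl health check (not part of the original session):

[root@master1 /]$ ETCDCTL_ENDPOINTS=https://10.x.45.251:2379,https://10.x.45.252:2379 etcdctl endpoint health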
Good news. Next, restore the third node and add the third member:
[root@master1 /]$ ETCDCTL_ENDPOINTS=https://10.x.45.251:2379 etcdctl member add master3.openshift4.example.com --peer-urls=https://10.x.45.222:2380
Member d319fa1cbb0e28fe added to cluster 1c2134e7d41c45b1
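Once the etcd pod on master3 comes up against the restored data, the same checks as before should show three started members and a full status table. A sketch, assuming the same endpoints as above:

[root@master1 /]$ ETCDCTL_ENDPOINTS=https://10.x.45.251:2379 etcdctl member list
[root@master1 /]$ ETCDCTL_ENDPOINTS=https://10.x.45.251:2379,https://10.x.45.252:2379,https://10.x.45.222:2379 etcdctl endpoint status -w table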
...
#!/bin/sh
set -euo pipefail

etcdctl member list || true

# this has a non-zero return code if the command is non-zero. If you use an export first, it doesn't and you
# will succeed when you should fail.
ETCD_INITIAL_CLUSTER=$(discover-etcd-initial-cluster \
  --cacert=/etc/kubernetes/static-pod-certs/configmaps/etcd-serving-ca/ca-bundle.crt \
  --cert=/etc/kubernetes/static-pod-certs/secrets/etcd-all-peer/etcd-peer-master11.cluster.lonlife.dev.crt \
  --key=/etc/kubernetes/static-pod-certs/secrets/etcd-all-peer/etcd-peer-master11.cluster.lonlife.dev.key \
  --endpoints=${ALL_ETCD_ENDPOINTS} \
  --data-dir=/var/lib/etcd \
  --target-peer-url-host=${NODE_master11_cluster_lonlife_dev_ETCD_URL_HOST} \
  --target-name=master11.cluster.lonlife.dev)
export ETCD_INITIAL_CLUSTER

# we cannot use the "normal" port conflict initcontainer because when we upgrade, the existing static pod will never yield,
# so we do the detection in etcd container itself.
echo -n "Waiting for ports 2379, 2380 and 9978 to be released."
while [ -n "$(ss -Htan '( sport = 2379 or sport = 2380 or sport = 9978 )')" ]; do
  echo -n "."
  sleep 1
done

export ETCD_NAME=${NODE_master11_cluster_lonlife_dev_ETCD_NAME}
env | grep ETCD | grep -v NODE

set -x
# See https://etcd.io/docs/v3.4.0/tuning/ for why we use ionice
exec ionice -c2 -n0 etcd \
  --log-level=info \
  --initial-advertise-peer-urls=https://${NODE_master11_cluster_lonlife_dev_IP}:2380 \
  --cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-serving/etcd-serving-master11.cluster.lonlife.dev.crt \
  --key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-serving/etcd-serving-master11.cluster.lonlife.dev.key \
  --trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-serving-ca/ca-bundle.crt \
  --client-cert-auth=true \
  --peer-cert-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-peer/etcd-peer-master11.cluster.lonlife.dev.crt \
  --peer-key-file=/etc/kubernetes/static-pod-certs/secrets/etcd-all-peer/etcd-peer-master11.cluster.lonlife.dev.key \
  --peer-trusted-ca-file=/etc/kubernetes/static-pod-certs/configmaps/etcd-peer-client-ca/ca-bundle.crt \
  --peer-client-cert-auth=true \
  --advertise-client-urls=https://${NODE_master11_cluster_lonlife_dev_IP}:2379 \
  --listen-client-urls=https://0.0.0.0:2379 \
  --listen-peer-urls=https://0.0.0.0:2380 \
  --listen-metrics-urls=https://0.0.0.0:9978 || mv /etc/kubernetes/etcd-backup-dir/etcd-member.yaml /etc/kubernetes/manifests
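For reference, discover-etcd-initial-cluster emits its result in etcd's standard --initial-cluster format, i.e. comma-separated name=peer-URL pairs. For the three-master example earlier in this post the value would look roughly like the sketch below; the names and IPs are taken from the member list output above, but the exact string is an assumption, not captured output:

# assumed shape of the discovered value for the master1/2/3 cluster above
ETCD_INITIAL_CLUSTER="master1.openshift4.example.com=https://10.x.45.251:2380,master2.openshift4.example.com=https://10.x.45.252:2380,master3.openshift4.example.com=https://10.x.45.222:2380"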