zhangguanzhang's Blog

arm64 redis COW 检测

字数统计: 984阅读时长: 5 min
2024/05/22

关于 redis arm64 COW 检测

由来

redis 从 5.0.14 开始,在 arm64 上启动时会检查内核是否存在 MADV_FREE/fork 相关的 kernel bug,参照以下 issue 和 pr

影响

根据代码逻辑,redis arm64 上启动的时候会检测,如果存在 bug,则不会启动,类似下面的日志:

1
2
3
# WARNING Your kernel has a bug that could lead to data corruption during background save. Please upgrade to the latest stable kernel.
# Redis will now exit to prevent data corruption. Note that it is possible to suppress this warning by setting the following config: ignore-warnings ARM64-COW-BUG
WARN exited: redis (exit status 1; not expected)

需要在配置文件里添加 ignore-warnings ARM64-COW-BUG 才行。国产的麒麟和一些其他系统的内核以 backport 居多。同事说部分系统上 redis 配置里加上该配置后会无法启动,不加反而能启动;但是我看代码,是先检查 COW bug,检测不通过并且没有配置该选项时才会退出:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/* Startup check excerpt: run the kernel test and decide whether to abort.
 * The branches below treat ret < 0 as "bug detected" and ret == 0 as
 * "the test itself could not be completed"; ret > 0 skips the whole block. */
int ret;
if ((ret = checkLinuxMadvFreeForkBug(&err_msg)) <= 0) {
if (ret < 0) {
/* Bug detected: log the message filled in by the check, then release it. */
serverLog(LL_WARNING, "WARNING %s", err_msg);
sdsfree(err_msg);
} else
/* ret == 0: the test failed to run, so the kernel state is unknown. */
serverLog(LL_WARNING, "Failed to test the kernel for a bug that could lead to data corruption during background save. "
"Your system could be affected, please report this error.");
/* Reached for both cases above (this `if` is not part of the `else`):
 * exit unless checkIgnoreWarning() reports ARM64-COW-BUG as ignored. */
if (!checkIgnoreWarning("ARM64-COW-BUG")) {
serverLog(LL_WARNING,"Redis will now exit to prevent data corruption. "
"Note that it is possible to suppress this warning by setting the following config: ignore-warnings ARM64-COW-BUG");
exit(1);
}
}

对于他说的现象我是存疑的。

扣检测代码

发现 golang 在 arm64 上没有 syscall.SYS_FORK(编译会报 undefined: syscall.SYS_FORK),所以还是直接把检测用的 C 函数扣出来,编译成二进制,然后加到 redis 的 docker-entrypoint.sh 里,根据检测结果追加配置:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
// https://github.com/redis/redis/blob/e92363e248019b8bf3fc7dd8ce84f648b6b13473/src/syscheck.c#L174-L299

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>
#include <string.h>

#ifndef MADV_FREE
#define MADV_FREE 8
#endif

/* Create a pipe and store its two descriptors in fds[0] (read end) and
 * fds[1] (write end). Returns whatever pipe() returns.
 *
 * read_flags and write_flags are accepted so the call site keeps its
 * three-argument form, but this stub does not apply them. */
int anetPipe(int *fds, int read_flags, int write_flags) {
    int rc;

    (void)read_flags;
    (void)write_flags;

    rc = pipe(fds);
    return rc;
}

/* Get size in kilobytes of the Shared_Dirty pages of the calling process for
 * the memory map corresponding to the provided address.
 *
 * Returns -1 if /proc/self/smaps cannot be opened, if no mapping containing
 * `addr` is followed by a Shared_Dirty field, or if that field cannot be
 * parsed. */
int smapsGetSharedDirty(unsigned long addr) {
    int in_mapping = 0, val = -1;
    int at_line_start = 1;
    unsigned long from, to;
    char buf[64];
    FILE *f;

    f = fopen("/proc/self/smaps", "r");
    if (!f) return -1;

    while (fgets(buf, sizeof(buf), f)) {
        int is_line_start = at_line_start;

        /* A line longer than the buffer is delivered in several chunks.
         * Only the first chunk can hold a mapping header or a field name;
         * a continuation chunk (e.g. the middle of a long path) must not be
         * parsed, otherwise text such as "c-2" would be taken for a
         * "from-to" address range. */
        at_line_start = strchr(buf, '\n') != NULL;
        if (!is_line_start) continue;

        /* Mapping header line: remember whether it covers addr. */
        if (sscanf(buf, "%lx-%lx", &from, &to) == 2)
            in_mapping = from <= addr && addr < to;

        /* strncmp stops at the terminating NUL, so bytes left over from a
         * previous, longer line are never compared. */
        if (in_mapping && !strncmp(buf, "Shared_Dirty:", 13)) {
            /* If parsing fails, we remain with val == -1 */
            sscanf(buf, "%*s %d", &val);
            break;
        }
    }

    fclose(f);
    return val;
}

/* Older arm64 Linux kernels have a bug that could lead to data corruption
 * during background save in certain scenarios. This function checks if the
 * kernel is affected.
 * The bug was fixed in commit ff1712f953e27f0b0718762ec17d0adb15c9fd0b
 * titled: "arm64: pgtable: Ensure dirty bit is preserved across pte_wrprotect()"
 *
 * Return value:
 *    1  the check ran and did not detect the bug (also returned when
 *       madvise() rejects MADV_FREE with EINVAL).
 *   -1  the bug was detected; if error_msg is not NULL, *error_msg is set to
 *       a strdup()-allocated message that the caller must free().
 *    0  the check could not be completed (a system call failed, or the
 *       child did not report a result).
 */
int checkLinuxMadvFreeForkBug(char **error_msg) {
    int ret, pipefd[2] = { -1, -1 };
    pid_t pid;
    char *p, *q;
    int res = 1;
    long page_size = sysconf(_SC_PAGESIZE);
    size_t map_size;

    /* Without a valid page size the mapping below cannot be sized. */
    if (page_size <= 0) return 0;
    map_size = 3 * (size_t)page_size;

    /* Private anonymous mapping of three pages, initially read-only. */
    p = mmap(NULL, map_size, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
    if (p == MAP_FAILED) return 0;

    /* q is the middle page; only this page is made writable. */
    q = p + page_size;

    ret = mprotect(q, page_size, PROT_READ | PROT_WRITE);
    if (ret < 0) {
        res = 0;
        goto cleanup;
    }

    /* First write to the middle page, before it is marked MADV_FREE. */
    *(volatile char*)q = 0;

    ret = madvise(q, page_size, MADV_FREE);
    if (ret < 0) {
        /* EINVAL: MADV_FREE was rejected, so the scenario cannot be
         * exercised; res stays 1. Any other error means the test failed. */
        if (errno != EINVAL) res = 0;
        goto cleanup;
    }

    /* Write to the page again after it was marked MADV_FREE. */
    *(volatile char*)q = 0;

    /* Pipe used by the child to hand its verdict back to the parent. */
    ret = anetPipe(pipefd, 0, 0);
    if (ret < 0) {
        res = 0;
        goto cleanup;
    }

    pid = fork();
    if (pid < 0) {
        res = 0;
        goto cleanup;
    } else if (!pid) {
        /* Child: inspect Shared_Dirty of the middle page's mapping.
         * 0 kB is reported as the bug (-1), a read failure as 0, and
         * anything else leaves res at 1. */
        ret = smapsGetSharedDirty((unsigned long) q);
        if (!ret)
            res = -1;
        else if (ret == -1)
            res = 0;

        /* Best effort: if this write fails the parent sees EOF and
         * reports 0. */
        ret = write(pipefd[1], &res, sizeof(res));
        (void)ret;
        /* _exit() so the child does not run the parent's atexit handlers
         * or flush its inherited stdio buffers. */
        _exit(0);
    } else {
        int child_res = 0;
        ssize_t nread;

        /* Close our copy of the write end first; otherwise read() could
         * block forever if the child dies before writing. */
        close(pipefd[1]);
        pipefd[1] = -1;

        do {
            nread = read(pipefd[0], &child_res, sizeof(child_res));
        } while (nread < 0 && errno == EINTR);

        /* Anything but a complete result means the test did not finish. */
        res = (nread == (ssize_t)sizeof(child_res)) ? child_res : 0;

        waitpid(pid, NULL, 0);
    }

cleanup:
    if (pipefd[0] != -1) close(pipefd[0]);
    if (pipefd[1] != -1) close(pipefd[1]);
    munmap(p, map_size);

    if (res == -1 && error_msg) {
        *error_msg = strdup("Your kernel has a bug that could lead to data corruption during background save. Please upgrade to the latest stable kernel.");
    }

    return res;
}

/* Run the kernel check once and print the verdict on stdout.
 * Exit status: 1 when the bug is detected, 0 otherwise (this includes the
 * case where the check could not be completed). */
int main() {
    char *error_msg = NULL;
    int status = 0;

    switch (checkLinuxMadvFreeForkBug(&error_msg)) {
    case -1:
        /* Bug detected: show the message from the check, then release it. */
        printf("Kernel bug detected: %s\n", error_msg);
        free(error_msg);
        status = 1;
        break;
    case 1:
        printf("No kernel bug detected.\n");
        break;
    default:
        /* Any other result means the check itself failed. */
        printf("Error checking for kernel bug.\n");
        break;
    }

    return status;
}

arm64 上编译成静态文件:

1
gcc -o check_bug -static check_bug.c

然后构建 docker 镜像:

1
2
3
4
5
6
7
8
# Machine type of the build host; can be overridden on the command line.
arch ?= $(shell uname -m)

# Neither target creates a file named after itself, so declare them phony;
# otherwise a stray file called "build" or "image" would make them no-ops.
.PHONY: build image

# On aarch64 hosts, compile the checker as a static binary. The output keeps
# the .sh suffix (presumably so a Dockerfile `COPY *.sh` picks it up).
build:
ifeq ($(arch), aarch64)
	gcc -o check_bug.sh -static check_bug.c
endif

# Build the docker image after the checker has been built.
image: build
	docker build -t xxx .

.gitignore 忽略二进制提交

1
check_bug.sh

dockerfile 里 COPY *.sh

CATALOG
  1. 1. 由来
  2. 2. 影响
  3. 3. 扣检测代码