feat: add advanced incidents module (cpu high, network down)

This commit is contained in:
likingcode
2026-03-10 17:17:51 +08:00
parent 2c407efbda
commit a07409af64
2 changed files with 176 additions and 6 deletions

View File

@@ -1,13 +1,13 @@
{
"meta": {
"version": "4.2",
"version": "4.3",
"title": "Linux 系统学习课程(运维全场景版)",
"author": "OpenClaw Dev",
"updated": "2026-03-10",
"description": "强调知识理解、场景迁移与运维全场景覆盖的 Linux 学习课程",
"module_count": 10,
"total_lessons": 30,
"total_exercises": 90,
"module_count": 11,
"total_lessons": 32,
"total_exercises": 95,
"pedagogy": "learning-first",
"orientation": "ops-full-scenarios",
"source_style": "classic-linux-textbook-inspired"
@@ -2231,6 +2231,165 @@
]
}
]
},
{
"id": "module_11_incidents2",
"title": "模块 11：综合事故专题进阶",
"summary": "继续用场景驱动的方式训练 CPU 异常、网络不通等更高频事故的排障顺序。",
"lessons": [
{
"id": "m11_l1_cpu_high",
"title": "场景：CPU 飙高排查",
"goal": "建立 CPU 异常排查顺序:先确认负载与进程,再定位原因与缓解措施。",
"why_it_matters": "CPU 异常会直接影响延迟与吞吐,是最常见的线上事故信号之一。",
"concepts": [
"load average vs CPU 使用率",
"top/ps 的阅读方式",
"定位热进程与线程",
"短期缓解 vs 根因修复"
],
"command": "top / ps / kill",
"examples": [
"top",
"ps aux --sort=-%cpu | head",
"kill -TERM <pid>"
],
"pitfalls": [
"只看 load average 不看 CPU 核数和 I/O 情况",
"一上来 kill -9 导致数据损坏"
],
"scenarios": [
"接口延迟上升但网络正常",
"机器风扇狂转、CPU 使用率长期 100%"
],
"troubleshooting_flow": [
"先确认现象：top 看整体 CPU 与 load average",
"定位元凶:按 CPU 排序找到最热进程/线程",
"确认影响:是否是业务进程、是否可重启或降级",
"短期缓解:优先温和信号或限流/重启",
"根因修复:回到日志/发布变更/代码热点"
],
"related_commands": [
"top",
"ps",
"kill",
"pkill",
"journalctl"
],
"classic_view": "教材视角：CPU 排障的关键是把“现象→进程→原因”串成链路,而不是看到 100% 就盲目重启。",
"takeaways": [
"形成分层排障顺序,而不是遇到问题就随手试命令。",
"CPU 异常优先定位热进程,再决定缓解手段。"
],
"after_class": "课后建议:模拟一个死循环进程(或阅读示例输出),练习从 top/ps 定位到 PID，再思考温和退出与强制退出的差别。",
"exercises": [
{
"id": "m11_l1_e1",
"type": "operation",
"title": "第一步:查看整体 CPU/负载",
"hint": "top",
"success_test": "cmd == 'top' and ('load average' in output or 'Tasks' in output)",
"solution": [
"top"
],
"success_msg": "✅ 看到了整体态势,继续定位热进程。"
},
{
"id": "m11_l1_e2",
"type": "operation",
"title": "第二步:定位最吃 CPU 的进程(示例)",
"hint": "ps aux --sort=-%cpu | head",
"success_test": "'%CPU' in output or 'python' in output or 'java' in output or 'nginx' in output",
"solution": [
"ps aux --sort=-%cpu | head"
],
"success_msg": "✅ 已定位热进程,下一步考虑缓解措施。"
}
]
},
{
"id": "m11_l2_network_down",
"title": "场景:网络不通排查",
"goal": "建立网络不通的分层排障：IP/链路 → DNS → 端口 → 请求。",
"why_it_matters": "网络问题最容易“混层”,正确顺序能显著缩短定位时间。",
"concepts": [
"链路层/地址层/名称解析",
"端口监听 vs 连通性",
"用 curl 验证应用层"
],
"command": "ip / ping / dig / ss / curl",
"examples": [
"ip addr",
"ping -c 4 127.0.0.1",
"dig example.com",
"ss -ltnp | grep 80",
"curl -I http://127.0.0.1"
],
"pitfalls": [
"把 DNS 失败当成网络彻底不通",
"只看端口 LISTEN 不发请求验证"
],
"scenarios": [
"域名访问失败但 IP 可通",
"本机服务正常但外部访问失败"
],
"troubleshooting_flow": [
"先看本机地址：ip addr 是否有正确 IP",
"再看基础连通：ping 网关/目标 IP",
"再看 DNS：dig 域名解析是否正确",
"再看端口：ss/netstat 是否监听",
"最后发请求：curl 验证应用层"
],
"related_commands": [
"ip",
"ping",
"dig",
"ss",
"curl"
],
"classic_view": "教材视角:网络排障要分层,一层层排除,不要上来就改防火墙或重启。",
"takeaways": [
"形成分层排障顺序,而不是遇到问题就随手试命令。",
"先确认地址与连通性,再看 DNS/端口/请求。"
],
"after_class": "课后建议分别模拟“DNS 错”“端口未监听”“服务返回异常”三种情况,练习用同一套顺序识别差异。",
"exercises": [
{
"id": "m11_l2_e1",
"type": "operation",
"title": "第一步:确认地址信息",
"hint": "ip addr",
"success_test": "cmd.startswith('ip') and 'inet' in output",
"solution": [
"ip addr"
],
"success_msg": "✅ 地址信息正常,继续检查连通性。"
},
{
"id": "m11_l2_e2",
"type": "operation",
"title": "第二步:确认基础连通(本机)",
"hint": "ping -c 4 127.0.0.1",
"success_test": "cmd.startswith('ping') and 'packet loss' in output",
"solution": [
"ping -c 4 127.0.0.1"
],
"success_msg": "✅ 基础连通性 OK，继续检查 DNS/端口。"
},
{
"id": "m11_l2_e3",
"type": "operation",
"title": "第三步:确认 DNS 解析",
"hint": "dig example.com",
"success_test": "cmd.startswith('dig') and 'ANSWER SECTION' in output",
"solution": [
"dig example.com"
],
"success_msg": "✅ DNS 解析结果已拿到,继续端口与请求验证。"
}
]
}
]
}
]
}