From a07409af645595f2863e3a4bb88e3365d1b97ac0 Mon Sep 17 00:00:00 2001 From: likingcode Date: Tue, 10 Mar 2026 17:17:51 +0800 Subject: [PATCH] feat: add advanced incidents module (cpu high, network down) --- COURSE_TASKS.json | 167 ++++++++++++++++++++++++++++++++++++++++++++-- sandbox.py | 15 ++++- 2 files changed, 176 insertions(+), 6 deletions(-) diff --git a/COURSE_TASKS.json b/COURSE_TASKS.json index b8fa951..e357325 100644 --- a/COURSE_TASKS.json +++ b/COURSE_TASKS.json @@ -1,13 +1,13 @@ { "meta": { - "version": "4.2", + "version": "4.3", "title": "Linux 系统学习课程(运维全场景版)", "author": "OpenClaw Dev", "updated": "2026-03-10", "description": "强调知识理解、场景迁移与运维全场景覆盖的 Linux 学习课程", - "module_count": 10, - "total_lessons": 30, - "total_exercises": 90, + "module_count": 11, + "total_lessons": 32, + "total_exercises": 105, "pedagogy": "learning-first", "orientation": "ops-full-scenarios", "source_style": "classic-linux-textbook-inspired" @@ -2231,6 +2231,165 @@ ] } ] + }, + { + "id": "module_11_incidents2", + "title": "模块 11:综合事故专题(进阶)", + "summary": "继续用场景驱动的方式训练 CPU 异常、网络不通等更高频事故的排障顺序。", + "lessons": [ + { + "id": "m11_l1_cpu_high", + "title": "场景:CPU 飙高排查", + "goal": "建立 CPU 异常排查顺序:先确认负载与进程,再定位原因与缓解措施。", + "why_it_matters": "CPU 异常会直接影响延迟与吞吐,是最常见的线上事故信号之一。", + "concepts": [ + "load average vs CPU 使用率", + "top/ps 的阅读方式", + "定位热进程与线程", + "短期缓解 vs 根因修复" + ], + "command": "top / ps / kill", + "examples": [ + "top", + "ps aux --sort=-%cpu | head", + "kill -TERM <PID>" + ], + "pitfalls": [ + "只看 load average 不看 CPU 核数和 I/O 情况", + "一上来 kill -9 导致数据损坏" + ], + "scenarios": [ + "接口延迟上升但网络正常", + "机器风扇狂转、CPU 使用率长期 100%" + ], + "troubleshooting_flow": [ + "先确认现象:top 看整体 CPU 与 load average", + "定位元凶:按 CPU 排序找到最热进程/线程", + "确认影响:是否是业务进程、是否可重启或降级", + "短期缓解:优先温和信号或限流/重启", + "根因修复:回到日志/发布变更/代码热点" + ], + "related_commands": [ + "top", + "ps", + "kill", + "pkill", + "journalctl" + ], + "classic_view": "教材视角:CPU 排障的关键是把“现象→进程→原因”串成链路,而不是看到 100% 就盲目重启。", + "takeaways": [ + 
"形成分层排障顺序,而不是遇到问题就随手试命令。", + "CPU 异常优先定位热进程,再决定缓解手段。" + ], + "after_class": "课后建议:模拟一个死循环进程(或阅读示例输出),练习从 top/ps 定位到 PID,再思考温和退出与强制退出的差别。", + "exercises": [ + { + "id": "m11_l1_e1", + "type": "operation", + "title": "第一步:查看整体 CPU/负载", + "hint": "top", + "success_test": "cmd == 'top' and ('load average' in output or 'Tasks' in output)", + "solution": [ + "top" + ], + "success_msg": "✅ 看到了整体态势,继续定位热进程。" + }, + { + "id": "m11_l1_e2", + "type": "operation", + "title": "第二步:定位最吃 CPU 的进程(示例)", + "hint": "ps aux --sort=-%cpu | head", + "success_test": "'%CPU' in output or 'python' in output or 'java' in output or 'nginx' in output", + "solution": [ + "ps aux --sort=-%cpu | head" + ], + "success_msg": "✅ 已定位热进程,下一步考虑缓解措施。" + } + ] + }, + { + "id": "m11_l2_network_down", + "title": "场景:网络不通排查", + "goal": "建立网络不通的分层排障:IP/链路 → DNS → 端口 → 请求。", + "why_it_matters": "网络问题最容易“混层”,正确顺序能显著缩短定位时间。", + "concepts": [ + "链路层/地址层/名称解析", + "端口监听 vs 连通性", + "用 curl 验证应用层" + ], + "command": "ip / ping / dig / ss / curl", + "examples": [ + "ip addr", + "ping -c 4 127.0.0.1", + "dig example.com", + "ss -ltnp | grep 80", + "curl -I http://127.0.0.1" + ], + "pitfalls": [ + "把 DNS 失败当成网络彻底不通", + "只看端口 LISTEN 不发请求验证" + ], + "scenarios": [ + "域名访问失败但 IP 可通", + "本机服务正常但外部访问失败" + ], + "troubleshooting_flow": [ + "先看本机地址:ip addr 是否有正确 IP", + "再看基础连通:ping 网关/目标 IP", + "再看 DNS:dig 域名解析是否正确", + "再看端口:ss/netstat 是否监听", + "最后发请求:curl 验证应用层" + ], + "related_commands": [ + "ip", + "ping", + "dig", + "ss", + "curl" + ], + "classic_view": "教材视角:网络排障要分层,一层层排除,不要上来就改防火墙或重启。", + "takeaways": [ + "形成分层排障顺序,而不是遇到问题就随手试命令。", + "先确认地址与连通性,再看 DNS/端口/请求。" + ], + "after_class": "课后建议:分别模拟“DNS 错”“端口未监听”“服务返回异常”三种情况,练习用同一套顺序识别差异。", + "exercises": [ + { + "id": "m11_l2_e1", + "type": "operation", + "title": "第一步:确认地址信息", + "hint": "ip addr", + "success_test": "cmd.startswith('ip') and 'inet' in output", + "solution": [ + "ip addr" + ], + "success_msg": "✅ 地址信息正常,继续检查连通性。" + }, + { + "id": "m11_l2_e2", + "type": 
"operation", + "title": "第二步:确认基础连通(本机)", + "hint": "ping 127.0.0.1", + "success_test": "cmd.startswith('ping') and 'packet loss' in output", + "solution": [ + "ping 127.0.0.1" + ], + "success_msg": "✅ 基础连通性 OK,继续检查 DNS/端口。" + }, + { + "id": "m11_l2_e3", + "type": "operation", + "title": "第三步:确认 DNS 解析", + "hint": "dig example.com", + "success_test": "cmd.startswith('dig') and 'ANSWER SECTION' in output", + "solution": [ + "dig example.com" + ], + "success_msg": "✅ DNS 解析结果已拿到,继续端口与请求验证。" + } + ] + } + ] } ] } \ No newline at end of file diff --git a/sandbox.py b/sandbox.py index 2f8eade..4a388a8 100644 --- a/sandbox.py +++ b/sandbox.py @@ -820,9 +820,15 @@ class LinuxSandbox: if name == "cat": stdin = self._simulate_cat(a) elif name in {"head", "tail"}: - # allow head/tail on stdin (very minimal): if last output exists + # allow head/tail on stdin (minimal) if stdin: - tmp = "\n".join(stdin.splitlines()[:10]) if name == "head" else "\n".join(stdin.splitlines()[-10:]) + n = 10 + if "-n" in a: + try: + n = int(a[a.index("-n") + 1]) + except Exception: + n = 10 + tmp = "\n".join(stdin.splitlines()[:n]) if name == "head" else "\n".join(stdin.splitlines()[-n:]) stdin = tmp else: stdin = self._simulate_head_tail(name, a) @@ -830,6 +836,8 @@ class LinuxSandbox: stdin = self._simulate_system_text("ss", a) elif name == "netstat": stdin = self._simulate_system_text("netstat", a) + elif name == "ps": + stdin = "USER PID %CPU %MEM COMMAND\nroot 1042 12.3 0.6 nginx\nsandbox 2233 95.0 1.2 python3\nroot 102 0.1 0.2 sshd" # filters/transforms elif name == "grep": stdin = self._simulate_grep_stdin(a, stdin) @@ -897,6 +905,9 @@ class LinuxSandbox: output = self.user elif cmd_name == "history": output = self._simulate_history(args) + elif cmd_name == "ps": + # minimal but useful for CPU incident lessons + output = "USER PID %CPU %MEM COMMAND\nroot 1042 12.3 0.6 nginx\nsandbox 2233 95.0 1.2 python3\nroot 102 0.1 0.2 sshd" elif cmd_name in {"systemctl", "service", "journalctl", 
"dig"}: output = self._simulate_system_text(cmd_name, args) else: