feat: add advanced incidents module (cpu high, network down)

2026-03-10 17:17:51 +08:00
parent 2c407efbda
commit a07409af64
2 changed files with 176 additions and 6 deletions
--- a/COURSE_TASKS.json
+++ b/COURSE_TASKS.json
@@ -1,13 +1,13 @@
 {
  "meta": {
-    "version": "4.2",
+    "version": "4.3",
    "title": "Linux 系统学习课程（运维全场景版）",
    "author": "OpenClaw Dev",
    "updated": "2026-03-10",
    "description": "强调知识理解、场景迁移与运维全场景覆盖的 Linux 学习课程",
-    "module_count": 10,
+    "module_count": 11,
-    "total_lessons": 30,
+    "total_lessons": 32,
-    "total_exercises": 90,
+    "total_exercises": 95,
    "pedagogy": "learning-first",
    "orientation": "ops-full-scenarios",
    "source_style": "classic-linux-textbook-inspired"
@@ -2231,6 +2231,165 @@
          ]
        }
      ]
    },
    {
      "id": "module_11_incidents2",
      "title": "模块 11：综合事故专题（进阶）",
      "summary": "继续用场景驱动的方式训练 CPU 异常、网络不通等更高频事故的排障顺序。",
      "lessons": [
        {
          "id": "m11_l1_cpu_high",
          "title": "场景：CPU 飙高排查",
          "goal": "建立 CPU 异常排查顺序：先确认负载与进程，再定位原因与缓解措施。",
          "why_it_matters": "CPU 异常会直接影响延迟与吞吐，是最常见的线上事故信号之一。",
          "concepts": [
            "load average vs CPU 使用率",
            "top/ps 的阅读方式",
            "定位热进程与线程",
            "短期缓解 vs 根因修复"
          ],
          "command": "top / ps / kill",
          "examples": [
            "top",
            "ps aux --sort=-%cpu | head",
            "kill -TERM <pid>"
          ],
          "pitfalls": [
            "只看 load average 不看 CPU 核数和 I/O 情况",
            "一上来 kill -9 导致数据损坏"
          ],
          "scenarios": [
            "接口延迟上升但网络正常",
            "机器风扇狂转、CPU 使用率长期 100%"
          ],
          "troubleshooting_flow": [
            "先确认现象：top 看整体 CPU 与 load average",
            "定位元凶：按 CPU 排序找到最热进程/线程",
            "确认影响：是否是业务进程、是否可重启或降级",
            "短期缓解：优先温和信号或限流/重启",
            "根因修复：回到日志/发布变更/代码热点"
          ],
          "related_commands": [
            "top",
            "ps",
            "kill",
            "pkill",
            "journalctl"
          ],
          "classic_view": "教材视角：CPU 排障的关键是把“现象→进程→原因”串成链路，而不是看到 100% 就盲目重启。",
          "takeaways": [
            "形成分层排障顺序，而不是遇到问题就随手试命令。",
            "CPU 异常优先定位热进程，再决定缓解手段。"
          ],
          "after_class": "课后建议：模拟一个死循环进程（或阅读示例输出），练习从 top/ps 定位到 PID，再思考温和退出与强制退出的差别。",
          "exercises": [
            {
              "id": "m11_l1_e1",
              "type": "operation",
              "title": "第一步：查看整体 CPU/负载",
              "hint": "top",
              "success_test": "cmd == 'top' and ('load average' in output or 'Tasks' in output)",
              "solution": [
                "top"
              ],
              "success_msg": "✅ 看到了整体态势，继续定位热进程。"
            },
            {
              "id": "m11_l1_e2",
              "type": "operation",
              "title": "第二步：定位最吃 CPU 的进程（示例）",
              "hint": "ps aux --sort=-%cpu | head",
              "success_test": "'%CPU' in output or 'python' in output or 'java' in output or 'nginx' in output",
              "solution": [
                "ps aux --sort=-%cpu | head"
              ],
              "success_msg": "✅ 已定位热进程，下一步考虑缓解措施。"
            }
          ]
        },
        {
          "id": "m11_l2_network_down",
          "title": "场景：网络不通排查",
          "goal": "建立网络不通的分层排障：IP/链路 → DNS → 端口 → 请求。",
          "why_it_matters": "网络问题最容易“混层”，正确顺序能显著缩短定位时间。",
          "concepts": [
            "链路层/地址层/名称解析",
            "端口监听 vs 连通性",
            "用 curl 验证应用层"
          ],
          "command": "ip / ping / dig / ss / curl",
          "examples": [
            "ip addr",
            "ping -c 4 127.0.0.1",
            "dig example.com",
            "ss -ltnp | grep 80",
            "curl -I http://127.0.0.1"
          ],
          "pitfalls": [
            "把 DNS 失败当成网络彻底不通",
            "只看端口 LISTEN 不发请求验证"
          ],
          "scenarios": [
            "域名访问失败但 IP 可通",
            "本机服务正常但外部访问失败"
          ],
          "troubleshooting_flow": [
            "先看本机地址：ip addr 是否有正确 IP",
            "再看基础连通：ping 网关/目标 IP",
            "再看 DNS：dig 域名解析是否正确",
            "再看端口：ss/netstat 是否监听",
            "最后发请求：curl 验证应用层"
          ],
          "related_commands": [
            "ip",
            "ping",
            "dig",
            "ss",
            "curl"
          ],
          "classic_view": "教材视角：网络排障要分层，一层层排除，不要上来就改防火墙或重启。",
          "takeaways": [
            "形成分层排障顺序，而不是遇到问题就随手试命令。",
            "先确认地址与连通性，再看 DNS/端口/请求。"
          ],
          "after_class": "课后建议：分别模拟“DNS 错”“端口未监听”“服务返回异常”三种情况，练习用同一套顺序识别差异。",
          "exercises": [
            {
              "id": "m11_l2_e1",
              "type": "operation",
              "title": "第一步：确认地址信息",
              "hint": "ip addr",
              "success_test": "cmd.startswith('ip') and 'inet' in output",
              "solution": [
                "ip addr"
              ],
              "success_msg": "✅ 地址信息正常，继续检查连通性。"
            },
            {
              "id": "m11_l2_e2",
              "type": "operation",
              "title": "第二步：确认基础连通（本机）",
              "hint": "ping -c 4 127.0.0.1",
              "success_test": "cmd.startswith('ping') and 'packet loss' in output",
              "solution": [
                "ping -c 4 127.0.0.1"
              ],
              "success_msg": "✅ 基础连通性 OK，继续检查 DNS/端口。"
            },
            {
              "id": "m11_l2_e3",
              "type": "operation",
              "title": "第三步：确认 DNS 解析",
              "hint": "dig example.com",
              "success_test": "cmd.startswith('dig') and 'ANSWER SECTION' in output",
              "solution": [
                "dig example.com"
              ],
              "success_msg": "✅ DNS 解析结果已拿到，继续端口与请求验证。"
            }
          ]
        }
      ]
    }
  ]
 }
--- a/sandbox.py
+++ b/sandbox.py
@@ -820,9 +820,15 @@ class LinuxSandbox:
                if name == "cat":
                    stdin = self._simulate_cat(a)
                elif name in {"head", "tail"}:
-                    # allow head/tail on stdin (very minimal): if last output exists
+                    # allow head/tail on stdin (minimal)
                    if stdin:
-                        tmp = "\n".join(stdin.splitlines()[:10]) if name == "head" else "\n".join(stdin.splitlines()[-10:])
+                        n = 10
                        if "-n" in a:
                            try:
                                n = int(a[a.index("-n") + 1])
                            except Exception:
                                n = 10
                        tmp = "\n".join(stdin.splitlines()[:n]) if name == "head" else "\n".join(stdin.splitlines()[-n:])
                        stdin = tmp
                    else:
                        stdin = self._simulate_head_tail(name, a)
@@ -830,6 +836,8 @@ class LinuxSandbox:
                    stdin = self._simulate_system_text("ss", a)
                elif name == "netstat":
                    stdin = self._simulate_system_text("netstat", a)
                elif name == "ps":
                    stdin = "USER       PID %CPU %MEM COMMAND\nroot      1042  12.3  0.6 nginx\nsandbox   2233  95.0  1.2 python3\nroot       102   0.1  0.2 sshd"
                # filters/transforms
                elif name == "grep":
                    stdin = self._simulate_grep_stdin(a, stdin)
@@ -897,6 +905,9 @@ class LinuxSandbox:
                output = self.user
            elif cmd_name == "history":
                output = self._simulate_history(args)
            elif cmd_name == "ps":
                # minimal but useful for CPU incident lessons
                output = "USER       PID %CPU %MEM COMMAND\nroot      1042  12.3  0.6 nginx\nsandbox   2233  95.0  1.2 python3\nroot       102   0.1  0.2 sshd"
            elif cmd_name in {"systemctl", "service", "journalctl", "dig"}:
                output = self._simulate_system_text(cmd_name, args)
            else: