From a07409af645595f2863e3a4bb88e3365d1b97ac0 Mon Sep 17 00:00:00 2001
From: likingcode <likingcode@users.noreply.github.com>
Date: Tue, 10 Mar 2026 17:17:51 +0800
Subject: [PATCH] feat: add advanced incidents module (cpu high, network down)

---
 COURSE_TASKS.json | 167 ++++++++++++++++++++++++++++++++++++++++++++--
 sandbox.py        |  15 ++++-
 2 files changed, 176 insertions(+), 6 deletions(-)
diff --git a/COURSE_TASKS.json b/COURSE_TASKS.json
index b8fa951..e357325 100644
--- a/COURSE_TASKS.json
+++ b/COURSE_TASKS.json
@@ -1,13 +1,13 @@
 {
   "meta": {
-    "version": "4.2",
+    "version": "4.3",
     "title": "Linux 系统学习课程（运维全场景版）",
     "author": "OpenClaw Dev",
     "updated": "2026-03-10",
     "description": "强调知识理解、场景迁移与运维全场景覆盖的 Linux 学习课程",
-    "module_count": 10,
-    "total_lessons": 30,
-    "total_exercises": 90,
+    "module_count": 11,
+    "total_lessons": 32,
+    "total_exercises": 95,
     "pedagogy": "learning-first",
     "orientation": "ops-full-scenarios",
     "source_style": "classic-linux-textbook-inspired"
@@ -2231,6 +2231,165 @@
           ]
         }
       ]
+    },
+    {
+      "id": "module_11_incidents2",
+      "title": "模块 11：综合事故专题（进阶）",
+      "summary": "继续用场景驱动的方式训练 CPU 异常、网络不通等更高频事故的排障顺序。",
+      "lessons": [
+        {
+          "id": "m11_l1_cpu_high",
+          "title": "场景：CPU 飙高排查",
+          "goal": "建立 CPU 异常排查顺序：先确认负载与进程，再定位原因与缓解措施。",
+          "why_it_matters": "CPU 异常会直接影响延迟与吞吐，是最常见的线上事故信号之一。",
+          "concepts": [
+            "load average vs CPU 使用率",
+            "top/ps 的阅读方式",
+            "定位热进程与线程",
+            "短期缓解 vs 根因修复"
+          ],
+          "command": "top / ps / kill",
+          "examples": [
+            "top",
+            "ps aux --sort=-%cpu | head",
+            "kill -TERM <pid>"
+          ],
+          "pitfalls": [
+            "只看 load average 不看 CPU 核数和 I/O 情况",
+            "一上来 kill -9 导致数据损坏"
+          ],
+          "scenarios": [
+            "接口延迟上升但网络正常",
+            "机器风扇狂转、CPU 使用率长期 100%"
+          ],
+          "troubleshooting_flow": [
+            "先确认现象：top 看整体 CPU 与 load average",
+            "定位元凶：按 CPU 排序找到最热进程/线程",
+            "确认影响：是否是业务进程、是否可重启或降级",
+            "短期缓解：优先温和信号或限流/重启",
+            "根因修复：回到日志/发布变更/代码热点"
+          ],
+          "related_commands": [
+            "top",
+            "ps",
+            "kill",
+            "pkill",
+            "journalctl"
+          ],
+          "classic_view": "教材视角：CPU 排障的关键是把“现象→进程→原因”串成链路，而不是看到 100% 就盲目重启。",
+          "takeaways": [
+            "形成分层排障顺序，而不是遇到问题就随手试命令。",
+            "CPU 异常优先定位热进程，再决定缓解手段。"
+          ],
+          "after_class": "课后建议：模拟一个死循环进程（或阅读示例输出），练习从 top/ps 定位到 PID，再思考温和退出与强制退出的差别。",
+          "exercises": [
+            {
+              "id": "m11_l1_e1",
+              "type": "operation",
+              "title": "第一步：查看整体 CPU/负载",
+              "hint": "top",
+              "success_test": "cmd == 'top' and ('load average' in output or 'Tasks' in output)",
+              "solution": [
+                "top"
+              ],
+              "success_msg": "✅ 看到了整体态势，继续定位热进程。"
+            },
+            {
+              "id": "m11_l1_e2",
+              "type": "operation",
+              "title": "第二步：定位最吃 CPU 的进程（示例）",
+              "hint": "ps aux --sort=-%cpu | head",
+              "success_test": "cmd.startswith('ps') and ('%CPU' in output or 'python' in output or 'java' in output or 'nginx' in output)",
+              "solution": [
+                "ps aux --sort=-%cpu | head"
+              ],
+              "success_msg": "✅ 已定位热进程，下一步考虑缓解措施。"
+            }
+          ]
+        },
+        {
+          "id": "m11_l2_network_down",
+          "title": "场景：网络不通排查",
+          "goal": "建立网络不通的分层排障：IP/链路 → DNS → 端口 → 请求。",
+          "why_it_matters": "网络问题最容易“混层”，正确顺序能显著缩短定位时间。",
+          "concepts": [
+            "链路层/地址层/名称解析",
+            "端口监听 vs 连通性",
+            "用 curl 验证应用层"
+          ],
+          "command": "ip / ping / dig / ss / curl",
+          "examples": [
+            "ip addr",
+            "ping -c 4 127.0.0.1",
+            "dig example.com",
+            "ss -ltnp | grep 80",
+            "curl -I http://127.0.0.1"
+          ],
+          "pitfalls": [
+            "把 DNS 失败当成网络彻底不通",
+            "只看端口 LISTEN 不发请求验证"
+          ],
+          "scenarios": [
+            "域名访问失败但 IP 可通",
+            "本机服务正常但外部访问失败"
+          ],
+          "troubleshooting_flow": [
+            "先看本机地址：ip addr 是否有正确 IP",
+            "再看基础连通：ping 网关/目标 IP",
+            "再看 DNS：dig 域名解析是否正确",
+            "再看端口：ss/netstat 是否监听",
+            "最后发请求：curl 验证应用层"
+          ],
+          "related_commands": [
+            "ip",
+            "ping",
+            "dig",
+            "ss",
+            "curl"
+          ],
+          "classic_view": "教材视角：网络排障要分层，一层层排除，不要上来就改防火墙或重启。",
+          "takeaways": [
+            "形成分层排障顺序，而不是遇到问题就随手试命令。",
+            "先确认地址与连通性，再看 DNS/端口/请求。"
+          ],
+          "after_class": "课后建议：分别模拟“DNS 错”“端口未监听”“服务返回异常”三种情况，练习用同一套顺序识别差异。",
+          "exercises": [
+            {
+              "id": "m11_l2_e1",
+              "type": "operation",
+              "title": "第一步：确认地址信息",
+              "hint": "ip addr",
+              "success_test": "cmd.startswith('ip') and 'inet' in output",
+              "solution": [
+                "ip addr"
+              ],
+              "success_msg": "✅ 地址信息正常，继续检查连通性。"
+            },
+            {
+              "id": "m11_l2_e2",
+              "type": "operation",
+              "title": "第二步：确认基础连通（本机）",
+              "hint": "ping -c 4 127.0.0.1",
+              "success_test": "cmd.startswith('ping') and 'packet loss' in output",
+              "solution": [
+                "ping -c 4 127.0.0.1"
+              ],
+              "success_msg": "✅ 基础连通性 OK，继续检查 DNS/端口。"
+            },
+            {
+              "id": "m11_l2_e3",
+              "type": "operation",
+              "title": "第三步：确认 DNS 解析",
+              "hint": "dig example.com",
+              "success_test": "cmd.startswith('dig') and 'ANSWER SECTION' in output",
+              "solution": [
+                "dig example.com"
+              ],
+              "success_msg": "✅ DNS 解析结果已拿到，继续端口与请求验证。"
+            }
+          ]
+        }
+      ]
     }
   ]
 }
\ No newline at end of file
diff --git a/sandbox.py b/sandbox.py
index 2f8eade..4a388a8 100644
--- a/sandbox.py
+++ b/sandbox.py
@@ -820,9 +820,15 @@ class LinuxSandbox:
                 if name == "cat":
                     stdin = self._simulate_cat(a)
                 elif name in {"head", "tail"}:
-                    # allow head/tail on stdin (very minimal): if last output exists
+                    # allow head/tail on stdin (minimal): honors "-n N", otherwise falls back to 10 lines
                     if stdin:
-                        tmp = "\n".join(stdin.splitlines()[:10]) if name == "head" else "\n".join(stdin.splitlines()[-10:])
+                        n = 10
+                        if "-n" in a:
+                            try:
+                                n = int(a[a.index("-n") + 1])
+                            except Exception:
+                                n = 10
+                        tmp = "\n".join(stdin.splitlines()[:n]) if name == "head" else "\n".join(stdin.splitlines()[-n:])
                         stdin = tmp
                     else:
                         stdin = self._simulate_head_tail(name, a)
@@ -830,6 +836,8 @@ class LinuxSandbox:
                     stdin = self._simulate_system_text("ss", a)
                 elif name == "netstat":
                     stdin = self._simulate_system_text("netstat", a)
+                elif name == "ps":
+                    stdin = "USER       PID %CPU %MEM COMMAND\nroot      1042  12.3  0.6 nginx\nsandbox   2233  95.0  1.2 python3\nroot       102   0.1  0.2 sshd"
                 # filters/transforms
                 elif name == "grep":
                     stdin = self._simulate_grep_stdin(a, stdin)
@@ -897,6 +905,9 @@ class LinuxSandbox:
                 output = self.user
             elif cmd_name == "history":
                 output = self._simulate_history(args)
+            elif cmd_name == "ps":
+                # static sample process table (args are not consulted); minimal but enough for the CPU incident lessons
+                output = "USER       PID %CPU %MEM COMMAND\nroot      1042  12.3  0.6 nginx\nsandbox   2233  95.0  1.2 python3\nroot       102   0.1  0.2 sshd"
             elif cmd_name in {"systemctl", "service", "journalctl", "dig"}:
                 output = self._simulate_system_text(cmd_name, args)
             else: