feat: deepen ops modules and enhance sandbox coverage

This commit is contained in:
likingcode
2026-03-10 09:23:23 +08:00
parent 99a948953b
commit f3d5f92478
2 changed files with 146 additions and 48 deletions

View File

@@ -860,15 +860,21 @@
"examples": [ "examples": [
"systemctl status nginx", "systemctl status nginx",
"systemctl restart nginx", "systemctl restart nginx",
"systemctl enable nginx" "systemctl enable nginx",
"systemctl status app.service",
"systemctl restart app.service",
"systemctl is-enabled nginx"
], ],
"pitfalls": [ "pitfalls": [
"改完配置却忘记重启服务", "改完配置却忘记重启服务",
"只看页面,不看服务状态" "只看页面,不看服务状态",
"把 restart 当成排障终点,而不是排查起点"
], ],
"scenarios": [ "scenarios": [
"排查服务没起来", "排查服务没起来",
"改配置后让服务生效" "改配置后让服务生效",
"配置变更后重新加载服务",
"确认服务是否开机自启"
], ],
"exercises": [ "exercises": [
{ {
@@ -897,7 +903,8 @@
"takeaways": [ "takeaways": [
"学完后应能做到:理解 Linux 服务的查看、启动、停止和重启。", "学完后应能做到:理解 Linux 服务的查看、启动、停止和重启。",
"易错提醒:改完配置却忘记重启服务", "易错提醒:改完配置却忘记重启服务",
"迁移场景:排查服务没起来" "迁移场景:排查服务没起来",
"服务问题先看状态,再决定下一步看日志、端口还是配置。"
], ],
"after_class": "课后建议:回到真实或模拟环境里,再用 systemctl 做一次独立练习,并尝试自己解释每条输出的含义。" "after_class": "课后建议:回到真实或模拟环境里,再用 systemctl 做一次独立练习,并尝试自己解释每条输出的含义。"
}, },
@@ -915,15 +922,21 @@
"examples": [ "examples": [
"journalctl -u nginx -n 50", "journalctl -u nginx -n 50",
"journalctl -xe", "journalctl -xe",
"journalctl -u app -f" "journalctl -u app -f",
"journalctl -u app.service -n 100",
"journalctl -u nginx -f",
"journalctl --since today"
], ],
"pitfalls": [ "pitfalls": [
"只看应用日志,不看 systemd 日志", "只看应用日志,不看 systemd 日志",
"看太多日志却抓不到最近错误" "看太多日志却抓不到最近错误",
"不限定服务名导致日志范围过大,难以定位"
], ],
"scenarios": [ "scenarios": [
"查看服务启动失败原因", "查看服务启动失败原因",
"查看最近报错" "查看最近报错",
"定位服务启动失败的关键报错",
"观察重启前后日志变化"
], ],
"exercises": [ "exercises": [
{ {
@@ -952,7 +965,8 @@
"takeaways": [ "takeaways": [
"学完后应能做到:理解如何查看服务日志和系统日志。", "学完后应能做到:理解如何查看服务日志和系统日志。",
"易错提醒:只看应用日志,不看 systemd 日志", "易错提醒:只看应用日志,不看 systemd 日志",
"迁移场景:查看服务启动失败原因" "迁移场景:查看服务启动失败原因",
"日志不是越多越好,关键是缩小范围看最近、看目标服务。"
], ],
"after_class": "课后建议:回到真实或模拟环境里,再用 journalctl 做一次独立练习,并尝试自己解释每条输出的含义。" "after_class": "课后建议:回到真实或模拟环境里,再用 journalctl 做一次独立练习,并尝试自己解释每条输出的含义。"
}, },
@@ -970,15 +984,21 @@
"examples": [ "examples": [
"kill 1234", "kill 1234",
"pkill nginx", "pkill nginx",
"nohup python3 app.py &" "nohup python3 app.py &",
"kill -9 1234",
"pkill -f python",
"nohup bash backup.sh &"
], ],
"pitfalls": [ "pitfalls": [
"直接粗暴 kill 掉关键进程", "直接粗暴 kill 掉关键进程",
"不知道后台任务输出去哪了" "不知道后台任务输出去哪了",
"不了解信号差异就直接使用 -9"
], ],
"scenarios": [ "scenarios": [
"结束卡死进程", "结束卡死进程",
"让脚本后台运行" "让脚本后台运行",
"结束僵死任务",
"让临时脚本脱离终端继续执行"
], ],
"exercises": [ "exercises": [
{ {
@@ -1009,7 +1029,8 @@
"takeaways": [ "takeaways": [
"学完后应能做到:理解如何控制进程和让任务脱离终端运行。", "学完后应能做到:理解如何控制进程和让任务脱离终端运行。",
"易错提醒:直接粗暴 kill 掉关键进程", "易错提醒:直接粗暴 kill 掉关键进程",
"迁移场景:结束卡死进程" "迁移场景:结束卡死进程",
"进程控制的重点是知道为什么结束、结束谁、结束后系统会怎样。"
], ],
"after_class": "课后建议:回到真实或模拟环境里,再用 kill、pkill、nohup 做一次独立练习,并尝试自己解释每条输出的含义。" "after_class": "课后建议:回到真实或模拟环境里,再用 kill、pkill、nohup 做一次独立练习,并尝试自己解释每条输出的含义。"
} }
@@ -1034,15 +1055,21 @@
"examples": [ "examples": [
"ip addr", "ip addr",
"ifconfig", "ifconfig",
"ping 127.0.0.1" "ping 127.0.0.1",
"ip addr show eth0",
"ping 192.168.1.1",
"ping -c 4 example.com"
], ],
"pitfalls": [ "pitfalls": [
"能 ping 通就以为服务一定可用", "能 ping 通就以为服务一定可用",
"只会看 IP不理解监听端口" "只会看 IP不理解监听端口",
"把 DNS 解析失败误判成网络完全不通"
], ],
"scenarios": [ "scenarios": [
"确认机器是否有正确 IP", "确认机器是否有正确 IP",
"测试目标是否能连通" "测试目标是否能连通",
"确认目标机器有无 IP",
"初步判断网络层是否通"
], ],
"exercises": [ "exercises": [
{ {
@@ -1083,7 +1110,8 @@
"takeaways": [ "takeaways": [
"学完后应能做到理解网卡、IP 和连通性的基本概念。", "学完后应能做到理解网卡、IP 和连通性的基本概念。",
"易错提醒:能 ping 通就以为服务一定可用", "易错提醒:能 ping 通就以为服务一定可用",
"迁移场景:确认机器是否有正确 IP" "迁移场景:确认机器是否有正确 IP",
"网络排查第一步是先确认链路和地址,再看更上层。"
], ],
"after_class": "课后建议:回到真实或模拟环境里,再用 ip addr、ifconfig、ping 做一次独立练习,并尝试自己解释每条输出的含义。" "after_class": "课后建议:回到真实或模拟环境里,再用 ip addr、ifconfig、ping 做一次独立练习,并尝试自己解释每条输出的含义。"
}, },
@@ -1101,15 +1129,21 @@
"examples": [ "examples": [
"ss -ltnp", "ss -ltnp",
"netstat -tunlp", "netstat -tunlp",
"curl http://127.0.0.1:8080/health" "curl http://127.0.0.1:8080/health",
"ss -ltnp | grep 80",
"curl -I http://127.0.0.1",
"wget http://127.0.0.1"
], ],
"pitfalls": [ "pitfalls": [
"只看页面打不开,不查监听", "只看页面打不开,不查监听",
"只看监听,不测实际请求" "只看监听,不测实际请求",
"只看 LISTEN 不看实际响应码和返回体"
], ],
"scenarios": [ "scenarios": [
"查服务是否监听端口", "查服务是否监听端口",
"测试接口是否返回 200" "测试接口是否返回 200",
"确认 Web 服务是否监听 80 端口",
"确认 HTTP 健康检查是否正常"
], ],
"exercises": [ "exercises": [
{ {
@@ -1151,7 +1185,8 @@
"takeaways": [ "takeaways": [
"学完后应能做到:建立监听端口和服务请求验证的能力。", "学完后应能做到:建立监听端口和服务请求验证的能力。",
"易错提醒:只看页面打不开,不查监听", "易错提醒:只看页面打不开,不查监听",
"迁移场景:查服务是否监听端口" "迁移场景:查服务是否监听端口",
"监听正常不代表业务正常,请求失败也不一定是服务没启动。"
], ],
"after_class": "课后建议:回到真实或模拟环境里,再用 ss、netstat、curl、wget 做一次独立练习,并尝试自己解释每条输出的含义。" "after_class": "课后建议:回到真实或模拟环境里,再用 ss、netstat、curl、wget 做一次独立练习,并尝试自己解释每条输出的含义。"
}, },
@@ -1169,15 +1204,21 @@
"examples": [ "examples": [
"traceroute 8.8.8.8", "traceroute 8.8.8.8",
"dig example.com", "dig example.com",
"which curl" "which curl",
"whereis nginx",
"dig api.example.com",
"traceroute example.com"
], ],
"pitfalls": [ "pitfalls": [
"把 DNS 问题误判成应用问题", "把 DNS 问题误判成应用问题",
"不知道命令来自哪里" "不知道命令来自哪里",
"忽略 DNS TTL 和缓存带来的影响"
], ],
"scenarios": [ "scenarios": [
"排查域名异常", "排查域名异常",
"确认命令路径和来源" "确认命令路径和来源",
"排查域名切换未生效",
"确认命令实际安装位置"
], ],
"exercises": [ "exercises": [
{ {
@@ -1214,7 +1255,8 @@
"takeaways": [ "takeaways": [
"学完后应能做到:建立链路定位和名称解析基础认知。", "学完后应能做到:建立链路定位和名称解析基础认知。",
"易错提醒:把 DNS 问题误判成应用问题", "易错提醒:把 DNS 问题误判成应用问题",
"迁移场景:排查域名异常" "迁移场景:排查域名异常",
"命令定位、解析路径和网络链路,都是“看不见的问题”的排查入口。"
], ],
"after_class": "课后建议:回到真实或模拟环境里,再用 traceroute、dig、which、whereis 做一次独立练习,并尝试自己解释每条输出的含义。" "after_class": "课后建议:回到真实或模拟环境里,再用 traceroute、dig、which、whereis 做一次独立练习,并尝试自己解释每条输出的含义。"
} }
@@ -1239,15 +1281,21 @@
"examples": [ "examples": [
"whoami", "whoami",
"id", "id",
"passwd" "passwd",
"id sandbox_user",
"su - root",
"passwd sandbox_user"
], ],
"pitfalls": [ "pitfalls": [
"不知道自己当前权限边界", "不知道自己当前权限边界",
"以为所有命令都能执行" "以为所有命令都能执行",
"不知道服务身份和登录用户身份可能不同"
], ],
"scenarios": [ "scenarios": [
"确认当前身份", "确认当前身份",
"看自己属于哪些组" "看自己属于哪些组",
"确认程序以什么身份运行",
"确认某用户是否属于目标组"
], ],
"exercises": [ "exercises": [
{ {
@@ -1289,7 +1337,8 @@
"takeaways": [ "takeaways": [
"学完后应能做到:理解当前身份、用户组和密码变更的意义。", "学完后应能做到:理解当前身份、用户组和密码变更的意义。",
"易错提醒:不知道自己当前权限边界", "易错提醒:不知道自己当前权限边界",
"迁移场景:确认当前身份" "迁移场景:确认当前身份",
"身份问题常常决定你能看什么、改什么、执行什么。"
], ],
"after_class": "课后建议:回到真实或模拟环境里,再用 whoami、id、passwd、su 做一次独立练习,并尝试自己解释每条输出的含义。" "after_class": "课后建议:回到真实或模拟环境里,再用 whoami、id、passwd、su 做一次独立练习,并尝试自己解释每条输出的含义。"
}, },
@@ -1307,15 +1356,21 @@
"examples": [ "examples": [
"chmod 644 file.txt", "chmod 644 file.txt",
"chmod +x run.sh", "chmod +x run.sh",
"chown app:app app.log" "chown app:app app.log",
"chmod 644 app.conf",
"chown app:app /var/log/app.log",
"chgrp deploy script.sh"
], ],
"pitfalls": [ "pitfalls": [
"图省事直接给 777", "图省事直接给 777",
"不了解属组导致协作混乱" "不了解属组导致协作混乱",
"只改 chmod不看属主属组"
], ],
"scenarios": [ "scenarios": [
"修脚本执行权限", "修脚本执行权限",
"调整日志文件归属" "调整日志文件归属",
"修复配置文件权限",
"调整日志文件归属方便服务写入"
], ],
"exercises": [ "exercises": [
{ {
@@ -1351,7 +1406,8 @@
"takeaways": [ "takeaways": [
"学完后应能做到:理解 Linux 权限控制的基本模型和常见修改动作。", "学完后应能做到:理解 Linux 权限控制的基本模型和常见修改动作。",
"易错提醒:图省事直接给 777", "易错提醒:图省事直接给 777",
"迁移场景:修脚本执行权限" "迁移场景:修脚本执行权限",
"权限排障常常不是只看一个数字,而是同时看权限、属主、属组和执行身份。"
], ],
"after_class": "课后建议:回到真实或模拟环境里,再用 chmod、chown、chgrp 做一次独立练习,并尝试自己解释每条输出的含义。" "after_class": "课后建议:回到真实或模拟环境里,再用 chmod、chown、chgrp 做一次独立练习,并尝试自己解释每条输出的含义。"
}, },
@@ -1368,15 +1424,19 @@
"command": "sudo / rm -rf / 安全习惯", "command": "sudo / rm -rf / 安全习惯",
"examples": [ "examples": [
"sudo systemctl restart nginx", "sudo systemctl restart nginx",
"rm -rf /tmp/testdir" "rm -rf /tmp/testdir",
"rm -rf /tmp/old-release"
], ],
"pitfalls": [ "pitfalls": [
"把 sudo 当默认前缀", "把 sudo 当默认前缀",
"不确认路径就执行递归删除" "不确认路径就执行递归删除",
"高权限操作前不做确认和备份"
], ],
"scenarios": [ "scenarios": [
"高权限改系统配置", "高权限改系统配置",
"清理目录前先确认路径" "清理目录前先确认路径",
"修改系统级配置前先评估影响",
"清理目录前先校验路径"
], ],
"exercises": [ "exercises": [
{ {
@@ -1407,7 +1467,8 @@
"takeaways": [ "takeaways": [
"学完后应能做到:建立运维中“能做”不等于“该做”的安全意识。", "学完后应能做到:建立运维中“能做”不等于“该做”的安全意识。",
"易错提醒:把 sudo 当默认前缀", "易错提醒:把 sudo 当默认前缀",
"迁移场景:高权限改系统配置" "迁移场景:高权限改系统配置",
"高风险命令需要形成“先确认、后执行、再验证”的习惯。"
], ],
"after_class": "课后建议:回到真实或模拟环境里,再用 sudo、rm -rf、安全习惯 做一次独立练习,并尝试自己解释每条输出的含义。" "after_class": "课后建议:回到真实或模拟环境里,再用 sudo、rm -rf、安全习惯 做一次独立练习,并尝试自己解释每条输出的含义。"
} }
@@ -1795,14 +1856,20 @@
"examples": [ "examples": [
"systemctl status nginx", "systemctl status nginx",
"ss -ltnp", "ss -ltnp",
"curl http://127.0.0.1:8080/health" "curl http://127.0.0.1:8080/health",
"systemctl status nginx && ss -ltnp | grep 80",
"journalctl -u nginx -n 50",
"curl -I http://127.0.0.1"
], ],
"pitfalls": [ "pitfalls": [
"只看浏览器打不开,不看服务状态", "只看浏览器打不开,不看服务状态",
"没有层次地乱查" "没有层次地乱查",
"没有层次感地同时改服务、改配置、重启,导致问题更难定位"
], ],
"scenarios": [ "scenarios": [
"应用服务无法访问" "应用服务无法访问",
"线上服务返回 502/504",
"站点页面打不开但机器正常"
], ],
"exercises": [ "exercises": [
{ {
@@ -1835,7 +1902,8 @@
"takeaways": [ "takeaways": [
"学完后应能做到:建立“先服务、再端口、再日志、再请求”的排查顺序。", "学完后应能做到:建立“先服务、再端口、再日志、再请求”的排查顺序。",
"易错提醒:只看浏览器打不开,不看服务状态", "易错提醒:只看浏览器打不开,不看服务状态",
"迁移场景:应用服务无法访问" "迁移场景:应用服务无法访问",
"服务不可用时,排障要按层进行:服务 → 进程 → 端口 → 日志 → 请求。"
], ],
"after_class": "课后建议:回到真实或模拟环境里,再用 systemctl、ps、ss、journalctl、curl 做一次独立练习,并尝试自己解释每条输出的含义。" "after_class": "课后建议:回到真实或模拟环境里,再用 systemctl、ps、ss、journalctl、curl 做一次独立练习,并尝试自己解释每条输出的含义。"
}, },
@@ -1853,14 +1921,19 @@
"examples": [ "examples": [
"df -h", "df -h",
"du -sh /var/log", "du -sh /var/log",
"find /var/log -type f" "find /var/log -type f",
"df -h /var",
"du -sh /var/log/* | sort"
], ],
"pitfalls": [ "pitfalls": [
"只看 df 不继续追目录", "只看 df 不继续追目录",
"删文件前不确认用途" "删文件前不确认用途",
"直接删除不熟悉的大文件,可能破坏恢复和排障线索"
], ],
"scenarios": [ "scenarios": [
"排查磁盘 100%" "排查磁盘 100%",
"日志目录暴涨导致磁盘满",
"发布产物堆积导致空间不足"
], ],
"exercises": [ "exercises": [
{ {
@@ -1892,7 +1965,8 @@
"takeaways": [ "takeaways": [
"学完后应能做到:建立从 df 到 du 再到 find 的磁盘问题定位思路。", "学完后应能做到:建立从 df 到 du 再到 find 的磁盘问题定位思路。",
"易错提醒:只看 df 不继续追目录", "易错提醒:只看 df 不继续追目录",
"迁移场景:排查磁盘 100%" "迁移场景:排查磁盘 100%",
"磁盘排查的关键是先找文件系统,再找目录,再找大文件。"
], ],
"after_class": "课后建议:回到真实或模拟环境里,再用 df、du、find、sort 做一次独立练习,并尝试自己解释每条输出的含义。" "after_class": "课后建议:回到真实或模拟环境里,再用 df、du、find、sort 做一次独立练习,并尝试自己解释每条输出的含义。"
}, },
@@ -1910,15 +1984,19 @@
"examples": [ "examples": [
"whoami", "whoami",
"id", "id",
"tail -n 20 /var/log/auth.log" "tail -n 20 /var/log/auth.log",
"grep sandbox_user /etc/passwd",
"id sandbox_user"
], ],
"pitfalls": [ "pitfalls": [
"只怀疑密码错误,不看日志", "只怀疑密码错误,不看日志",
"忽略组权限问题" "忽略组权限问题",
"只盯着密码,不看账号状态和权限配置"
], ],
"scenarios": [ "scenarios": [
"SSH 登录失败", "SSH 登录失败",
"执行权限不足" "执行权限不足",
"脚本执行提示无权限"
], ],
"exercises": [ "exercises": [
{ {
@@ -1951,7 +2029,8 @@
"takeaways": [ "takeaways": [
"学完后应能做到:把身份、权限、日志三者串起来理解。", "学完后应能做到:把身份、权限、日志三者串起来理解。",
"易错提醒:只怀疑密码错误,不看日志", "易错提醒:只怀疑密码错误,不看日志",
"迁移场景SSH 登录失败" "迁移场景SSH 登录失败",
"登录失败排查要把身份、日志和权限一起看,不能只猜密码。"
], ],
"after_class": "课后建议:回到真实或模拟环境里,再用 whoami、id、passwd、grep、tail 做一次独立练习,并尝试自己解释每条输出的含义。" "after_class": "课后建议:回到真实或模拟环境里,再用 whoami、id、passwd、grep、tail 做一次独立练习,并尝试自己解释每条输出的含义。"
} }

View File

@@ -560,6 +560,9 @@ class LinuxSandbox:
"kill": "signal sent (simulated)", "kill": "signal sent (simulated)",
"pkill": "matched processes terminated (simulated)", "pkill": "matched processes terminated (simulated)",
"nohup": "appending output to nohup.out", "nohup": "appending output to nohup.out",
"systemctl": "● nginx.service - A high performance web server\n Loaded: loaded (/usr/lib/systemd/system/nginx.service; enabled)\n Active: active (running) since Mon 2026-03-06 10:00:00 CST; 5 days ago\n Main PID: 1042 (nginx)\n Tasks: 3\n Memory: 12.5M",
"service": "nginx is running",
"journalctl": "Mar 06 10:00:00 systemd[1]: Started nginx.service\nMar 06 10:00:01 nginx[1042]: worker process started\nMar 06 10:04:12 nginx[1042]: connect() failed (111: Connection refused)",
"ifconfig": "eth0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 1500\n inet 192.168.1.20 netmask 255.255.255.0\nlo: flags=73<UP,LOOPBACK,RUNNING> mtu 65536\n inet 127.0.0.1 netmask 255.0.0.0", "ifconfig": "eth0: flags=4163<UP,BROADCAST,RUNNING,MULTICAST> mtu 1500\n inet 192.168.1.20 netmask 255.255.255.0\nlo: flags=73<UP,LOOPBACK,RUNNING> mtu 65536\n inet 127.0.0.1 netmask 255.0.0.0",
"ip": "1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536\n inet 127.0.0.1/8 scope host lo\n2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500\n inet 192.168.1.20/24 scope global eth0", "ip": "1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536\n inet 127.0.0.1/8 scope host lo\n2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500\n inet 192.168.1.20/24 scope global eth0",
"ping": "PING 127.0.0.1 (127.0.0.1): 56 data bytes\n64 bytes from 127.0.0.1: icmp_seq=0 ttl=64 time=0.025 ms\n--- 127.0.0.1 ping statistics ---\n4 packets transmitted, 4 received, 0% packet loss", "ping": "PING 127.0.0.1 (127.0.0.1): 56 data bytes\n64 bytes from 127.0.0.1: icmp_seq=0 ttl=64 time=0.025 ms\n--- 127.0.0.1 ping statistics ---\n4 packets transmitted, 4 received, 0% packet loss",
@@ -589,12 +592,26 @@ class LinuxSandbox:
} }
if cmd_name == "ip" and args[:1] == ["addr"]: if cmd_name == "ip" and args[:1] == ["addr"]:
return canned["ip"] return canned["ip"]
if cmd_name == "systemctl":
if args[:1] == ["status"]:
target = args[1] if len(args) > 1 else 'nginx'
return canned["systemctl"].replace('nginx', target)
if args[:1] in (["restart"], ["start"], ["stop"], ["reload"], ["enable"]):
target = args[1] if len(args) > 1 else 'service'
return f"{args[0]} operation completed for {target} (simulated)"
return canned["systemctl"]
if cmd_name == "service":
return canned["service"]
if cmd_name == "journalctl":
return canned["journalctl"]
if cmd_name == "tail" or cmd_name == "head": if cmd_name == "tail" or cmd_name == "head":
return self._simulate_head_tail(cmd_name, args) return self._simulate_head_tail(cmd_name, args)
if cmd_name == "which": if cmd_name == "which":
return canned["which"] return canned["which"]
if cmd_name == "whereis": if cmd_name == "whereis":
return canned["whereis"] return canned["whereis"]
if cmd_name == "dig":
return canned.get("dig", ";; ANSWER SECTION:\nexample.com. 300 IN A 93.184.216.34")
if cmd_name == "export" and args and "=" in args[0]: if cmd_name == "export" and args and "=" in args[0]:
key, value = args[0].split("=", 1) key, value = args[0].split("=", 1)
self.env[key] = value self.env[key] = value
@@ -681,6 +698,8 @@ class LinuxSandbox:
output = self.user output = self.user
elif cmd_name == "history": elif cmd_name == "history":
output = self._simulate_history(args) output = self._simulate_history(args)
elif cmd_name in {"systemctl", "service", "journalctl", "dig"}:
output = self._simulate_system_text(cmd_name, args)
else: else:
output = self._simulate_system_text(cmd_name, args) output = self._simulate_system_text(cmd_name, args)
except Exception as e: except Exception as e: