{ "meta": { "version": "4.2", "title": "Linux 系统学习课程(运维全场景版)", "author": "OpenClaw Dev", "updated": "2026-03-10", "description": "强调知识理解、场景迁移与运维全场景覆盖的 Linux 学习课程", "module_count": 10, "total_lessons": 30, "total_exercises": 90, "pedagogy": "learning-first", "orientation": "ops-full-scenarios", "source_style": "classic-linux-textbook-inspired" }, "modules": [ { "id": "module_1_foundation", "title": "模块 1:建立 Linux 基本认知", "summary": "先理解终端、目录、路径和最基础命令,建立 Linux 使用的空间感。", "lessons": [ { "id": "m1_l1_pwd", "title": "认识当前目录:pwd", "goal": "理解当前工作目录的意义,知道自己在文件系统中的位置。", "why_it_matters": "很多 Linux 操作依赖路径。如果不知道自己当前在哪,后续命令容易出错。", "concepts": [ "当前工作目录", "绝对路径与相对路径", "为什么要先定位再操作" ], "command": "pwd", "examples": [ "pwd", "cd /tmp && pwd" ], "pitfalls": [ "以为终端默认总在同一个目录", "不分清当前目录和目标目录" ], "scenarios": [ "切目录后确认自己到了哪里", "写脚本前确认当前运行位置" ], "exercises": [ { "id": "m1_l1_e1", "type": "understanding", "question": "查看当前工作目录应该使用什么命令?", "answer": "pwd" }, { "id": "m1_l1_e2", "type": "operation", "title": "输出当前目录", "hint": "直接输入 pwd", "success_test": "cmd == 'pwd'", "solution": [ "pwd" ], "success_msg": "你已经能确认自己所在的位置了。" }, { "id": "m1_l1_e3", "type": "scenario", "question": "如果你不确定自己当前在哪个目录,第一反应应该做什么?", "answer": "先执行 pwd 确认当前目录" } ], "related_commands": [ "pwd" ], "classic_view": "教材视角:Linux 入门首先不是背命令,而是建立“目录、路径、文件”这套基础空间感。", "takeaways": [ "学完后应能做到:理解当前工作目录的意义,知道自己在文件系统中的位置。", "易错提醒:以为终端默认总在同一个目录", "迁移场景:切目录后确认自己到了哪里" ], "after_class": "课后建议:回到真实或模拟环境里,再用 pwd 做一次独立练习,并尝试自己解释每条输出的含义。" }, { "id": "m1_l2_ls", "title": "看见目录内容:ls", "goal": "理解 ls 的作用,并掌握查看隐藏文件和详细信息的基本方式。", "why_it_matters": "Linux 下很多探索行为都从 ls 开始,它决定你如何观察目录结构。", "concepts": [ "目录内容查看", "隐藏文件", "长列表信息" ], "command": "ls", "examples": [ "ls", "ls -la", "ls -lh /etc" ], "pitfalls": [ "误以为 ls 看不到的文件就不存在", "不会区分普通 ls 和 ls -l 的用途" ], "scenarios": [ "排查目录里到底有哪些文件", "检查配置目录中是否有隐藏文件" ], "exercises": [ { "id": "m1_l2_e1", "type": "understanding", "question": "为什么 ls -a 会比 ls 多看到一些文件?", "answer": "因为它会显示隐藏文件,包括以点开头的文件" }, { "id": 
"m1_l2_e2", "type": "operation", "title": "列出当前目录内容", "hint": "输入 ls", "success_test": "cmd == 'ls'", "solution": [ "ls" ], "success_msg": "你已经会观察目录内容了。" }, { "id": "m1_l2_e3", "type": "operation", "title": "显示隐藏文件和详细信息", "hint": "使用 ls -la", "success_test": "cmd == 'ls -la' or cmd == 'ls -al'", "solution": [ "ls -la", "ls -al" ], "success_msg": "你已经会用更完整的方式查看目录了。" } ], "related_commands": [ "ls" ], "classic_view": "教材视角:Linux 入门首先不是背命令,而是建立“目录、路径、文件”这套基础空间感。", "takeaways": [ "学完后应能做到:理解 ls 的作用,并掌握查看隐藏文件和详细信息的基本方式。", "易错提醒:误以为 ls 看不到的文件就不存在", "迁移场景:排查目录里到底有哪些文件" ], "after_class": "课后建议:回到真实或模拟环境里,再用 ls 做一次独立练习,并尝试自己解释每条输出的含义。" }, { "id": "m1_l3_cd_cat_echo", "title": "移动、读文件、输出文本", "goal": "掌握 cd、cat、echo 这些最基础但最常用的命令。", "why_it_matters": "这三个命令几乎贯穿 Linux 入门阶段的所有练习。", "concepts": [ "切换目录", "读取文件", "输出文本与变量" ], "command": "cd / cat / echo", "examples": [ "cd /tmp", "cat /etc/hosts", "echo Hello Linux" ], "pitfalls": [ "把 cd 和 ls 混用", "用 cat 去看过大的文件", "不知道 echo 也常用于脚本调试" ], "scenarios": [ "进入指定目录继续操作", "快速读取配置文件", "验证变量和命令输出" ], "exercises": [ { "id": "m1_l3_e1", "type": "operation", "title": "进入 /tmp 目录", "hint": "cd /tmp", "success_test": "cmd == 'cd /tmp' and cwd == '/tmp'", "solution": [ "cd /tmp" ], "success_msg": "你已经能切换到目标目录了。" }, { "id": "m1_l3_e2", "type": "operation", "title": "读取 hosts 文件", "hint": "cat /etc/hosts", "success_test": "cmd == 'cat /etc/hosts' and 'localhost' in output", "solution": [ "cat /etc/hosts" ], "success_msg": "你已经会读取基础文本文件了。" }, { "id": "m1_l3_e3", "type": "operation", "title": "输出 Hello Linux", "hint": "echo Hello Linux", "success_test": "cmd == 'echo Hello Linux' and 'Hello Linux' in output", "solution": [ "echo Hello Linux" ], "success_msg": "你已经掌握了最基础的文本输出命令。" } ], "related_commands": [ "cd", "cat", "echo" ], "classic_view": "教材视角:Linux 入门首先不是背命令,而是建立“目录、路径、文件”这套基础空间感。", "takeaways": [ "学完后应能做到:掌握 cd、cat、echo 这些最基础但最常用的命令。", "易错提醒:把 cd 和 ls 混用", "迁移场景:进入指定目录继续操作" ], "after_class": "课后建议:回到真实或模拟环境里,再用 cd、cat、echo 
做一次独立练习,并尝试自己解释每条输出的含义。" } ] }, { "id": "module_2_filesystem", "title": "模块 2:文件与目录操作", "summary": "围绕创建、复制、移动、删除和查看文件属性建立文件系统操作能力。", "lessons": [ { "id": "m2_l1_create", "title": "创建文件与目录:mkdir / touch", "goal": "理解目录和文件的创建逻辑,学会递归创建多级目录。", "why_it_matters": "很多项目初始化、环境准备都从创建目录结构开始。", "concepts": [ "目录创建", "多级目录", "空文件创建" ], "command": "mkdir / touch", "examples": [ "mkdir demo", "mkdir -p /tmp/a/b/c", "touch notes.txt" ], "pitfalls": [ "忘记使用 -p 创建多级目录", "目标父目录不存在时 touch 失败" ], "scenarios": [ "初始化项目目录结构", "创建占位文件和日志文件" ], "exercises": [ { "id": "m2_l1_e1", "type": "operation", "title": "递归创建目录", "hint": "mkdir -p /tmp/a/b/c", "success_test": "cmd == 'mkdir -p /tmp/a/b/c' and exists('/tmp/a/b/c')", "solution": [ "mkdir -p /tmp/a/b/c" ], "success_msg": "多级目录创建成功。" }, { "id": "m2_l1_e2", "type": "operation", "title": "创建空文件", "hint": "touch /tmp/a/b/c/readme.txt", "success_test": "cmd == 'touch /tmp/a/b/c/readme.txt' and exists('/tmp/a/b/c/readme.txt')", "solution": [ "touch /tmp/a/b/c/readme.txt" ], "success_msg": "空文件创建成功。" }, { "id": "m2_l1_e3", "type": "scenario", "question": "为什么 mkdir -p 适合项目初始化?", "answer": "因为它可以一次创建多级目录,即使上层目录不存在也能自动补齐" } ], "related_commands": [ "mkdir", "touch" ], "classic_view": "教材视角:文件系统操作是 Linux 使用的基本手艺,关键不只是会敲命令,而是知道每一步在改变什么。", "takeaways": [ "学完后应能做到:理解目录和文件的创建逻辑,学会递归创建多级目录。", "易错提醒:忘记使用 -p 创建多级目录", "迁移场景:初始化项目目录结构" ], "after_class": "课后建议:回到真实或模拟环境里,再用 mkdir、touch 做一次独立练习,并尝试自己解释每条输出的含义。" }, { "id": "m2_l2_move_copy_delete", "title": "复制、移动与删除:cp / mv / rm", "goal": "理解文件操作中的备份、迁移、重命名和清理。", "why_it_matters": "日常 Linux 使用里最常见的就是处理文件的生命周期。", "concepts": [ "复制与备份", "移动与重命名", "删除风险" ], "command": "cp / mv / rm", "examples": [ "cp /etc/hosts /tmp/hosts.bak", "mv old.txt new.txt", "rm -r /tmp/testdir" ], "pitfalls": [ "把删除当成移动", "对目录使用 cp 却忘记 -r", "rm -rf 风险极高" ], "scenarios": [ "做配置备份", "整理日志文件", "清理无用目录" ], "exercises": [ { "id": "m2_l2_e1", "type": "operation", "title": "复制 hosts 文件", "hint": "cp /etc/hosts /tmp/hosts.bak", 
"success_test": "cmd == 'cp /etc/hosts /tmp/hosts.bak' and exists('/tmp/hosts.bak')", "solution": [ "cp /etc/hosts /tmp/hosts.bak" ], "success_msg": "文件备份成功。" }, { "id": "m2_l2_e2", "type": "operation", "title": "重命名备份文件", "hint": "mv /tmp/hosts.bak /tmp/hosts.backup", "success_test": "cmd == 'mv /tmp/hosts.bak /tmp/hosts.backup' and exists('/tmp/hosts.backup')", "solution": [ "mv /tmp/hosts.bak /tmp/hosts.backup" ], "success_msg": "文件重命名成功。" }, { "id": "m2_l2_e3", "type": "understanding", "question": "为什么 rm -rf 是高风险命令?", "answer": "因为它会递归并强制删除文件和目录,执行错误会造成不可恢复的数据丢失" } ], "related_commands": [ "cp", "mv", "rm" ], "classic_view": "教材视角:文件系统操作是 Linux 使用的基本手艺,关键不只是会敲命令,而是知道每一步在改变什么。", "takeaways": [ "学完后应能做到:理解文件操作中的备份、迁移、重命名和清理。", "易错提醒:把删除当成移动", "迁移场景:做配置备份" ], "after_class": "课后建议:回到真实或模拟环境里,再用 cp、mv、rm 做一次独立练习,并尝试自己解释每条输出的含义。" }, { "id": "m2_l3_stat_permissions", "title": "认识文件属性:stat 与权限基础", "goal": "开始理解文件属性和权限表达。", "why_it_matters": "文件权限是 Linux 系统安全和协作的重要基础。", "concepts": [ "文件元信息", "权限三元组", "目录与文件权限差异" ], "command": "stat / chmod", "examples": [ "stat /etc/hosts", "chmod 755 script.sh", "chmod +x run.sh" ], "pitfalls": [ "不了解 755 / 644 的含义", "给不该执行的文件随意加执行权限" ], "scenarios": [ "检查脚本是否可执行", "排查权限导致的运行失败" ], "exercises": [ { "id": "m2_l3_e1", "type": "operation", "title": "查看 hosts 属性", "hint": "stat /etc/hosts", "success_test": "cmd == 'stat /etc/hosts' and 'File:' in output", "solution": [ "stat /etc/hosts" ], "success_msg": "你已经会查看文件属性了。" }, { "id": "m2_l3_e2", "type": "understanding", "question": "755 和 644 最核心的区别是什么?", "answer": "755 允许拥有者读写执行,其他人读执行;644 没有执行权限" }, { "id": "m2_l3_e3", "type": "operation", "title": "给文件添加执行权限", "hint": "chmod +x /tmp/a/b/c/readme.txt", "success_test": "cmd == 'chmod +x /tmp/a/b/c/readme.txt'", "solution": [ "chmod +x /tmp/a/b/c/readme.txt" ], "success_msg": "你已经完成了权限修改练习。" } ], "related_commands": [ "stat", "chmod" ], "classic_view": "教材视角:文件系统操作是 Linux 使用的基本手艺,关键不只是会敲命令,而是知道每一步在改变什么。", "takeaways": [ 
"学完后应能做到:开始理解文件属性和权限表达。", "易错提醒:不了解 755 / 644 的含义", "迁移场景:检查脚本是否可执行" ], "after_class": "课后建议:回到真实或模拟环境里,再用 stat、chmod 做一次独立练习,并尝试自己解释每条输出的含义。" } ] }, { "id": "module_3_searching", "title": "模块 3:阅读与筛选信息", "summary": "把 Linux 当成信息检索工具来学,围绕日志、配置和统计建立阅读能力。", "lessons": [ { "id": "m3_l1_read_logs", "title": "看文件头尾:head / tail", "goal": "学会快速读取大文件的局部内容。", "why_it_matters": "日志通常很大,不可能总是整份去看。", "concepts": [ "查看前几行", "查看后几行", "实时追踪" ], "command": "head / tail", "examples": [ "head -n 5 /var/log/syslog", "tail -n 20 /var/log/syslog", "tail -f /var/log/syslog" ], "pitfalls": [ "大文件直接 cat 影响阅读效率", "不会区分查看历史和跟踪新增日志" ], "scenarios": [ "看配置文件开头", "盯日志尾部排查实时错误" ], "exercises": [ { "id": "m3_l1_e1", "type": "operation", "title": "查看 syslog 前 5 行", "hint": "head -n 5 /var/log/syslog", "success_test": "(cmd == 'head -n 5 /var/log/syslog' or cmd == 'head -5 /var/log/syslog') and len(output.split('\\n')) >= 5", "solution": [ "head -n 5 /var/log/syslog", "head -5 /var/log/syslog" ], "success_msg": "你已经会局部查看大文件开头了。" }, { "id": "m3_l1_e2", "type": "operation", "title": "查看 syslog 最后 3 行", "hint": "tail -n 3 /var/log/syslog", "success_test": "(cmd == 'tail -n 3 /var/log/syslog' or cmd == 'tail -3 /var/log/syslog') and len(output.split('\\n')) >= 3", "solution": [ "tail -n 3 /var/log/syslog", "tail -3 /var/log/syslog" ], "success_msg": "你已经会快速查看日志尾部了。" }, { "id": "m3_l1_e3", "type": "scenario", "question": "为什么排查线上报错时更常先用 tail 而不是 cat?", "answer": "因为日志通常很大,tail 可以更快聚焦最近发生的问题" } ], "related_commands": [ "head", "tail" ], "classic_view": "教材视角:日志与文本处理是运维的核心阅读能力,grep / find / tail 不是零散命令,而是一套信息筛选工具链。", "takeaways": [ "学完后应能做到:学会快速读取大文件的局部内容。", "易错提醒:大文件直接 cat 影响阅读效率", "迁移场景:看配置文件开头" ], "after_class": "课后建议:回到真实或模拟环境里,再用 head、tail 做一次独立练习,并尝试自己解释每条输出的含义。" }, { "id": "m3_l2_grep", "title": "关键词搜索:grep", "goal": "理解 grep 作为日志排障和文本定位核心工具的价值。", "why_it_matters": "没有 grep,查日志和配置会慢很多。", "concepts": [ "大小写忽略", "显示行号", "反向匹配", "递归搜索" ], "command": "grep", "examples": [ "grep error /var/log/syslog", 
"grep -in root /etc/passwd", "grep -v nologin /etc/passwd" ], "pitfalls": [ "不会结合 -n 定位行号", "不知道 -i 和 -v 的常见用途" ], "scenarios": [ "查错误日志", "找配置项", "过滤无效行" ], "exercises": [ { "id": "m3_l2_e1", "type": "operation", "title": "查找 syslog 中的 error", "hint": "grep error /var/log/syslog", "success_test": "cmd == 'grep error /var/log/syslog' and 'error' in output.lower()", "solution": [ "grep error /var/log/syslog" ], "success_msg": "你已经会在日志里搜关键词了。" }, { "id": "m3_l2_e2", "type": "operation", "title": "忽略大小写搜索 root", "hint": "grep -i root /etc/passwd", "success_test": "cmd == 'grep -i root /etc/passwd'", "solution": [ "grep -i root /etc/passwd" ], "success_msg": "你已经知道如何处理大小写差异了。" }, { "id": "m3_l2_e3", "type": "understanding", "question": "grep -n 的意义是什么?", "answer": "显示匹配结果所在的行号,方便快速定位原文位置" } ], "related_commands": [ "grep" ], "classic_view": "教材视角:日志与文本处理是运维的核心阅读能力,grep / find / tail 不是零散命令,而是一套信息筛选工具链。", "takeaways": [ "学完后应能做到:理解 grep 作为日志排障和文本定位核心工具的价值。", "易错提醒:不会结合 -n 定位行号", "迁移场景:查错误日志" ], "after_class": "课后建议:回到真实或模拟环境里,再用 grep 做一次独立练习,并尝试自己解释每条输出的含义。" }, { "id": "m3_l3_find_wc_sort", "title": "查找与统计:find / wc / sort", "goal": "建立查找文件和做基础统计的能力。", "why_it_matters": "Linux 的很多效率来自组合式查找与统计。", "concepts": [ "按名称查找", "行数字数统计", "排序输出" ], "command": "find / wc / sort", "examples": [ "find /etc -name '*.conf'", "wc -l /var/log/syslog", "ls | sort" ], "pitfalls": [ "把 find 和 grep 混淆", "不会根据任务选文件查找还是内容查找" ], "scenarios": [ "找配置文件", "统计日志行数", "整理输出结果" ], "exercises": [ { "id": "m3_l3_e1", "type": "operation", "title": "查找 /etc 下所有 .conf 文件", "hint": "find /etc -name '*.conf'", "success_test": "cmd == \"find /etc -name '*.conf'\" and '.conf' in output", "solution": [ "find /etc -name '*.conf'" ], "success_msg": "你已经会用 find 定位文件了。" }, { "id": "m3_l3_e2", "type": "operation", "title": "统计 syslog 行数", "hint": "wc -l /var/log/syslog", "success_test": "cmd == 'wc -l /var/log/syslog' and output.strip().split(' ')[0].isdigit()", "solution": [ "wc -l /var/log/syslog" ], "success_msg":
"你已经会做基础统计了。" }, { "id": "m3_l3_e3", "type": "understanding", "question": "找文件位置应该优先想到 find 还是 grep?为什么?", "answer": "优先用 find,因为这是文件定位问题,不是文件内容搜索问题" } ], "related_commands": [ "find", "wc", "sort" ], "classic_view": "教材视角:日志与文本处理是运维的核心阅读能力,grep / find / tail 不是零散命令,而是一套信息筛选工具链。", "takeaways": [ "学完后应能做到:建立查找文件和做基础统计的能力。", "易错提醒:把 find 和 grep 混淆", "迁移场景:找配置文件" ], "after_class": "课后建议:回到真实或模拟环境里,再用 find、wc、sort 做一次独立练习,并尝试自己解释每条输出的含义。" } ] }, { "id": "module_4_system_state", "title": "模块 4:系统状态与资源认知", "summary": "学习如何看进程、负载、磁盘、内存和挂载信息,建立系统状态判断能力。", "lessons": [ { "id": "m4_l1_process", "title": "看进程:ps / top", "goal": "理解 Linux 中的进程概念,知道如何查看系统正在运行什么。", "why_it_matters": "绝大多数服务故障、卡顿和异常都要先看进程。", "concepts": [ "进程与服务", "前台与后台", "ps 和 top 的区别" ], "command": "ps / top", "examples": [ "ps aux", "ps -ef", "top" ], "pitfalls": [ "只会看进程名,不会看状态", "把存在进程等同于服务可用" ], "scenarios": [ "确认服务进程是否存在", "定位高 CPU 进程" ], "exercises": [ { "id": "m4_l1_e1", "type": "operation", "title": "查看所有进程", "hint": "ps aux", "success_test": "cmd == 'ps aux' and 'PID' in output", "solution": [ "ps aux" ], "success_msg": "你已经会查看系统进程了。" }, { "id": "m4_l1_e2", "type": "understanding", "question": "为什么看到进程存在,不代表服务一定可用?", "answer": "因为进程存在只说明程序在运行,不代表端口监听、配置、依赖或接口一定正常" }, { "id": "m4_l1_e3", "type": "scenario", "question": "排查“服务似乎没启动”时,第一步通常可以用什么命令?", "answer": "先用 ps aux 或 ps -ef 查看相关进程是否存在" } ], "related_commands": [ "ps", "top" ], "classic_view": "教材视角:系统状态认知是运维的基本盘,先学会“看懂机器”,再谈优化和修复。", "takeaways": [ "学完后应能做到:理解 Linux 中的进程概念,知道如何查看系统正在运行什么。", "易错提醒:只会看进程名,不会看状态", "迁移场景:确认服务进程是否存在", "形成分层排障顺序,而不是遇到问题就随手试命令。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 ps、top 做一次独立练习,并尝试自己解释每条输出的含义。", "troubleshooting_flow": [ "先确认服务对应的进程是否存在", "再看进程状态、CPU、内存占用是否异常", "如果进程存在但服务不可用,再继续看端口和日志", "不要把“有进程”误判成“服务正常”" ] }, { "id": "m4_l2_disk_memory", "title": "看磁盘与内存:df / du / free", "goal": "掌握查看磁盘使用、目录占用和内存情况的基础方法。", "why_it_matters": "磁盘爆满、内存紧张是最常见的线上问题之一。", "concepts": [ "磁盘空间 vs 目录占用", "物理内存与可用内存", "df 和 du 的区别" ], "command": 
"df / du / free", "examples": [ "df -h", "du -sh /var/log", "free -h" ], "pitfalls": [ "只会看总磁盘,不会看哪个目录占用大", "误把 free 的 used 当成唯一关键指标" ], "scenarios": [ "排查磁盘已满", "定位大目录", "查看内存是否紧张" ], "exercises": [ { "id": "m4_l2_e1", "type": "operation", "title": "查看磁盘空间", "hint": "df -h", "success_test": "cmd == 'df -h' and 'Filesystem' in output", "solution": [ "df -h" ], "success_msg": "你已经会看磁盘使用情况了。" }, { "id": "m4_l2_e2", "type": "operation", "title": "查看 /sandbox 目录大小", "hint": "du -sh /sandbox", "success_test": "cmd == 'du -sh /sandbox' and '/sandbox' in output", "solution": [ "du -sh /sandbox" ], "success_msg": "你已经会看目录占用了。" }, { "id": "m4_l2_e3", "type": "understanding", "question": "df 和 du 的核心区别是什么?", "answer": "df 看文件系统层面的磁盘使用,du 看目录或文件占用大小" } ], "related_commands": [ "df", "du", "free" ], "classic_view": "教材视角:系统状态认知是运维的基本盘,先学会“看懂机器”,再谈优化和修复。", "takeaways": [ "学完后应能做到:掌握查看磁盘使用、目录占用和内存情况的基础方法。", "易错提醒:只会看总磁盘,不会看哪个目录占用大", "迁移场景:排查磁盘已满", "形成分层排障顺序,而不是遇到问题就随手试命令。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 df、du、free 做一次独立练习,并尝试自己解释每条输出的含义。", "troubleshooting_flow": [ "先用 df 确认是哪个文件系统空间不足", "再用 du 逐层定位哪个目录占用最大", "必要时结合 find 找出大文件", "清理前先确认文件用途与是否还能用于排障" ] }, { "id": "m4_l3_mount_history", "title": "运行时间、挂载点与历史命令", "goal": "建立系统运行时间、挂载结构与命令习惯认知。", "why_it_matters": "理解机器运行了多久、磁盘挂载在哪里、最近执行过什么命令,是运维日常的基础信息。", "concepts": [ "uptime 的含义", "挂载点", "history 复盘" ], "command": "uptime / mount / history", "examples": [ "uptime", "mount", "history 5" ], "pitfalls": [ "不看历史重复犯错", "忽略挂载点导致排查路径错位" ], "scenarios": [ "查看机器是否重启过", "判断目录属于哪个挂载点", "复盘最近操作" ], "exercises": [ { "id": "m4_l3_e1", "type": "operation", "title": "查看系统运行时间", "hint": "uptime", "success_test": "cmd == 'uptime' and 'load average' in output", "solution": [ "uptime" ], "success_msg": "你已经会看系统运行时间和负载了。" }, { "id": "m4_l3_e2", "type": "operation", "title": "查看最近命令历史", "hint": "history 5", "success_test": "(cmd == 'history 5' or cmd == 'history -n 5') and output != ''", "solution": [ "history 5", "history -n 5" ], "success_msg": "你已经会利用历史命令回顾操作了。" }, {
"id": "m4_l3_e3", "type": "scenario", "question": "为什么排查问题时查看 history 很有价值?", "answer": "因为它可以帮助回溯最近做过什么操作,快速定位变更和可能的触发点" } ], "related_commands": [ "uptime", "mount", "history" ], "classic_view": "教材视角:系统状态认知是运维的基本盘,先学会“看懂机器”,再谈优化和修复。", "takeaways": [ "学完后应能做到:建立系统运行时间、挂载结构与命令习惯认知。", "易错提醒:不看历史重复犯错", "迁移场景:查看机器是否重启过", "形成分层排障顺序,而不是遇到问题就随手试命令。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 uptime、mount、history 做一次独立练习,并尝试自己解释每条输出的含义。", "troubleshooting_flow": [ "先看 uptime 确认系统是否近期重启", "再看 mount 判断关键目录属于哪个挂载点", "最后回看 history 了解最近做过什么变更", "把系统状态和操作历史结合起来看" ] } ] }, { "id": "module_5_service_logs", "title": "模块 5:服务与日志排障", "summary": "围绕服务启动、运行状态、日志报错和后台执行建立排障链路。", "lessons": [ { "id": "m5_l1_systemctl", "title": "服务管理:systemctl 基础", "goal": "理解 Linux 服务的查看、启动、停止和重启。", "why_it_matters": "现代 Linux 发行版大量使用 systemd 管理服务。", "concepts": [ "服务状态", "启动与重启", "systemd 基础" ], "command": "systemctl", "examples": [ "systemctl status nginx", "systemctl restart nginx", "systemctl enable nginx", "systemctl status app.service", "systemctl restart app.service", "systemctl is-enabled nginx" ], "pitfalls": [ "改完配置却忘记重启服务", "只看页面,不看服务状态", "把 restart 当成排障终点,而不是排查起点" ], "scenarios": [ "排查服务没起来", "改配置后让服务生效", "配置变更后重新加载服务", "确认服务是否开机自启" ], "exercises": [ { "id": "m5_l1_e1", "type": "understanding", "question": "为什么改完服务配置后常常要 restart 或 reload?", "answer": "因为配置文件变化不会自动生效,需要让服务重新加载配置" }, { "id": "m5_l1_e2", "type": "scenario", "question": "排查“网站打不开”时,为什么应该先看 systemctl status?", "answer": "因为要先确认服务是否真的在运行,以及是否有明显启动失败信息" }, { "id": "m5_l1_e3", "type": "understanding", "question": "enable 和 start 的区别是什么?", "answer": "start 是当前立即启动,enable 是设置开机自动启动" } ], "related_commands": [ "systemctl" ], "classic_view": "教材视角:服务排障要形成链路思维——状态、进程、端口、日志、请求,要分层观察。", "takeaways": [ "学完后应能做到:理解 Linux 服务的查看、启动、停止和重启。", "易错提醒:改完配置却忘记重启服务", "迁移场景:排查服务没起来", "服务问题先看状态,再决定下一步看日志、端口还是配置。", "形成分层排障顺序,而不是遇到问题就随手试命令。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 systemctl 做一次独立练习,并尝试自己解释每条输出的含义。", "troubleshooting_flow": [ "先看 systemctl 
status,确认服务到底是不是 running", "再看是否有明显的启动失败或退出提示", "如果状态异常,再进入日志层和端口层", "不要一上来就盲目 restart 多次" ] }, { "id": "m5_l2_journalctl", "title": "看系统日志:journalctl", "goal": "理解如何查看服务日志和系统日志。", "why_it_matters": "很多 systemd 管理的服务排障入口就是 journalctl。", "concepts": [ "单服务日志", "最近日志", "实时跟踪日志" ], "command": "journalctl", "examples": [ "journalctl -u nginx -n 50", "journalctl -xe", "journalctl -u app -f", "journalctl -u app.service -n 100", "journalctl -u nginx -f", "journalctl --since today" ], "pitfalls": [ "只看应用日志,不看 systemd 日志", "看太多日志却抓不到最近错误", "不限定服务名导致日志范围过大,难以定位" ], "scenarios": [ "查看服务启动失败原因", "查看最近报错", "定位服务启动失败的关键报错", "观察重启前后日志变化" ], "exercises": [ { "id": "m5_l2_e1", "type": "understanding", "question": "为什么 journalctl 对 systemd 服务排障特别重要?", "answer": "因为它能直接查看服务生命周期和 systemd 记录的日志" }, { "id": "m5_l2_e2", "type": "scenario", "question": "服务启动失败后,下一步除了看 status 还应该看什么?", "answer": "看 journalctl -u 服务名 的日志,确认具体报错" }, { "id": "m5_l2_e3", "type": "understanding", "question": "为什么实时排查时常用 -f?", "answer": "因为 -f 可以持续跟踪新增日志,适合边操作边观察" } ], "related_commands": [ "journalctl" ], "classic_view": "教材视角:服务排障要形成链路思维——状态、进程、端口、日志、请求,要分层观察。", "takeaways": [ "学完后应能做到:理解如何查看服务日志和系统日志。", "易错提醒:只看应用日志,不看 systemd 日志", "迁移场景:查看服务启动失败原因", "日志不是越多越好,关键是缩小范围看最近、看目标服务。", "形成分层排障顺序,而不是遇到问题就随手试命令。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 journalctl 做一次独立练习,并尝试自己解释每条输出的含义。", "troubleshooting_flow": [ "先限定服务名缩小日志范围", "优先看最近几十行,不要一开始把范围拉太大", "定位到关键报错后再回溯上下文", "边操作边用 -f 观察实时变化" ] }, { "id": "m5_l3_process_control", "title": "进程控制:kill / pkill / nohup", "goal": "理解如何控制进程和让任务脱离终端运行。", "why_it_matters": "部署、排障和临时任务常会遇到进程管理问题。", "concepts": [ "发送信号", "按名称结束进程", "后台运行" ], "command": "kill / pkill / nohup", "examples": [ "kill 1234", "pkill nginx", "nohup python3 app.py &", "kill -9 1234", "pkill -f python", "nohup bash backup.sh &" ], "pitfalls": [ "直接粗暴 kill 掉关键进程", "不知道后台任务输出去哪了", "不了解信号差异就直接使用 -9" ], "scenarios": [ "结束卡死进程", "让脚本后台运行", "结束僵死任务", "让临时脚本脱离终端继续执行" ], "exercises": [ { "id": "m5_l3_e1", "type": 
"understanding", "question": "为什么 kill 进程前要先确认 PID 和进程身份?", "answer": "因为误杀错误进程可能导致服务中断或数据问题" }, { "id": "m5_l3_e2", "type": "understanding", "question": "nohup 的作用是什么?", "answer": "让命令在退出终端后继续运行,适合后台任务" }, { "id": "m5_l3_e3", "type": "scenario", "question": "如果你想让一个脚本关掉 SSH 后仍然继续跑,应该想到什么?", "answer": "使用 nohup 或其他后台运行方式" } ], "related_commands": [ "kill", "pkill", "nohup" ], "classic_view": "教材视角:服务排障要形成链路思维——状态、进程、端口、日志、请求,要分层观察。", "takeaways": [ "学完后应能做到:理解如何控制进程和让任务脱离终端运行。", "易错提醒:直接粗暴 kill 掉关键进程", "迁移场景:结束卡死进程", "进程控制的重点是知道为什么结束、结束谁、结束后系统会怎样。", "形成分层排障顺序,而不是遇到问题就随手试命令。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 kill、pkill、nohup 做一次独立练习,并尝试自己解释每条输出的含义。", "troubleshooting_flow": [ "确认要处理的是哪个进程", "评估结束进程会不会影响业务", "优先选择合理方式终止,不要默认暴力 kill -9", "需要后台运行任务时再考虑 nohup" ] } ] }, { "id": "module_6_network", "title": "模块 6:网络与连接排查", "summary": "建立 IP、端口、监听、连通性和请求验证等运维网络基础。", "lessons": [ { "id": "m6_l1_ip_ping", "title": "网络基础:ip addr / ifconfig / ping", "goal": "理解网卡、IP 和连通性的基本概念。", "why_it_matters": "服务是否可达,首先是网络问题还是应用问题,需要先分清。", "concepts": [ "网卡", "IP 地址", "连通性测试" ], "command": "ip addr / ifconfig / ping", "examples": [ "ip addr", "ifconfig", "ping 127.0.0.1", "ip addr show eth0", "ping 192.168.1.1", "ping -c 4 example.com" ], "pitfalls": [ "能 ping 通就以为服务一定可用", "只会看 IP,不理解监听端口", "把 DNS 解析失败误判成网络完全不通" ], "scenarios": [ "确认机器是否有正确 IP", "测试目标是否能连通", "确认目标机器有无 IP", "初步判断网络层是否通" ], "exercises": [ { "id": "m6_l1_e1", "type": "operation", "title": "查看网卡地址", "hint": "ip addr", "success_test": "cmd == 'ip addr' and 'inet' in output", "solution": [ "ip addr" ], "success_msg": "你已经会看基本网卡信息了。" }, { "id": "m6_l1_e2", "type": "operation", "title": "测试本机回环连通性", "hint": "ping 127.0.0.1", "success_test": "cmd == 'ping 127.0.0.1' and 'packet loss' in output", "solution": [ "ping 127.0.0.1" ], "success_msg": "你已经做了一次基础连通性验证。" }, { "id": "m6_l1_e3", "type": "understanding", "question": "为什么 ping 通不等于服务一定可用?", "answer": "因为 ping 只说明网络层连通,不代表应用端口和接口层面正常" } ], "related_commands": [ "ip addr", 
"ifconfig", "ping" ], "classic_view": "教材视角:网络问题最怕混层,学习要区分链路、端口、协议、请求,不要一股脑都归为“网络不通”。", "takeaways": [ "学完后应能做到:理解网卡、IP 和连通性的基本概念。", "易错提醒:能 ping 通就以为服务一定可用", "迁移场景:确认机器是否有正确 IP", "网络排查第一步是先确认链路和地址,再看更上层。", "形成分层排障顺序,而不是遇到问题就随手试命令。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 ip addr、ifconfig、ping 做一次独立练习,并尝试自己解释每条输出的含义。", "troubleshooting_flow": [ "先确认机器有没有拿到正确 IP", "再用 ping 验证基础连通性", "如果 ping 不通,优先怀疑网络层或地址层问题", "如果 ping 通,再继续检查端口和应用层" ] }, { "id": "m6_l2_ss_curl", "title": "端口与请求:ss / netstat / curl / wget", "goal": "建立监听端口和服务请求验证的能力。", "why_it_matters": "运维排障很多时候要回答两个问题:端口开没开?接口通不通?", "concepts": [ "端口监听", "TCP 层可达性", "HTTP 请求验证" ], "command": "ss / netstat / curl / wget", "examples": [ "ss -ltnp", "netstat -tunlp", "curl http://127.0.0.1:8080/health", "ss -ltnp | grep 80", "curl -I http://127.0.0.1", "wget http://127.0.0.1" ], "pitfalls": [ "只看页面打不开,不查监听", "只看监听,不测实际请求", "只看 LISTEN 不看实际响应码和返回体" ], "scenarios": [ "查服务是否监听端口", "测试接口是否返回 200", "确认 Web 服务是否监听 80 端口", "确认 HTTP 健康检查是否正常" ], "exercises": [ { "id": "m6_l2_e1", "type": "operation", "title": "查看监听端口", "hint": "ss -ltnp", "success_test": "cmd == 'ss -ltnp' and 'LISTEN' in output", "solution": [ "ss -ltnp" ], "success_msg": "你已经会看监听端口了。" }, { "id": "m6_l2_e2", "type": "operation", "title": "请求本地页面", "hint": "curl http://127.0.0.1", "success_test": "cmd == 'curl http://127.0.0.1' and output != ''", "solution": [ "curl http://127.0.0.1" ], "success_msg": "你已经会做基本 HTTP 探测了。" }, { "id": "m6_l2_e3", "type": "scenario", "question": "排查“服务起了但访问失败”时,为什么要同时看 ss 和 curl?", "answer": "因为 ss 看端口监听,curl 看应用层响应,两者结合才能判断问题在哪一层" } ], "related_commands": [ "ss", "netstat", "curl", "wget" ], "classic_view": "教材视角:网络问题最怕混层,学习要区分链路、端口、协议、请求,不要一股脑都归为“网络不通”。", "takeaways": [ "学完后应能做到:建立监听端口和服务请求验证的能力。", "易错提醒:只看页面打不开,不查监听", "迁移场景:查服务是否监听端口", "监听正常不代表业务正常,请求失败也不一定是服务没启动。", "形成分层排障顺序,而不是遇到问题就随手试命令。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 ss、netstat、curl、wget 做一次独立练习,并尝试自己解释每条输出的含义。", "troubleshooting_flow": [ "先看端口是否监听", "再用 curl
验证应用层是否有返回", "如果监听正常但请求异常,再结合日志判断应用问题", "如果根本没监听,先回到服务状态层排查" ] }, { "id": "m6_l3_name_route", "title": "路由与名称解析:traceroute / dig / which / whereis 命令定位", "goal": "建立链路定位和名称解析基础认知。", "why_it_matters": "有些故障不是服务本身坏了,而是路径或解析出了问题。", "concepts": [ "路由路径", "DNS 解析", "命令位置定位" ], "command": "traceroute / dig / which / whereis", "examples": [ "traceroute 8.8.8.8", "dig example.com", "which curl", "whereis nginx", "dig api.example.com", "traceroute example.com" ], "pitfalls": [ "把 DNS 问题误判成应用问题", "不知道命令来自哪里", "忽略 DNS TTL 和缓存带来的影响" ], "scenarios": [ "排查域名异常", "确认命令路径和来源", "排查域名切换未生效", "确认命令实际安装位置" ], "exercises": [ { "id": "m6_l3_e1", "type": "operation", "title": "定位 curl 命令路径", "hint": "which curl", "success_test": "cmd == 'which curl' and '/usr/bin/curl' in output", "solution": [ "which curl" ], "success_msg": "你已经会定位命令路径了。" }, { "id": "m6_l3_e2", "type": "understanding", "question": "为什么 DNS 出问题时,服务本身可能没坏但用户仍然访问失败?", "answer": "因为域名解析不到正确 IP,流量根本到不了目标服务" }, { "id": "m6_l3_e3", "type": "scenario", "question": "排查“域名不通”时除了 curl,还应该想到什么?", "answer": "还应该检查 dig/nslookup 和网络路径,确认是不是解析或链路问题" } ], "related_commands": [ "traceroute", "dig", "which", "whereis" ], "classic_view": "教材视角:网络问题最怕混层,学习要区分链路、端口、协议、请求,不要一股脑都归为“网络不通”。", "takeaways": [ "学完后应能做到:建立链路定位和名称解析基础认知。", "易错提醒:把 DNS 问题误判成应用问题", "迁移场景:排查域名异常", "命令定位、解析路径和网络链路,都是“看不见的问题”的排查入口。", "形成分层排障顺序,而不是遇到问题就随手试命令。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 traceroute、dig、which、whereis 做一次独立练习,并尝试自己解释每条输出的含义。", "troubleshooting_flow": [ "先确认命令或服务实际路径", "再检查域名解析是否正确", "必要时查看路由链路是否异常", "把“命令路径 / DNS / 路由”当成三类不同问题" ] } ] }, { "id": "module_7_security", "title": "模块 7:权限、用户与安全基础", "summary": "围绕用户、组、权限和高风险操作建立 Linux 安全基本认知。", "lessons": [ { "id": "m7_l1_users", "title": "用户与身份:whoami / id / passwd / su", "goal": "理解当前身份、用户组和密码变更的意义。", "why_it_matters": "你是谁、你属于谁、你能做什么,是 Linux 安全的最基础问题。", "concepts": [ "当前用户", "用户组", "身份切换" ], "command": "whoami / id / passwd / su", "examples": [ "whoami", "id", "passwd", "id sandbox_user", "su - root", "passwd
sandbox_user" ], "pitfalls": [ "不知道自己当前权限边界", "以为所有命令都能执行", "不知道服务身份和登录用户身份可能不同" ], "scenarios": [ "确认当前身份", "看自己属于哪些组", "确认程序以什么身份运行", "确认某用户是否属于目标组" ], "exercises": [ { "id": "m7_l1_e1", "type": "operation", "title": "查看当前用户", "hint": "whoami", "success_test": "cmd == 'whoami' and 'sandbox_user' in output", "solution": [ "whoami" ], "success_msg": "你已经会确认当前登录身份了。" }, { "id": "m7_l1_e2", "type": "operation", "title": "查看当前用户组信息", "hint": "id", "success_test": "cmd == 'id' and 'uid=' in output", "solution": [ "id" ], "success_msg": "你已经会查看用户与组信息了。" }, { "id": "m7_l1_e3", "type": "understanding", "question": "为什么运维排障前先确认 whoami 很重要?", "answer": "因为不同身份决定你能看到什么、改什么,以及排障时会不会被权限挡住" } ], "related_commands": [ "whoami", "id", "passwd", "su" ], "classic_view": "教材视角:权限和身份是 Linux 安全边界的基础,能执行不代表应该执行。", "takeaways": [ "学完后应能做到:理解当前身份、用户组和密码变更的意义。", "易错提醒:不知道自己当前权限边界", "迁移场景:确认当前身份", "身份问题常常决定你能看什么、改什么、执行什么。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 whoami、id、passwd、su 做一次独立练习,并尝试自己解释每条输出的含义。" }, { "id": "m7_l2_permissions", "title": "权限控制:chmod / chown / chgrp", "goal": "理解 Linux 权限控制的基本模型和常见修改动作。", "why_it_matters": "很多“无法访问、无法执行、无法写入”本质上都是权限问题。", "concepts": [ "读写执行", "拥有者与属组", "最小权限原则" ], "command": "chmod / chown / chgrp", "examples": [ "chmod 644 file.txt", "chmod +x run.sh", "chown app:app app.log", "chmod 644 app.conf", "chown app:app /var/log/app.log", "chgrp deploy script.sh" ], "pitfalls": [ "图省事直接给 777", "不了解属组导致协作混乱", "只改 chmod,不看属主属组" ], "scenarios": [ "修脚本执行权限", "调整日志文件归属", "修复配置文件权限", "调整日志文件归属方便服务写入" ], "exercises": [ { "id": "m7_l2_e1", "type": "operation", "title": "给文件添加执行权限", "hint": "chmod +x /tmp/a/b/c/readme.txt", "success_test": "cmd == 'chmod +x /tmp/a/b/c/readme.txt'", "solution": [ "chmod +x /tmp/a/b/c/readme.txt" ], "success_msg": "你已经会做最基础的执行权限修改。" }, { "id": "m7_l2_e2", "type": "understanding", "question": "为什么生产环境里不应该随手给 777?", "answer": "因为 777 让所有人都有读写执行权限,风险过高,容易造成安全和误操作问题" }, { "id": "m7_l2_e3", "type": "scenario", "question": "脚本提示 Permission 
denied 时,你会先想到什么?", "answer": "先检查文件是否有执行权限,以及当前用户是否有访问权限" } ], "related_commands": [ "chmod", "chown", "chgrp" ], "classic_view": "教材视角:权限和身份是 Linux 安全边界的基础,能执行不代表应该执行。", "takeaways": [ "学完后应能做到:理解 Linux 权限控制的基本模型和常见修改动作。", "易错提醒:图省事直接给 777", "迁移场景:修脚本执行权限", "权限排障常常不是只看一个数字,而是同时看权限、属主、属组和执行身份。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 chmod、chown、chgrp 做一次独立练习,并尝试自己解释每条输出的含义。" }, { "id": "m7_l3_risk", "title": "高风险命令与最小权限原则", "goal": "建立运维中“能做”不等于“该做”的安全意识。", "why_it_matters": "很多事故不是因为不会,而是因为过度权限和冒险操作。", "concepts": [ "sudo 的边界", "危险删除", "最小权限" ], "command": "sudo / rm -rf / 安全习惯", "examples": [ "sudo systemctl restart nginx", "rm -rf /tmp/testdir", "rm -rf /tmp/old-release" ], "pitfalls": [ "把 sudo 当默认前缀", "不确认路径就执行递归删除", "高权限操作前不做确认和备份" ], "scenarios": [ "高权限改系统配置", "清理目录前先确认路径", "修改系统级配置前先评估影响", "清理目录前先校验路径" ], "exercises": [ { "id": "m7_l3_e1", "type": "understanding", "question": "为什么最小权限原则在运维里很重要?", "answer": "因为权限越大,误操作和被利用的风险越高,应只给完成任务所需的最小权限" }, { "id": "m7_l3_e2", "type": "scenario", "question": "执行 rm -rf 之前最应该确认什么?", "answer": "确认目标路径是否正确,以及是否真的需要递归强制删除" }, { "id": "m7_l3_e3", "type": "understanding", "question": "为什么不应该把 sudo 当成“万能解决方案”?", "answer": "因为它绕过权限边界,容易掩盖根因并扩大误操作风险" } ], "related_commands": [ "sudo", "rm -rf", "安全习惯" ], "classic_view": "教材视角:权限和身份是 Linux 安全边界的基础,能执行不代表应该执行。", "takeaways": [ "学完后应能做到:建立运维中“能做”不等于“该做”的安全意识。", "易错提醒:把 sudo 当默认前缀", "迁移场景:高权限改系统配置", "高风险命令需要形成“先确认、后执行、再验证”的习惯。" ], "after_class": "课后建议:只在可随时丢弃的模拟或测试环境里练习,不要在生产环境尝试;围绕 sudo 和 rm -rf 做一次独立练习,执行前先确认当前身份与目标路径,并尝试自己解释每条输出的含义。" } ] }, { "id": "module_8_packages_env", "title": "模块 8:软件包、环境与命令定位", "summary": "理解命令从哪里来、环境变量如何影响执行、软件包如何管理。", "lessons": [ { "id": "m8_l1_path_env", "title": "环境变量与命令定位:env / export / which / whereis", "goal": "理解 PATH、环境变量和命令查找机制。", "why_it_matters": "很多“命令找不到”“版本不对”“环境不生效”都和环境变量有关。", "concepts": [ "PATH", "环境变量", "命令来源" ], "command": "env / export / which / whereis", "examples": [ "env", "export APP_ENV=prod", "which python3", "whereis nginx" ], "pitfalls": [ "以为命令名唯一对应一个位置",
"不知道 PATH 顺序会影响执行结果" ], "scenarios": [ "排查命令找不到", "排查执行到错误版本" ], "exercises": [ { "id": "m8_l1_e1", "type": "operation", "title": "查看环境变量", "hint": "env", "success_test": "cmd == 'env' and 'PATH=' in output", "solution": [ "env" ], "success_msg": "你已经会查看环境变量了。" }, { "id": "m8_l1_e2", "type": "operation", "title": "定位 ls 命令", "hint": "which ls", "success_test": "cmd == 'which ls' and '/bin/ls' in output", "solution": [ "which ls" ], "success_msg": "你已经会查命令来源了。" }, { "id": "m8_l1_e3", "type": "understanding", "question": "为什么 PATH 顺序会影响命令执行结果?", "answer": "因为系统会按 PATH 的顺序查找同名命令,先找到哪个就执行哪个" } ], "related_commands": [ "env", "export", "which", "whereis" ], "classic_view": "教材视角:很多“环境问题”本质是命令来源、变量配置和包版本问题,不是应用本身坏了。", "takeaways": [ "学完后应能做到:理解 PATH、环境变量和命令查找机制。", "易错提醒:以为命令名唯一对应一个位置", "迁移场景:排查命令找不到" ], "after_class": "课后建议:回到真实或模拟环境里,再用 env、export、which、whereis 做一次独立练习,并尝试自己解释每条输出的含义。" }, { "id": "m8_l2_package_mgr", "title": "包管理基础:apt / yum / dpkg / rpm", "goal": "理解 Linux 软件安装与查询的基本方式。", "why_it_matters": "软件装没装、版本对不对,是环境排障的重要基础。", "concepts": [ "Debian 与 RedHat 系包管理差异", "包查询", "版本核对" ], "command": "apt / yum / dpkg / rpm", "examples": [ "apt list --installed", "yum list installed", "rpm -qa | grep nginx" ], "pitfalls": [ "不知道发行版不同,命令体系也不同", "只会装包,不会查版本" ], "scenarios": [ "确认软件已安装", "核对线上版本" ], "exercises": [ { "id": "m8_l2_e1", "type": "understanding", "question": "为什么 apt 和 yum 不能混着理解?", "answer": "因为它们属于不同发行版的包管理体系,命令、仓库和包格式都有差异" }, { "id": "m8_l2_e2", "type": "scenario", "question": "排查“命令不存在”时,除了 which 还会想到什么?", "answer": "还要确认对应软件包是否已安装,必要时用包管理工具查询" }, { "id": "m8_l2_e3", "type": "understanding", "question": "为什么确认软件版本在运维里很重要?", "answer": "因为不同版本的配置、行为和兼容性可能不同,排障和发布都依赖版本信息" } ], "related_commands": [ "apt", "yum", "dpkg", "rpm" ], "classic_view": "教材视角:很多“环境问题”本质是命令来源、变量配置和包版本问题,不是应用本身坏了。", "takeaways": [ "学完后应能做到:理解 Linux 软件安装与查询的基本方式。", "易错提醒:不知道发行版不同,命令体系也不同", "迁移场景:确认软件已安装" ], "after_class": "课后建议:回到真实或模拟环境里,再用 apt、yum、dpkg、rpm 做一次独立练习,并尝试自己解释每条输出的含义。" }, { 
"id": "m8_l3_alias_habit", "title": "alias 与命令行习惯", "goal": "建立更高效、更安全的日常命令行习惯。", "why_it_matters": "很多效率差异来自长期习惯,而不是单个命令是否会敲。", "concepts": [ "alias", "常用缩写", "习惯的收益与风险" ], "command": "alias", "examples": [ "alias ll='ls -l'", "alias gs='git status'" ], "pitfalls": [ "别名过多反而混乱", "依赖个人别名导致跨机器不一致" ], "scenarios": [ "提高常用命令效率", "统一个人命令习惯" ], "exercises": [ { "id": "m8_l3_e1", "type": "understanding", "question": "为什么 alias 既能提升效率,也可能带来问题?", "answer": "它能简化命令,但如果过度依赖,换环境或和他人协作时可能造成理解和一致性问题" }, { "id": "m8_l3_e2", "type": "understanding", "question": "为什么运维平台环境中不建议胡乱定义复杂 alias?", "answer": "因为可能影响命令可预期性,增加排障和协作成本" }, { "id": "m8_l3_e3", "type": "scenario", "question": "什么时候 alias 是好的,什么时候需要克制?", "answer": "高频、简单、个人明确的命令可以用 alias;涉及生产、协作和高风险操作应尽量保持显式命令" } ], "related_commands": [ "alias" ], "classic_view": "教材视角:很多“环境问题”本质是命令来源、变量配置和包版本问题,不是应用本身坏了。", "takeaways": [ "学完后应能做到:建立更高效、更安全的日常命令行习惯。", "易错提醒:别名过多反而混乱", "迁移场景:提高常用命令效率" ], "after_class": "课后建议:回到真实或模拟环境里,再用 alias 做一次独立练习,并尝试自己解释每条输出的含义。" } ] }, { "id": "module_9_automation", "title": "模块 9:自动化、归档与运维习惯", "summary": "建立重定向、管道、定时任务、归档备份和命令复盘习惯。", "lessons": [ { "id": "m9_l1_pipe_redirect", "title": "组合能力:管道与重定向", "goal": "理解为什么 Linux 强调小命令组合,而不是一个命令包办一切。", "why_it_matters": "运维效率往往来自命令组合,而不是单个命令本身。", "concepts": [ "标准输入输出", "重定向", "管道组合" ], "command": "| / > / >>", "examples": [ "grep error /var/log/syslog | wc -l", "echo hello > note.txt", "cat file >> backup.txt" ], "pitfalls": [ "覆盖写和追加写不分", "不会把命令组合成链路" ], "scenarios": [ "统计错误行数", "生成结果文件" ], "exercises": [ { "id": "m9_l1_e1", "type": "understanding", "question": "为什么管道是 Unix/Linux 的核心思想之一?", "answer": "因为它让小工具可以彼此组合,快速拼出解决问题的命令链路" }, { "id": "m9_l1_e2", "type": "scenario", "question": "如果想统计日志里 error 出现了多少次,为什么 grep 配合 wc 很自然?", "answer": "因为 grep 负责筛选,wc 负责统计,两者分工清晰又容易组合" }, { "id": "m9_l1_e3", "type": "understanding", "question": "> 和 >> 的区别是什么?", "answer": "> 是覆盖写入,>> 是追加写入" } ], "related_commands": [ "|", ">", ">>" ], "classic_view": 
"教材视角:自动化不是炫技,而是把重复工作做成稳定、可复用、可回溯的流程。", "takeaways": [ "学完后应能做到:理解为什么 Linux 强调小命令组合,而不是一个命令包办一切。", "易错提醒:覆盖写和追加写不分", "迁移场景:统计错误行数" ], "after_class": "课后建议:回到真实或模拟环境里,再用 |、>、>> 做一次独立练习,并尝试自己解释每条输出的含义。" }, { "id": "m9_l2_tar_backup", "title": "归档与备份:tar / gzip", "goal": "理解打包压缩和备份的基本思路。", "why_it_matters": "备份不是把文件复制一下,而是要考虑归档、压缩和恢复。", "concepts": [ "归档 vs 压缩", "备份与恢复", "打包文件" ], "command": "tar / gzip", "examples": [ "tar -czf backup.tar.gz /etc", "tar -xzf backup.tar.gz -C /tmp" ], "pitfalls": [ "只会备份,不会恢复验证", "不知道 tar 和 gzip 各自扮演什么角色" ], "scenarios": [ "备份配置目录", "迁移文件集合" ], "exercises": [ { "id": "m9_l2_e1", "type": "understanding", "question": "为什么备份后最好做一次恢复验证?", "answer": "因为只有验证过能恢复,备份才真正有意义" }, { "id": "m9_l2_e2", "type": "scenario", "question": "为什么很多运维备份会用 tar.gz?", "answer": "因为它适合把多个文件归档后再压缩,便于传输和保存" }, { "id": "m9_l2_e3", "type": "understanding", "question": "tar 和 gzip 的角色区别是什么?", "answer": "tar 负责打包归档,gzip 负责压缩" } ], "related_commands": [ "tar", "gzip" ], "classic_view": "教材视角:自动化不是炫技,而是把重复工作做成稳定、可复用、可回溯的流程。", "takeaways": [ "学完后应能做到:理解打包压缩和备份的基本思路。", "易错提醒:只会备份,不会恢复验证", "迁移场景:备份配置目录" ], "after_class": "课后建议:回到真实或模拟环境里,再用 tar、gzip 做一次独立练习,并尝试自己解释每条输出的含义。" }, { "id": "m9_l3_crontab_history", "title": "定时任务与操作复盘:crontab / history", "goal": "理解自动化执行与命令历史复盘的价值。", "why_it_matters": "运维很多工作是周期性的,同时排障也离不开复盘。", "concepts": [ "周期任务", "命令历史", "自动化意识" ], "command": "crontab / history", "examples": [ "crontab -l", "history -n 10" ], "pitfalls": [ "写了定时任务却不记录输出", "不会利用 history 回顾近期操作" ], "scenarios": [ "定时备份", "回顾误操作" ], "exercises": [ { "id": "m9_l3_e1", "type": "understanding", "question": "为什么定时任务不只要能跑,还要关注日志和输出?", "answer": "因为无人值守任务如果失败却没有输出记录,很难排查问题" }, { "id": "m9_l3_e2", "type": "scenario", "question": "复盘线上事故时,history 能提供什么帮助?", "answer": "帮助确认最近执行过哪些命令,判断是否有变更触发了问题" }, { "id": "m9_l3_e3", "type": "understanding", "question": "为什么自动化不是“偷懒”,而是运维能力的一部分?", "answer": "因为自动化能减少重复劳动、降低人为失误并提升稳定性" } ], "related_commands": [ "crontab", "history" ], "classic_view": 
"教材视角:自动化不是炫技,而是把重复工作做成稳定、可复用、可回溯的流程。", "takeaways": [ "学完后应能做到:理解自动化执行与命令历史复盘的价值。", "易错提醒:写了定时任务却不记录输出", "迁移场景:定时备份" ], "after_class": "课后建议:回到真实或模拟环境里,再用 crontab、history 做一次独立练习,并尝试自己解释每条输出的含义。" } ] }, { "id": "module_10_incidents", "title": "模块 10:运维综合实战场景", "summary": "把前面所有命令和认知串起来,围绕真实故障场景形成排查链路。", "lessons": [ { "id": "m10_l1_service_down", "title": "场景:服务无法访问排查", "goal": "建立“先服务、再端口、再日志、再请求”的排查顺序。", "why_it_matters": "这是最经典的运维问题之一。", "concepts": [ "服务状态", "端口监听", "日志定位", "HTTP 验证" ], "command": "systemctl / ps / ss / journalctl / curl", "examples": [ "systemctl status nginx", "ss -ltnp", "curl http://127.0.0.1:8080/health", "systemctl status nginx && ss -ltnp | grep 80", "journalctl -u nginx -n 50", "curl -I http://127.0.0.1" ], "pitfalls": [ "只看浏览器打不开,不看服务状态", "没有层次地乱查", "没有层次感地同时改服务、改配置、重启,导致问题更难定位" ], "scenarios": [ "应用服务无法访问", "线上服务返回 502/504", "站点页面打不开但机器正常" ], "exercises": [ { "id": "m10_l1_e1", "type": "scenario", "question": "遇到“网站打不开”,一个合理的排查顺序是什么?", "answer": "先看服务状态,再看进程和端口,再看日志,最后用 curl 验证接口" }, { "id": "m10_l1_e2", "type": "understanding", "question": "为什么不应该一上来就改配置?", "answer": "因为先确认问题在哪一层更重要,盲改配置可能扩大问题" }, { "id": "m10_l1_e3", "type": "scenario", "question": "如果端口没监听,你下一步更应该看什么?", "answer": "看服务状态和日志,确认是否启动失败或启动后立即退出" }, { "id": "m10_l1_service_down_op1", "type": "operation", "title": "第一步:确认服务状态", "hint": "systemctl status nginx", "success_test": "cmd == 'systemctl status nginx'", "solution": [ "systemctl status nginx" ], "success_msg": "✅ 通过:继续下一步" }, { "id": "m10_l1_service_down_op2", "type": "operation", "title": "第二步:确认端口监听", "hint": "ss -ltnp | grep 80", "success_test": "'80' in output", "solution": [ "ss -ltnp | grep 80" ], "success_msg": "✅ 通过:继续下一步" }, { "id": "m10_l1_service_down_op3", "type": "operation", "title": "第三步:看最近日志", "hint": "journalctl -u nginx -n 50", "success_test": "'Started' in output or 'connect() failed' in output", "solution": [ "journalctl -u nginx -n 50" ], "success_msg": "✅ 通过:继续下一步" }, { "id": 
"m10_l1_service_down_op4", "type": "operation", "title": "第四步:本机请求验证", "hint": "curl -I http://127.0.0.1", "success_test": "'hello' in output or '200' in output or 'html' in output", "solution": [ "curl -I http://127.0.0.1" ], "success_msg": "✅ 通过:继续下一步" } ], "related_commands": [ "systemctl", "ps", "ss", "journalctl", "curl" ], "classic_view": "教材视角:综合场景训练的重点不是记住某条命令,而是建立分层排障顺序和判断习惯。", "takeaways": [ "学完后应能做到:建立“先服务、再端口、再日志、再请求”的排查顺序。", "易错提醒:只看浏览器打不开,不看服务状态", "迁移场景:应用服务无法访问", "服务不可用时,排障要按层进行:服务 → 进程 → 端口 → 日志 → 请求。", "形成分层排障顺序,而不是遇到问题就随手试命令。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 systemctl、ps、ss、journalctl、curl 做一次独立练习,并尝试自己解释每条输出的含义。", "troubleshooting_flow": [ "服务状态:systemctl status", "进程状态:ps / 进程是否存在", "端口监听:ss 或 netstat", "日志定位:journalctl / 应用日志", "请求验证:curl 直接打本机或接口" ] }, { "id": "m10_l2_disk_full", "title": "场景:磁盘爆满排查", "goal": "建立从 df 到 du 再到 find 的磁盘问题定位思路。", "why_it_matters": "磁盘满会直接导致服务报错、写入失败和日志异常。", "concepts": [ "文件系统空间", "目录占用", "大文件定位" ], "command": "df / du / find / sort", "examples": [ "df -h", "du -sh /var/log", "find /var/log -type f", "df -h /var", "du -sh /var/log/* | sort" ], "pitfalls": [ "只看 df 不继续追目录", "删文件前不确认用途", "直接删除不熟悉的大文件,可能破坏恢复和排障线索" ], "scenarios": [ "排查磁盘 100%", "日志目录暴涨导致磁盘满", "发布产物堆积导致空间不足" ], "exercises": [ { "id": "m10_l2_e1", "type": "scenario", "question": "磁盘爆满时,为什么通常先 df 再 du?", "answer": "因为先确认哪个文件系统满了,再定位具体哪个目录占用大" }, { "id": "m10_l2_e2", "type": "understanding", "question": "为什么删除日志前要先确认是否还能用于排障?", "answer": "因为日志可能是定位故障的关键证据,盲删会丢失排障线索" }, { "id": "m10_l2_e3", "type": "scenario", "question": "如果 /var/log 特别大,你会想到哪些命令组合?", "answer": "df、du、find、sort 组合起来定位大文件和大目录" }, { "id": "m10_l2_disk_full_op1", "type": "operation", "title": "第一步:确认哪个挂载点满了", "hint": "df -h", "success_test": "'Filesystem' in output", "solution": [ "df -h" ], "success_msg": "✅ 通过:继续下一步" }, { "id": "m10_l2_disk_full_op2", "type": "operation", "title": "第二步:定位大目录(示例:/var/log)", "hint": "du -sh /var/log", "success_test": "'/var/log' in output", 
"solution": [ "du -sh /var/log" ], "success_msg": "✅ 通过:继续下一步" }, { "id": "m10_l2_disk_full_op3", "type": "operation", "title": "第三步:找日志相关文件(示例)", "hint": "find /var/log -type f", "success_test": "'/var/log' in output", "solution": [ "find /var/log -type f" ], "success_msg": "✅ 通过:继续下一步" } ], "related_commands": [ "df", "du", "find", "sort" ], "classic_view": "教材视角:综合场景训练的重点不是记住某条命令,而是建立分层排障顺序和判断习惯。", "takeaways": [ "学完后应能做到:建立从 df 到 du 再到 find 的磁盘问题定位思路。", "易错提醒:只看 df 不继续追目录", "迁移场景:排查磁盘 100%", "磁盘排查的关键是先找文件系统,再找目录,再找大文件。", "形成分层排障顺序,而不是遇到问题就随手试命令。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 df、du、find、sort 做一次独立练习,并尝试自己解释每条输出的含义。", "troubleshooting_flow": [ "文件系统层:df -h", "目录层:du -sh", "文件层:find + sort", "处理层:确认是否可删、是否要备份、是否影响排障" ] }, { "id": "m10_l3_login_fail", "title": "场景:登录失败与权限异常排查", "goal": "把身份、权限、日志三者串起来理解。", "why_it_matters": "很多系统问题最终都落在权限与身份边界上。", "concepts": [ "当前身份", "认证日志", "权限边界" ], "command": "whoami / id / passwd / grep / tail", "examples": [ "whoami", "id", "tail -n 20 /var/log/auth.log", "grep sandbox_user /etc/passwd", "id sandbox_user" ], "pitfalls": [ "只怀疑密码错误,不看日志", "忽略组权限问题", "只盯着密码,不看账号状态和权限配置" ], "scenarios": [ "SSH 登录失败", "执行权限不足", "脚本执行提示无权限" ], "exercises": [ { "id": "m10_l3_e1", "type": "scenario", "question": "排查登录失败时,除了用户名密码,还要想到什么?", "answer": "要看认证日志、用户身份、组信息和权限配置" }, { "id": "m10_l3_e2", "type": "understanding", "question": "为什么权限异常常常不能只靠肉眼猜?", "answer": "因为真实问题可能同时涉及用户、组、文件权限和服务身份,需要结合命令验证" }, { "id": "m10_l3_e3", "type": "scenario", "question": "如果脚本明明存在却执行不了,你会从哪几类信息开始看?", "answer": "先看 whoami/id,再看文件权限和属主属组,必要时看相关日志" }, { "id": "m10_l3_login_fail_op1", "type": "operation", "title": "第一步:确认当前身份", "hint": "id", "success_test": "'uid=' in output", "solution": [ "id" ], "success_msg": "✅ 通过:继续下一步" }, { "id": "m10_l3_login_fail_op2", "type": "operation", "title": "第二步:查看认证日志尾部", "hint": "cat /var/log/auth.log | tail -n 1", "success_test": "'sshd' in output", "solution": [ "cat /var/log/auth.log | tail -n 1" ], "success_msg": "✅ 
通过:继续下一步" }, { "id": "m10_l3_login_fail_op3", "type": "operation", "title": "第三步:确认用户是否存在", "hint": "grep sandbox_user /etc/passwd", "success_test": "'sandbox_user' in output", "solution": [ "grep sandbox_user /etc/passwd" ], "success_msg": "✅ 通过:继续下一步" } ], "related_commands": [ "whoami", "id", "passwd", "grep", "tail" ], "classic_view": "教材视角:综合场景训练的重点不是记住某条命令,而是建立分层排障顺序和判断习惯。", "takeaways": [ "学完后应能做到:把身份、权限、日志三者串起来理解。", "易错提醒:只怀疑密码错误,不看日志", "迁移场景:SSH 登录失败", "登录失败排查要把身份、日志和权限一起看,不能只猜密码。", "形成分层排障顺序,而不是遇到问题就随手试命令。" ], "after_class": "课后建议:回到真实或模拟环境里,再用 whoami、id、passwd、grep、tail 做一次独立练习,并尝试自己解释每条输出的含义。", "troubleshooting_flow": [ "身份层:whoami / id / 当前用户是谁", "账户层:账号是否存在、是否被限制", "权限层:文件和脚本权限是否正确", "日志层:auth.log / 相关认证日志", "不要只盯着“密码错了”一个方向" ] } ] } ] }