
Job 是 Nomad 中最核心的配置单元,它是对工作负载的声明式规范。一个 Job 文件定义了:
- 应用程序要在哪里运行(数据中心、区域)
- 如何调度(服务类型、批处理任务等)
- 需要运行多少实例
- 资源需求(CPU、内存等)
- 更新策略、重启策略等

层次结构:
Job → Task Group → Task
- Job:顶层配置,包含一个或多个 Task Group
- Task Group:一组需要在同一节点上运行的 Task
- Task:具体的工作单元(如 Docker 容器、进程等)
Nomad 支持以下 Job 类型(通过 `type` 参数指定):
| 类型 | 说明 | 使用场景 |
|---|---|---|
| service | 长期运行的服务(默认) | Web 服务、API、数据库等 |
| batch | 短期批处理任务 | 数据处理、定时任务 |
| system | 系统级任务,在所有客户端运行 | 监控代理、日志收集器 |
| sysbatch | 系统级批处理任务 | 系统维护脚本 |
Job 配置文件使用 HCL(HashiCorp Configuration Language)格式,扩展名为 `.nomad` 或 `.hcl`。
job "job-name" {
# Job 级别配置
datacenters = ["dc1"]
type = "service"
# Task Group
group "group-name" {
count = 1
# Task
task "task-name" {
driver = "docker"
config {
# 驱动特定配置
}
resources {
# 资源需求
}
}
}
}
Job 块是顶层配置,定义作业的全局属性。
job "example" {
# 区域配置(可选,默认为 "global")
region = "global"
# 数据中心列表(Nomad 1.5 之前必需;1.5 起可选,默认为 ["*"])
datacenters = ["dc1", "dc2"]
# Job 类型(可选,默认为 "service")
type = "service"
# 优先级(可选,默认为 50,范围 1-100)
priority = 50
# 命名空间(可选,默认为 "default")
namespace = "default"
}
限制 Job 可以运行的节点:
job "example" {
# 只在 Linux 节点运行
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
# 只在有特定标签的节点运行
constraint {
attribute = "${node.class}"
value = "compute"
}
}
控制滚动更新行为:
job "example" {
update {
# 并行更新数量
max_parallel = 1
# 最小健康时间
min_healthy_time = "10s"
# 健康检查超时
healthy_deadline = "3m"
# 部署进度超时
progress_deadline = "10m"
# 部署失败时是否自动回滚
auto_revert = false
# 金丝雀部署实例数
canary = 0
}
}
控制节点排空时的任务迁移:
job "example" {
migrate {
# 并行迁移数量
max_parallel = 1
# 健康检查机制:"checks" 或 "task_states"
health_check = "checks"
# 最小健康时间
min_healthy_time = "10s"
# 健康检查超时
healthy_deadline = "5m"
}
}
Task Group 定义一组需要在同一节点运行的任务。
group "web-group" {
# 实例数量
count = 3
# 网络配置
network {
port "http" {
to = 8080
}
port "admin" {
static = 9090
}
}
# 临时磁盘
ephemeral_disk {
size = 300 # MB
}
# 重启策略
restart {
attempts = 2
interval = "30m"
delay = "15s"
mode = "fail"
}
}
network {
# 网络模式:"bridge"、"host" 或默认
mode = "bridge"
# 动态端口
port "http" {
to = 8080 # 容器内端口
}
# 静态端口
port "admin" {
static = 9090 # 主机端口
to = 9090 # 容器内端口
}
}
注册服务到 Nomad 或 Consul:
service {
name = "my-service"
tags = ["api", "v1"]
port = "http"
provider = "nomad" # 或 "consul"
# 健康检查
check {
name = "health"
type = "http"
path = "/health"
interval = "10s"
timeout = "2s"
}
}
restart {
# 时间间隔内的重启次数
attempts = 2
# 时间间隔
interval = "30m"
# 重启延迟
delay = "15s"
# 模式:"delay"(延迟)或 "fail"(失败)
mode = "fail"
}
Task 定义具体的工作单元。
task "web" {
# 驱动类型:"docker"、"exec"、"raw_exec" 等
driver = "docker"
# 驱动配置
config {
image = "nginx:latest"
ports = ["http"]
}
# 资源需求
resources {
cpu = 500 # MHz
memory = 256 # MB
}
# 环境变量
env {
APP_ENV = "production"
}
}
根据不同驱动类型,配置内容不同:
Docker 驱动:
config {
image = "redis:7"
ports = ["db"]
# 认证失败时尝试公共仓库
auth_soft_fail = true
# 挂载卷
volumes = [
"/host/path:/container/path"
]
# 命令和参数
command = "/bin/sh"
args = ["-c", "echo hello"]
}
Exec 驱动:
config {
command = "/usr/bin/python"
args = ["app.py"]
}
Raw Exec 驱动:
config {
command = "/bin/sleep"
args = ["3600"]
}
resources {
# CPU(MHz)
cpu = 500
# 内存(MB)
memory = 256
# GPU(可选)
device "nvidia/gpu" {
count = 1
}
}
在任务启动前下载文件:
artifact {
source = "https://example.com/config.tar.gz"
destination = "local/"
options {
checksum = "md5:c4aa853ad2215426eb7d70a21922e794"
}
}
动态生成配置文件:
template {
data = <<EOF
SERVER_NAME={{ env "NOMAD_ALLOC_ID" }}
DATABASE_URL={{ key "service/database/url" }}
EOF
destination = "local/config.env"
env = true
}
logs {
max_files = 10
max_file_size = 15 # MB
}
从 Vault 获取密钥:
vault {
policies = ["app-policy"]
change_mode = "restart"
change_signal = "SIGHUP" # 仅在 change_mode = "signal" 时生效
}
identity {
env = true # 暴露为环境变量
file = true # 写入文件
}
表达节点偏好:
affinity {
attribute = "${node.datacenter}"
value = "us-west1"
weight = 100 # -100 到 100
}
跨节点分散部署:
spread {
attribute = "${node.datacenter}"
target "dc1" {
percent = 60
}
target "dc2" {
percent = 40
}
}
创建一个 `.nomad` 或 `.hcl` 文件,例如 `web-app.nomad`。
nomad job validate web-app.nomad
查看部署前的执行计划:
nomad job plan web-app.nomad
nomad job run web-app.nomad
# 查看 Job 状态
nomad job status web-app
# 查看 Job 列表
nomad job status
# 查看详细信息
nomad job inspect web-app
# 查看分配(Allocation)
nomad alloc status <alloc-id>
# 查看日志
nomad alloc logs <alloc-id>
# 实时查看日志
nomad alloc logs -f <alloc-id>
修改配置文件后重新运行:
nomad job run web-app.nomad
# 停止 Job
nomad job stop web-app
# 清除 Job(停止并从系统移除)
nomad job stop -purge web-app
job "web-service" {
datacenters = ["dc1"]
type = "service"
group "web" {
count = 3
network {
port "http" {
to = 8080
}
}
service {
name = "web-api"
port = "http"
provider = "nomad"
check {
type = "http"
path = "/health"
interval = "10s"
timeout = "2s"
}
}
task "nginx" {
driver = "docker"
config {
image = "nginx:latest"
ports = ["http"]
}
resources {
cpu = 500
memory = 256
}
}
}
}
job "data-processor" {
datacenters = ["dc1"]
type = "batch"
group "processor" {
count = 1
task "process" {
driver = "docker"
config {
image = "python:3.9"
command = "python"
args = ["/local/process.py"]
}
artifact {
source = "https://example.com/process.py"
destination = "local/"
}
resources {
cpu = 1000
memory = 512
}
}
}
}
job "config-app" {
datacenters = ["dc1"]
type = "service"
group "app" {
count = 2
# 声明 "http" 端口,供下方 ports = ["http"] 和 NOMAD_IP_http 引用
network {
port "http" {
to = 8080
}
}
task "server" {
driver = "docker"
config {
image = "myapp:latest"
ports = ["http"]
}
env {
ENVIRONMENT = "production"
LOG_LEVEL = "info"
}
template {
data = <<EOF
DATABASE_HOST={{ env "NOMAD_IP_http" }}
DATABASE_PORT=5432
API_KEY={{ key "service/api-key" }}
EOF
destination = "secrets/app.env"
env = true
}
resources {
cpu = 500
memory = 256
}
}
}
}
job "multi-task" {
datacenters = ["dc1"]
group "services" {
count = 1
network {
port "web" {
to = 8080
}
port "api" {
to = 9090
}
}
task "web-server" {
driver = "docker"
config {
image = "nginx:latest"
ports = ["web"]
}
resources {
cpu = 300
memory = 128
}
}
task "api-server" {
driver = "docker"
config {
image = "api:latest"
ports = ["api"]
}
resources {
cpu = 500
memory = 256
}
}
}
}
job "database" {
datacenters = ["dc1"]
type = "service"
group "db" {
count = 1
volume "db-data" {
type = "host"
read_only = false
source = "mysql-data"
}
network {
port "mysql" {
to = 3306
}
}
task "mysql" {
driver = "docker"
config {
image = "mysql:8.0"
ports = ["mysql"]
}
volume_mount {
volume = "db-data"
destination = "/var/lib/mysql"
read_only = false
}
env {
MYSQL_ROOT_PASSWORD = "secret"
}
resources {
cpu = 1000
memory = 1024
}
}
}
}
# 运行 Job
nomad job run <job-file>
# 停止 Job
nomad job stop <job-name>
# 停止并清除 Job
nomad job stop -purge <job-name>
# 查看 Job 状态
nomad job status <job-name>
# 查看所有 Job
nomad job status
# 查看 Job 详细信息
nomad job inspect <job-name>
# 验证 Job 配置
nomad job validate <job-file>
# 查看执行计划
nomad job plan <job-file>
# 查看 Job 历史版本
nomad job history <job-name>
# 回滚到指定版本
nomad job revert <job-name> <version>
# 查看某个 Job 的分配列表
nomad job allocs <job-name>
# 查看特定分配
nomad alloc status <alloc-id>
# 查看日志
nomad alloc logs <alloc-id>
# 实时查看日志
nomad alloc logs -f <alloc-id>
# 查看特定任务日志
nomad alloc logs <alloc-id> <task-name>
# 进入容器
nomad alloc exec -task <task-name> <alloc-id> /bin/sh
# 重启分配
nomad alloc restart <alloc-id>
# 停止分配
nomad alloc stop <alloc-id>
# 查看节点列表
nomad node status
# 查看节点详情
nomad node status <node-id>
# 查看集群状态
nomad server members
# 查看评估(Evaluation)
nomad eval status <eval-id>
# 查看部署状态
nomad deployment status <deployment-id>
# 提升金丝雀部署
nomad deployment promote <deployment-id>
# 将部署手动标记为失败
nomad deployment fail <deployment-id>
# 格式化 Job 文件
nomad fmt <job-file>
# 格式化目录下所有文件
nomad fmt .
Nomad 支持在配置中使用变量插值:
env {
# Nomad 提供的变量
ALLOC_ID = "${NOMAD_ALLOC_ID}"
TASK_NAME = "${NOMAD_TASK_NAME}"
IP_ADDRESS = "${NOMAD_IP_http}"
PORT = "${NOMAD_PORT_http}"
}
constraint {
attribute = "${attr.kernel.name}"
value = "linux"
}
constraint {
attribute = "${attr.cpu.arch}"
value = "amd64"
}
constraint {
attribute = "${meta.team}"
value = "backend"
}
Job 无法调度

- 检查节点资源是否充足
- 检查约束条件是否正确
- 查看评估日志:`nomad eval status <eval-id>`

任务频繁重启

- 查看任务日志:`nomad alloc logs <alloc-id>`
- 检查健康检查配置
- 调整重启策略

端口冲突

- 使用动态端口而非静态端口
- 检查端口映射配置

镜像拉取失败

- 检查镜像名称和标签
- 配置镜像仓库认证
- 启用 `auth_soft_fail`
NOMAD_ALLOC_DIR:分配目录
NOMAD_ALLOC_ID:分配 ID
NOMAD_ALLOC_INDEX:分配索引
NOMAD_ALLOC_NAME:分配名称
NOMAD_DC:数据中心
NOMAD_GROUP_NAME:Task Group 名称
NOMAD_IP_<label>:端口 IP
NOMAD_JOB_ID:Job ID
NOMAD_JOB_NAME:Job 名称
NOMAD_NAMESPACE:命名空间
NOMAD_PORT_<label>:端口号
NOMAD_REGION:区域
NOMAD_TASK_DIR:任务目录
NOMAD_TASK_NAME:任务名称