蓝盾流水线中的 Kubernetes 调度优化

2025-07-30 DevOps k8s, 蓝盾评论

在大量使用蓝盾「Docker公共构建机」来跑构建任务后，我们发现拉起的构建 Pod 通过 hostPath 挂载工作目录做缓存，当同一流水线任务重复执行时能够加速构建。本文介绍蓝盾调度器如何将构建 Pod 调度到有缓存的节点。

1. 背景

在前文蓝盾「Docker公共构建机」缓存清理中我们通过分析源码，知道拉起的构建 Pod 通过 hostPath 挂载工作目录做缓存。我们接下来进一步分析创建 Pod 的流程。

2. 部署配置

dispatch-k8s-manager/resources/config.yaml

dispatch:
  # 调度需要使用到的label，确定构建机唯一性
  label: bkci.dispatch.kubenetes/core
  # 通过k8s watch来观察构建机状态
  watch:
    task:
      label: bkci.dispatch.kubenetes/watch-task
  builder:
    # 将构建机调度到指定标签节点的配置，不填写则在集群内都可以调度，优先级小于专机和特殊机器
    nodeSelector:
      label:
      value:
    # 构建机曾经调度过的节点名称列表
    nodesAnnotation: bkci.dispatch.kubenetes/builder-history-nodes
    # 容器历史资源使用相关
    realResource:
      # 监控构建机容器资源使用的 prometheus api地址， 字段为空则不开启realResource优化
      # 注：集群内为 <service>.<namespace>.svc.cluster.local:<port>
      prometheusUrl: 
      realResourceAnnotation: bkci.dispatch.kubenetes/builder-real-resources
  # 一些具有特定属性的机器，例如独特的网络策略
  specialMachine:
    label: bkci.dispatch.kubenetes/special-builder
  # 只给特定用户使用的专机
  privateMachine:
    label: bkci.dispatch.kubenetes/private-builder

通过 dispatch-k8s-manager 模块的配置文件，我们发现可以通过 nodeSelector、nodesAnnotation、realResource 等配置来设置调度策略。

3. 源码分析

3.1 亲和性和污点容忍

dispatch-k8s-manager/pkg/apiserver/service/builder_start.go

// CreateBuilder assembles the scheduling inputs for a builder and hands the
// resulting Deployment description to task.DoCreateBuilder in a new goroutine.
//
// It returns the task id, or an empty id and the error when
// getBuilderAnnotations fails. This is an abridged excerpt: labels,
// matchlabels, resources, pullImageSecret and the taskId value are set in the
// omitted ("...") sections.
func CreateBuilder(builder *Builder) (taskId string, err error) {

	// Volumes and their mounts are derived from the builder name and its NFS list.
	volumes, volumeMounts := getBuilderVolumeAndMount(builder.Name, builder.NFSs)

	// The Deployment always runs a single replica.
	var replicas int32 = 1

	// Tolerations and node-match rules both come from buildDedicatedBuilder.
	tolers, nodeMatches := buildDedicatedBuilder(builder)
  
  ...

  // Pod annotations are looked up by builder name.
  annotations, err := getBuilderAnnotations(builder.Name)
	if err != nil {
		return "", err
	}

  ...

  // The creation runs in its own goroutine; this function does not wait for it.
  go task.DoCreateBuilder(
		taskId,
		&kubeclient.Deployment{
			Name:        builder.Name,
			Labels:      labels,
			MatchLabels: matchlabels,
			Replicas:    &replicas,
			Pod: kubeclient.Pod{
				Labels:      labels,
				Annotations: annotations,
				Volumes:     volumes,
				Containers: []kubeclient.Container{
					{
						Image:        builder.Image,
						Resources:    *resources,
						Env:          getEnvs(builder.Env),
						Command:      builder.Command,
						VolumeMounts: volumeMounts,
					},
				},
				// Scheduling constraints are passed through to the kubeclient layer.
				NodeMatches:     nodeMatches,
				Tolerations:     tolers,
				PullImageSecret: pullImageSecret,
			},
		},
	)
  
	return taskId, nil
}


// buildDedicatedBuilder returns the tolerations and node-affinity matches for
// the given builder. The three lookups below are elided in this excerpt; when
// none of them returns, the result is (nil, nil).
func buildDedicatedBuilder(builder *Builder) ([]corev1.Toleration, []kubeclient.NodeMatch) {
    // Private (dedicated) machine configuration is read first.
    ...
    // Then machines with special configuration.
    ...
    // Otherwise, if a node selector is configured, use the node selector.
    ...
    return nil, nil
}

// getBuilderAnnotations returns the annotation map for the named builder, or
// an error. The lookups that populate result are elided in this excerpt.
func getBuilderAnnotations(builderName string) (map[string]string, error) {
	...
	// Fetch the node history, used to place the builder on a node it has used before.
  ...
	// Fetch the RealResource records.
	...
	return result, nil
}

dispatch-k8s-manager/pkg/kubeclient/deployment.go

// CreateDeployment creates a Deployment from dep. Only the step that converts
// dep.Pod.NodeMatches into a node affinity is shown; the rest is elided.
func CreateDeployment(dep *Deployment) error {
  ...
  // Convert NodeMatches into a nodeAffinity.
	// affinity stays nil when there are no node matches.
	var affinity *corev1.Affinity
	if len(dep.Pod.NodeMatches) > 0 {
		// Each NodeMatch becomes one NodeSelectorRequirement with the same
		// key, operator and values.
		var matches []corev1.NodeSelectorRequirement
		for _, mat := range dep.Pod.NodeMatches {
			matches = append(matches, corev1.NodeSelectorRequirement{
				Key:      mat.Key,
				Operator: mat.Operator,
				Values:   mat.Values,
			})
		}
		// All requirements go into a single NodeSelectorTerm under the
		// RequiredDuringSchedulingIgnoredDuringExecution node affinity.
		affinity = &corev1.Affinity{
			NodeAffinity: &corev1.NodeAffinity{
				RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
					NodeSelectorTerms: []corev1.NodeSelectorTerm{
						{
							MatchExpressions: matches,
						},
					},
				},
			},
		}
	}
	...
	return nil
}

在 CreateBuilder 里，调度相关的两个核心参数 tolers 和 nodeMatches 都是通过 buildDedicatedBuilder(builder) 返回的，这两个参数会一起传递给 kubeclient 层，在 kubeclient 的 CreateDeployment 方法中：

NodeMatches 会被转换为 affinity.nodeAffinity，用于节点亲和调度。
Tolerations 会直接下发到 Pod 的 spec.tolerations 字段，用于污点容忍。

3.2 历史节点调度

蓝盾源码里我们找到了有关亲和性以及污点容忍的实现，但是有关历史节点调度的实现只有通过 getBuilderAnnotations 给 Pod 设置注解，至于注解如何影响调度，在蓝盾源码里并没有找到相关内容。我们进一步分析发现，历史节点调度需要通过蓝盾基于 K8S 调度框架实现的调度插件来完成。

apiVersion: v1
kind: Pod
metadata:
  annotations:
    bkci.dispatch.kubenetes/builder-history-nodes: '["10.x.x.1","10.x.x.2","10.x.x.3"]'
  labels:
    bkci.dispatch.kubenetes/core: build1753761077695-ivcpmoxg
    bkci.dispatch.kubenetes/watch-task: t-1753785688231121886-iInjpMUr-builder-start
  name: build1753761077695-ivcpmoxg-c9d8fc6c9-mqhkk
  ...

package bkdevopsschedulerplugin

import (
    "context"
    "encoding/json"
    "k8s.io/api/core/v1"
    "k8s.io/kubernetes/pkg/scheduler/framework"
)

// nodesAnnotation is the Pod annotation key under which Score looks for the
// JSON list of history node names.
const nodesAnnotation = "bkci.dispatch.kubenetes/builder-history-nodes"

// readResourceAnnotation is the Pod annotation key under which Score looks for
// the JSON list of real-resource records.
// NOTE(review): the identifier says "read" while the key says "real" — likely a typo; confirm before renaming.
const readResourceAnnotation = "bkci.dispatch.kubenetes/builder-real-resources"

// realResourceUsage is one decoded real-resource record; both fields are kept
// as strings and map to the JSON keys "cpu" and "memory".
type realResourceUsage struct {
    Cpu    string `json:"cpu"`
    Memory string `json:"memory"`
}

// Score rates nodeName for pod as the sum of a history-node score and a
// real-resource score, and returns that sum with a nil status.
//
// Both inputs are read from Pod annotations. The real-resource calculation is
// elided in this excerpt.
func (s *SchedulerPlugin) Score(_ context.Context, _ *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
    // Read the history-node list; decode errors are discarded and not reported.
    var nodeHis []string
    if nodesS, ok := pod.ObjectMeta.Annotations[nodesAnnotation]; ok {
        _ = json.Unmarshal([]byte(nodesS), &nodeHis)
    }

    // Read the real-resource records; decode errors are likewise discarded.
    var realResources []realResourceUsage
    if realS, ok := pod.ObjectMeta.Annotations[readResourceAnnotation]; ok {
        _ = json.Unmarshal([]byte(realS), &realResources)
    }

    // Compute the history-node score.
    nodeScore := calculateNodeHisScore(nodeHis, nodeName)

    // Compute the resource score.
    // ... resource score calculation omitted ...
    realResourceScore := ... // computed from realResources and the node's resource state

    // Return the total score.
    return nodeScore + realResourceScore, nil
}

// nodeHisScores maps a position in the history-node list to the score awarded
// when the candidate node is found at that position.
var nodeHisScores = map[int]int64{0: 30, 1: 20, 2: 10}

// calculateNodeHisScore scores nodeName by where it first appears in nodeHis:
// position 0 earns 30, position 1 earns 20 and position 2 earns 10 (the
// original note describes the list as ordered from most recent to oldest).
// A node that is absent, or found at any other position, gets
// framework.MinNodeScore, as does an empty history.
func calculateNodeHisScore(nodeHis []string, nodeName string) int64 {
	for pos := 0; pos < len(nodeHis); pos++ {
		if nodeHis[pos] == nodeName {
			// Only the first occurrence counts; positions without a table
			// entry fall back to the minimum score.
			if bonus, found := nodeHisScores[pos]; found {
				return bonus
			}
			return framework.MinNodeScore
		}
	}

	return framework.MinNodeScore
}

在插件的 Score 阶段，会读取 Pod 的 bkci.dispatch.kubenetes/builder-history-nodes 注解内容，并将其反序列化为历史节点名称数组，即提供历史节点信息。
插件通过 calculateNodeHisScore 方法，根据当前调度节点是否在历史节点列表中，以及其在列表中的顺序，给予不同的分数（最近的历史节点分数最高）。
该分数会与资源分数（通过 bkci.dispatch.kubenetes/builder-real-resources 注解和节点资源情况计算得出）相加，作为最终调度优先级，影响调度器选择节点的排序。

4. 总结

在蓝盾流水线中，通过以下方式实现了 Kubernetes 的调度优化：

历史节点调度：通过注解记录历史节点信息，调度插件优先选择这些节点，减少初始化时间。
亲和性（Affinity）：根据配置文件中的 nodeSelector 等配置生成 NodeMatches，并在创建 Deployment 时转换为 nodeAffinity，确保 Pod 调度到特定节点。
污点容忍（Tolerations）：仅在配置文件中指定了专机（privateMachine）时，生成污点容忍配置，允许 Pod 调度到带特定污点的节点。

这些机制协同提升了调度效率和资源利用率。

5. 参考

本文链接： https://blazehu.github.io/2025/07/30/devops/landun_dispatch_scheduler/
版权声明： 本博客所有文章除特别声明外，均采用 CC BY 4.0 CN 许可协议。转载请注明出处！

blazehuCloudNative/SRE/DevOps

个人简介。