
A Worked Example of Extending kube-scheduler

Overview

Using Kubernetes v1.30.4 as an example, this article walks through how to extend the stock kube-scheduler by adding a plugin, and how to deploy and verify the result in a running cluster.

Scheduling Scenario

The native Kubernetes scheduler has no plugin that scores nodes by disk space, so a node's disk usage is not taken into account during scheduling. If you want scheduling to consider disk space, in particular the free space of the root filesystem, you need to extend the scheduler with a custom plugin.

Extending the Scheduler

Although Kubernetes provides the Scheduler Framework for extending the scheduler with out-of-tree plugins, the upstream code also exposes the interfaces, and even examples, that show how to add a custom plugin by modifying the in-tree code. Below is a simple example: during the scoring phase, a custom plugin named DiskAvailable queries the Prometheus metrics API for each node's free root-filesystem space and scores the node proportionally to the remaining space, so the more free space, the higher the score.

// pkg/scheduler/framework/plugins/diskAvailable/diskAvailable.go
package diskAvailable

import (
	"context"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"strconv"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
	Name          = "DiskAvailable"  // Plugin name registered with the scheduler framework
	RequiredLabel = "use-disk-score" // Pods must carry this label to opt into disk-based scoring
)

type DiskAvailable struct {
	handle framework.Handle
}

var _ framework.ScorePlugin = &DiskAvailable{}

// New creates a new DiskAvailable plugin instance.
func New(_ context.Context, plArgs runtime.Object, h framework.Handle) (framework.Plugin, error) {
	d := &DiskAvailable{
		handle: h,
	}
	return d, nil
}

// Name returns the plugin name.
func (d *DiskAvailable) Name() string {
	return Name
}

// Score computes this plugin's score for the given node.
func (d *DiskAvailable) Score(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
	// Only pods that carry the opt-in label are scored by this plugin.
	if _, exists := pod.Labels[RequiredLabel]; !exists {
		// Without the label, return a neutral score of 0.
		return 0, framework.NewStatus(framework.Success, "Pod does not have the required label")
	}
	}

	// Query Prometheus for the node's available root-filesystem space (in GB).
	diskSize, err := fetchDiskSizeFromPrometheus(nodeName)
	if err != nil {
		return 0, framework.NewStatus(framework.Error, fmt.Sprintf("Failed to fetch disk size for node %s: %v", nodeName, err))
	}

	// Map the available space onto a [0, 100] score: 250 GB or more of free space yields the maximum score.
	score := int64((diskSize / 250.0) * 100)
	if score > 100 {
		score = 100
	}

	return score, nil
}

// ScoreExtensions returns nil because no score normalization is needed.
func (d *DiskAvailable) ScoreExtensions() framework.ScoreExtensions {
	return nil
}

// fetchDiskSizeFromPrometheus queries Prometheus for the node's available root-filesystem space and returns it in GB.
func fetchDiskSizeFromPrometheus(nodeName string) (float64, error) {
	// Cluster-internal Prometheus endpoint of kube-prometheus-stack; the query reads node_exporter's node_filesystem_avail_bytes for the root mountpoint.
	prometheusQueryURL := fmt.Sprintf("http://kube-prometheus-stack-prometheus.kube-prometheus-stack.svc.cluster.local:9090/api/v1/query?query=node_filesystem_avail_bytes{instance='%s:9100',mountpoint='/'}", nodeName)

	resp, err := http.Get(prometheusQueryURL)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return 0, fmt.Errorf("unexpected status code: %d", resp.StatusCode)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return 0, err
	}

	// Parse Prometheus response
	var result struct {
		Data struct {
			Result []struct {
				Value []interface{} `json:"value"`
			} `json:"result"`
		} `json:"data"`
	}
	if err := json.Unmarshal(body, &result); err != nil {
		return 0, err
	}

	if len(result.Data.Result) == 0 {
		return 0, fmt.Errorf("no data found for node %s", nodeName)
	}

	// Extract disk size (in bytes) from the result
	diskSizeBytes, ok := result.Data.Result[0].Value[1].(string)
	if !ok {
		return 0, fmt.Errorf("invalid data format for node %s", nodeName)
	}

	// Convert to GB
	diskSizeGB := parseDiskSize(diskSizeBytes) / (1024 * 1024 * 1024)
	return diskSizeGB, nil
}

// parseDiskSize converts the Prometheus string value into a float64 number of bytes.
// Parse errors are deliberately ignored and yield 0, which results in a zero score.
func parseDiskSize(sizeStr string) float64 {
	size, _ := strconv.ParseFloat(sizeStr, 64)
	return size
}
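
Before wiring the plugin into the scheduler, it is worth checking that the Prometheus query it issues actually returns data. Below is a minimal check, run from any pod inside the cluster; the service address matches the one hard-coded in fetchDiskSizeFromPrometheus above, and node2 is only an example node name:

curl -s 'http://kube-prometheus-stack-prometheus.kube-prometheus-stack.svc.cluster.local:9090/api/v1/query' \
  --data-urlencode "query=node_filesystem_avail_bytes{instance='node2:9100',mountpoint='/'}"

If this returns an empty result set, node_exporter is either not being scraped or exposes the metric under a different instance label, and the plugin will fail scoring that node with a "no data found" error.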

The plugin registration code also needs to be modified as shown below. Once this change is made, DiskAvailable becomes an in-tree plugin built into kube-scheduler; it still has to be enabled in a scheduler profile, which is done in the KubeSchedulerConfiguration further down.

// pkg/scheduler/framework/plugins/registry.go
/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package plugins

import (
	"k8s.io/apiserver/pkg/util/feature"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultbinder"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/defaultpreemption"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/diskAvailable"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/dynamicresources"
	plfeature "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/imagelocality"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeaffinity"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodename"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeports"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodeunschedulable"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/nodevolumelimits"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/podtopologyspread"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/queuesort"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/schedulinggates"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/tainttoleration"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumerestrictions"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumezone"
	"k8s.io/kubernetes/pkg/scheduler/framework/runtime"
)

// NewInTreeRegistry builds the registry with all the in-tree plugins.
// A scheduler that runs out of tree plugins can register additional plugins
// through the WithFrameworkOutOfTreeRegistry option.
func NewInTreeRegistry() runtime.Registry {
	fts := plfeature.Features{
		EnableDynamicResourceAllocation:              feature.DefaultFeatureGate.Enabled(features.DynamicResourceAllocation),
		EnableVolumeCapacityPriority:                 feature.DefaultFeatureGate.Enabled(features.VolumeCapacityPriority),
		EnableNodeInclusionPolicyInPodTopologySpread: feature.DefaultFeatureGate.Enabled(features.NodeInclusionPolicyInPodTopologySpread),
		EnableMatchLabelKeysInPodTopologySpread:      feature.DefaultFeatureGate.Enabled(features.MatchLabelKeysInPodTopologySpread),
		EnablePodDisruptionConditions:                feature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions),
		EnableInPlacePodVerticalScaling:              feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
		EnableSidecarContainers:                      feature.DefaultFeatureGate.Enabled(features.SidecarContainers),
	}

	registry := runtime.Registry{
		dynamicresources.Name:                runtime.FactoryAdapter(fts, dynamicresources.New),
		imagelocality.Name:                   imagelocality.New,
		tainttoleration.Name:                 tainttoleration.New,
		nodename.Name:                        nodename.New,
		nodeports.Name:                       nodeports.New,
		nodeaffinity.Name:                    nodeaffinity.New,
		podtopologyspread.Name:               runtime.FactoryAdapter(fts, podtopologyspread.New),
		nodeunschedulable.Name:               nodeunschedulable.New,
		noderesources.Name:                   runtime.FactoryAdapter(fts, noderesources.NewFit),
		noderesources.BalancedAllocationName: runtime.FactoryAdapter(fts, noderesources.NewBalancedAllocation),
		volumebinding.Name:                   runtime.FactoryAdapter(fts, volumebinding.New),
		volumerestrictions.Name:              runtime.FactoryAdapter(fts, volumerestrictions.New),
		volumezone.Name:                      volumezone.New,
		nodevolumelimits.CSIName:             runtime.FactoryAdapter(fts, nodevolumelimits.NewCSI),
		nodevolumelimits.EBSName:             runtime.FactoryAdapter(fts, nodevolumelimits.NewEBS),
		nodevolumelimits.GCEPDName:           runtime.FactoryAdapter(fts, nodevolumelimits.NewGCEPD),
		nodevolumelimits.AzureDiskName:       runtime.FactoryAdapter(fts, nodevolumelimits.NewAzureDisk),
		nodevolumelimits.CinderName:          runtime.FactoryAdapter(fts, nodevolumelimits.NewCinder),
		interpodaffinity.Name:                interpodaffinity.New,
		queuesort.Name:                       queuesort.New,
		defaultbinder.Name:                   defaultbinder.New,
		defaultpreemption.Name:               runtime.FactoryAdapter(fts, defaultpreemption.New),
		schedulinggates.Name:                 schedulinggates.New,
		diskAvailable.Name:                   diskAvailable.New,
	}

	return registry
}
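
Before building full release images, a quick way to confirm that the modified tree still compiles is to build only the scheduler binary with the standard Kubernetes build target:

make WHAT=cmd/kube-scheduler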

Deployment and Testing

Build the image. The variables below are specific to my environment; readers with no special requirements can leave them out. When the command finishes, the scheduler image is pushed to my private registry.

make quick-release-images DBG=1 KUBE_DOCKER_REGISTRY=registry.default.svc.cat.dog KUBE_GORUNNER_IMAGE=runzhliu/network-multitool:latest

Finally, use the YAML below to deploy an additional scheduler named my-scheduler and create a Pod that opts into the DiskAvailable plugin, then watch the scheduler logs to confirm the custom plugin is being used.

apiVersion: v1
kind: ServiceAccount
metadata:
  name: my-scheduler
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: my-scheduler-as-kube-scheduler
subjects:
  - kind: ServiceAccount
    name: my-scheduler
    namespace: kube-system
roleRef:
  kind: ClusterRole
  name: system:kube-scheduler
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: my-scheduler-as-volume-scheduler
subjects:
  - kind: ServiceAccount
    name: my-scheduler
    namespace: kube-system
roleRef:
  kind: ClusterRole
  name: system:volume-scheduler
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: my-scheduler-extension-apiserver-authentication-reader
  namespace: kube-system
roleRef:
  kind: Role
  name: extension-apiserver-authentication-reader
  apiGroup: rbac.authorization.k8s.io
subjects:
  - kind: ServiceAccount
    name: my-scheduler
    namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: my-scheduler-leader-election
  namespace: kube-system
rules:
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: my-scheduler-leader-election
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: my-scheduler-leader-election
subjects:
  - kind: ServiceAccount
    name: my-scheduler
    namespace: kube-system
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: my-scheduler-config
  namespace: kube-system
data:
  my-scheduler-config.yaml: |
    apiVersion: kubescheduler.config.k8s.io/v1
    kind: KubeSchedulerConfiguration
    profiles:
      - schedulerName: my-scheduler
        plugins:
          score:
            enabled:
              - name: Runzhliu
              - name: DiskAvailable
    leaderElection:
      leaderElect: false    
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    component: scheduler
    tier: control-plane
  name: my-scheduler
  namespace: kube-system
spec:
  selector:
    matchLabels:
      component: scheduler
      tier: control-plane
  replicas: 1
  template:
    metadata:
      labels:
        component: scheduler
        tier: control-plane
        version: second
    spec:
      serviceAccountName: my-scheduler
      containers:
        - command:
            - /usr/local/bin/kube-scheduler
            - --config=/etc/kubernetes/my-scheduler/my-scheduler-config.yaml
            - --authentication-kubeconfig=/etc/kubernetes/scheduler.conf
            - --authorization-kubeconfig=/etc/kubernetes/scheduler.conf
            - --bind-address=0.0.0.0
            - --kubeconfig=/etc/kubernetes/scheduler.conf
            - --leader-elect-resource-name=my-scheduler
            - --leader-elect=true
            - --v=4
          image: registry.default.svc.cat.dog/kube-scheduler-amd64:v1.30.4
          imagePullPolicy: Always
          livenessProbe:
            httpGet:
              path: /healthz
              port: 10259
              scheme: HTTPS
            initialDelaySeconds: 15
          name: kube-second-scheduler
          readinessProbe:
            httpGet:
              path: /healthz
              port: 10259
              scheme: HTTPS
          resources:
            requests:
              cpu: '0.1'
          securityContext:
            privileged: false
          volumeMounts:
            - name: config-volume
              mountPath: /etc/kubernetes/my-scheduler
            - mountPath: /etc/kubernetes/scheduler.conf
              name: kubeconfig
              readOnly: true
            - mountPath: /etc/kubernetes/scheduler-config.yaml
              name: scheduler-config
              readOnly: true
      hostNetwork: false
      hostPID: false
      volumes:
        - name: config-volume
          configMap:
            name: my-scheduler-config
        - hostPath:
            path: /etc/kubernetes/scheduler.conf
            type: FileOrCreate
          name: kubeconfig
        - hostPath:
            path: /etc/kubernetes/scheduler-config.yaml
            type: FileOrCreate
          name: scheduler-config
---
apiVersion: v1
kind: Pod
metadata:
  name: annotation-second-scheduler
  labels:
    name: multischeduler-example
    use-disk-score: "true"
spec:
  schedulerName: my-scheduler
  containers:
    - name: pod-with-second-annotation-container
      image: runzhliu/network-multitool:latest
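
Apply the manifest and follow the new scheduler's logs (my-scheduler.yaml is just a placeholder name for the manifest above):

kubectl apply -f my-scheduler.yaml
kubectl -n kube-system logs -f deploy/my-scheduler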

In the logs you can see DiskAvailable scoring each node as one of the score plugins:

I0806 14:37:03.800521       1 leaderelection.go:260] successfully acquired lease kube-system/my-scheduler
I0806 14:37:03.800721       1 scheduler.go:451] "watching scheduler config file changes..."
I0806 14:37:03.800986       1 schedule_one.go:84] "About to try and schedule pod" pod="kube-system/annotation-second-scheduler"
I0806 14:37:03.801018       1 schedule_one.go:97] "Attempting to schedule pod" pod="kube-system/annotation-second-scheduler"
I0806 14:37:03.814248       1 schedule_one.go:783] "Plugin scored node for pod" pod="kube-system/annotation-second-scheduler" plugin="TaintToleration" node="node2" score=300
I0806 14:37:03.814341       1 schedule_one.go:783] "Plugin scored node for pod" pod="kube-system/annotation-second-scheduler" plugin="NodeResourcesFit" node="node2" score=56
I0806 14:37:03.814370       1 schedule_one.go:783] "Plugin scored node for pod" pod="kube-system/annotation-second-scheduler" plugin="NodeResourcesBalancedAllocation" node="node2" score=90
I0806 14:37:03.814388       1 schedule_one.go:783] "Plugin scored node for pod" pod="kube-system/annotation-second-scheduler" plugin="ImageLocality" node="node2" score=0
I0806 14:37:03.814408       1 schedule_one.go:783] "Plugin scored node for pod" pod="kube-system/annotation-second-scheduler" plugin="Runzhliu" node="node2" score=0
I0806 14:37:03.814426       1 schedule_one.go:783] "Plugin scored node for pod" pod="kube-system/annotation-second-scheduler" plugin="DiskAvailable" node="node2" score=72
I0806 14:37:03.814444       1 schedule_one.go:783] "Plugin scored node for pod" pod="kube-system/annotation-second-scheduler" plugin="TaintToleration" node="node3" score=300
I0806 14:37:03.814463       1 schedule_one.go:783] "Plugin scored node for pod" pod="kube-system/annotation-second-scheduler" plugin="NodeResourcesFit" node="node3" score=46
I0806 14:37:03.814481       1 schedule_one.go:783] "Plugin scored node for pod" pod="kube-system/annotation-second-scheduler" plugin="NodeResourcesBalancedAllocation" node="node3" score=98
I0806 14:37:03.814569       1 schedule_one.go:783] "Plugin scored node for pod" pod="kube-system/annotation-second-scheduler" plugin="ImageLocality" node="node3" score=0
I0806 14:37:03.814612       1 schedule_one.go:783] "Plugin scored node for pod" pod="kube-system/annotation-second-scheduler" plugin="Runzhliu" node="node3" score=0
I0806 14:37:03.814634       1 schedule_one.go:783] "Plugin scored node for pod" pod="kube-system/annotation-second-scheduler" plugin="DiskAvailable" node="node3" score=81
I0806 14:37:03.814659       1 schedule_one.go:850] "Calculated node's final score for pod" pod="kube-system/annotation-second-scheduler" node="node2" score=518
I0806 14:37:03.814680       1 schedule_one.go:850] "Calculated node's final score for pod" pod="kube-system/annotation-second-scheduler" node="node3" score=525
I0806 14:37:03.815080       1 default_binder.go:53] "Attempting to bind pod to node" logger="Bind.DefaultBinder" pod="kube-system/annotation-second-scheduler" node="node3"
I0806 14:37:03.828329       1 schedule_one.go:304] "Successfully bound pod to node" pod="kube-system/annotation-second-scheduler" node="node3" evaluatedNodes=2 feasibleNodes=2
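
As a final check, the pod should have been bound to the node with the highest total score (node3 in the log above), which can be confirmed with the following command (the pod was created in kube-system, as the log output shows):

kubectl -n kube-system get pod annotation-second-scheduler -o wide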

Summary

This article used a concrete example to show how to develop and deploy a custom scheduling plugin, DiskAvailable, on top of the default kube-scheduler, so that scheduling decisions can take each node's remaining disk space into account, which offers a new option for disk-intensive workloads.

References

  1. Configure Multiple Schedulers (Kubernetes documentation)