GPU Node registries.yaml configuration issues

Issue Description

A couple of issues were identified with the configuration of the registries.yaml file and with how the GPU node is set up. The related errors are shown below.

Containerd:

time="2023-08-03T12:59:34.589025294-05:00" level=error msg="RunPodSandbox for &PodSandboxMetadata{Name:pmmonitoronpremgetcontext-2935ef39664b4fc5a4390ffff5bd703c,Uid:9116a0ca-8adf-4a41-be6a-29196c46646f,Namespace:airflow,Attempt:0,} failed, error" error="failed to setup network for sandbox \"7d8d39241640c799609ec7056a167e9e4f7a82577d5c8b444ef11d37535dd42c\": plugin type=\"cilium-cni\" name=\"cilium\" failed (add): Unable to create endpoint: Cilium API client timeout exceeded"


Identity-service-api:

LdapException*A local error occurred.* at System.DirectoryServices.Protocols.LdapConnection.BindHelper(NetworkCredential newCredential, Boolean needSetCredential)
at UiPath.IdentityServer.Directory.LdapAD.MsLdapClient.InitConnectionAsync() in /home/vsts/work/1/s/src/Directory.LdapAD/MsLdapClient.cs:line 87
at UiPath.IdentityServer.Directory.LdapAD.MsLdapDriver.GetClientAsync(LdapConfig config, LdapDomainConfig ldapDomainConfig, Int32 layer, Int32 pos) in /home/vsts/work/1/s/src/Directory.LdapAD/MsLdapDriver.cs:line 150
2023-08-03 18:09:31.8863 UiPath.IdentityServer.Directory.LdapAD.LdapADAdapter Could not connect to sedcadhca04.hca.corpad.net, go to next dc 0,4 or retry connect pool

Cilium:

level=warning msg="Error changing endpoint identity" containerID= datapathPolicyRevision=0 desiredPolicyRevision=0 endpointID=1931 error="unable to resolve identity: exponential backoff cancelled via context: context canceled" identityLabels="k8s:airflow-worker=77712,k8s:airflow_version=2.3.2,k8s:component=worker,k8s:dag_id=pmmonitor_onprem,k8s:io.cilium.k8s.namespace.labels.app.kubernetes.io/instance=fabric-installer,k8s:io.cilium.k8s.namespace.labels.istio-injection=enabled,k8s:io.cilium.k8s.namespace.labels.kubernetes.io/metadata.name=airflow,k8s:io.cilium.k8s.namespace.labels.uipath-




Resolution
The following fix should be implemented on all GPU nodes, in all environments, where the error is present.

22.10.x GPU Nodes
Copy the entire block below and paste it into a shell on each GPU node so that it runs as a single command.

# Writes a custom containerd config template for 22.10.x GPU nodes.
# The template sets "nvidia" as the default container runtime (when a
# snapshotter is configured) and re-emits the registry mirror/auth/TLS settings
# from the private registry configuration.
# All quotes below must be plain ASCII double quotes ("); typographic quotes
# are not valid string delimiters in Go templates or TOML.
# NOTE(review): RKE2 presumably renders this template into config.toml when the
# rke2 service starts, so a service restart is likely needed - confirm.
cat > /var/lib/rancher/rke2/agent/etc/containerd/config.toml.tmpl <<'EOF'
[plugins.opt]
  path = "{{ .NodeConfig.Containerd.Opt }}"

[plugins.cri]
stream_server_address = "127.0.0.1"
stream_server_port = "10010"
enable_selinux = {{ .NodeConfig.SELinux }}

{{- if .DisableCgroup}}
disable_cgroup = true
{{end}}
{{- if .IsRunningInUserNS }}
disable_apparmor = true
restrict_oom_score_adj = true
{{end}}

{{- if .NodeConfig.AgentConfig.PauseImage }}
sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}"
{{end}}

{{- if .NodeConfig.AgentConfig.Snapshotter }}
[plugins.cri.containerd]
default_runtime_name = "nvidia"
snapshotter = "{{ .NodeConfig.AgentConfig.Snapshotter }}"
disable_snapshot_annotations = {{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }}false{{else}}true{{end}}
{{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }}
{{ if .NodeConfig.AgentConfig.ImageServiceSocket }}
[plugins.stargz]
cri_keychain_image_service_path = "{{ .NodeConfig.AgentConfig.ImageServiceSocket }}"
[plugins.stargz.cri_keychain]
enable_keychain = true
{{end}}
{{ if .PrivateRegistryConfig }}
{{ if .PrivateRegistryConfig.Mirrors }}
[plugins.stargz.registry.mirrors]{{end}}
{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
[plugins.stargz.registry.mirrors."{{$k}}"]
endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
{{if $v.Rewrites}}
[plugins.stargz.registry.mirrors."{{$k}}".rewrite]
{{range $pattern, $replace := $v.Rewrites}}
"{{$pattern}}" = "{{$replace}}"
{{end}}
{{end}}
{{end}}
{{range $k, $v := .PrivateRegistryConfig.Configs }}
{{ if $v.Auth }}
[plugins.stargz.registry.configs."{{$k}}".auth]
{{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}}
{{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}}
{{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}}
{{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}}
{{end}}
{{ if $v.TLS }}
[plugins.stargz.registry.configs."{{$k}}".tls]
{{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
{{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
{{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
{{ if $v.TLS.InsecureSkipVerify }}insecure_skip_verify = true{{end}}
{{end}}
{{end}}
{{end}}
{{end}}
{{end}}

{{- if not .NodeConfig.NoFlannel }}
[plugins.cri.cni]
bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}"
conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
{{end}}

[plugins.cri.containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"

[plugins.cri.containerd.runtimes.runc.options]
SystemdCgroup = {{ .SystemdCgroup }}

{{ if .PrivateRegistryConfig }}
{{ if .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors]{{end}}
{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
[plugins.cri.registry.mirrors."{{$k}}"]
endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
{{if $v.Rewrites}}
[plugins.cri.registry.mirrors."{{$k}}".rewrite]
{{range $pattern, $replace := $v.Rewrites}}
"{{$pattern}}" = "{{$replace}}"
{{end}}
{{end}}
{{end}}

{{range $k, $v := .PrivateRegistryConfig.Configs }}
{{ if $v.Auth }}
[plugins.cri.registry.configs."{{$k}}".auth]
{{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}}
{{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}}
{{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}}
{{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}}
{{end}}
{{ if $v.TLS }}
[plugins.cri.registry.configs."{{$k}}".tls]
{{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
{{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
{{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
{{ if $v.TLS.InsecureSkipVerify }}insecure_skip_verify = true{{end}}
{{end}}
{{end}}
{{end}}

{{range $k, $v := .ExtraRuntimes}}
[plugins.cri.containerd.runtimes."{{$k}}"]
runtime_type = "{{$v.RuntimeType}}"
[plugins.cri.containerd.runtimes."{{$k}}".options]
BinaryName = "{{$v.BinaryName}}"
{{end}}
EOF


23.4 GPU nodes

Copy and run the following on each GPU node.

# Writes a custom containerd config template (config version 2) for 23.4 GPU
# nodes. The template sets "nvidia" as the default container runtime (when a
# snapshotter is configured) and re-emits the registry mirror/auth/TLS settings
# from the private registry configuration.
# All quotes below must be plain ASCII double quotes ("); typographic quotes
# are not valid string delimiters in Go templates or TOML.
# NOTE(review): RKE2 presumably renders this template into config.toml when the
# rke2 service starts, so a service restart is likely needed - confirm.
cat > /var/lib/rancher/rke2/agent/etc/containerd/config.toml.tmpl <<'EOF'
version = 2

[plugins."io.containerd.internal.v1.opt"]
path = "{{ .NodeConfig.Containerd.Opt }}"
[plugins."io.containerd.grpc.v1.cri"]
stream_server_address = "127.0.0.1"
stream_server_port = "10010"
enable_selinux = {{ .NodeConfig.SELinux }}
enable_unprivileged_ports = {{ .EnableUnprivileged }}
enable_unprivileged_icmp = {{ .EnableUnprivileged }}

{{- if .DisableCgroup}}
disable_cgroup = true
{{end}}
{{- if .IsRunningInUserNS }}
disable_apparmor = true
restrict_oom_score_adj = true
{{end}}

{{- if .NodeConfig.AgentConfig.PauseImage }}
sandbox_image = "{{ .NodeConfig.AgentConfig.PauseImage }}"
{{end}}

{{- if .NodeConfig.AgentConfig.Snapshotter }}
[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "nvidia"
snapshotter = "{{ .NodeConfig.AgentConfig.Snapshotter }}"
disable_snapshot_annotations = {{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }}false{{else}}true{{end}}
{{ if eq .NodeConfig.AgentConfig.Snapshotter "stargz" }}
{{ if .NodeConfig.AgentConfig.ImageServiceSocket }}
[plugins."io.containerd.snapshotter.v1.stargz"]
cri_keychain_image_service_path = "{{ .NodeConfig.AgentConfig.ImageServiceSocket }}"
[plugins."io.containerd.snapshotter.v1.stargz".cri_keychain]
enable_keychain = true
{{end}}
{{ if .PrivateRegistryConfig }}
{{ if .PrivateRegistryConfig.Mirrors }}
[plugins."io.containerd.snapshotter.v1.stargz".registry.mirrors]{{end}}
{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
[plugins."io.containerd.snapshotter.v1.stargz".registry.mirrors."{{$k}}"]
endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
{{if $v.Rewrites}}
[plugins."io.containerd.snapshotter.v1.stargz".registry.mirrors."{{$k}}".rewrite]
{{range $pattern, $replace := $v.Rewrites}}
"{{$pattern}}" = "{{$replace}}"
{{end}}
{{end}}
{{end}}
{{range $k, $v := .PrivateRegistryConfig.Configs }}
{{ if $v.Auth }}
[plugins."io.containerd.snapshotter.v1.stargz".registry.configs."{{$k}}".auth]
{{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}}
{{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}}
{{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}}
{{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}}
{{end}}
{{ if $v.TLS }}
[plugins."io.containerd.snapshotter.v1.stargz".registry.configs."{{$k}}".tls]
{{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
{{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
{{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
{{ if $v.TLS.InsecureSkipVerify }}insecure_skip_verify = true{{end}}
{{end}}
{{end}}
{{end}}
{{end}}
{{end}}

{{- if not .NodeConfig.NoFlannel }}
[plugins."io.containerd.grpc.v1.cri".cni]
bin_dir = "{{ .NodeConfig.AgentConfig.CNIBinDir }}"
conf_dir = "{{ .NodeConfig.AgentConfig.CNIConfDir }}"
{{end}}

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
runtime_type = "io.containerd.runc.v2"

[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
SystemdCgroup = {{ .SystemdCgroup }}

{{ if .PrivateRegistryConfig }}
{{ if .PrivateRegistryConfig.Mirrors }}
[plugins."io.containerd.grpc.v1.cri".registry.mirrors]{{end}}
{{range $k, $v := .PrivateRegistryConfig.Mirrors }}
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{$k}}"]
endpoint = [{{range $i, $j := $v.Endpoints}}{{if $i}}, {{end}}{{printf "%q" .}}{{end}}]
{{if $v.Rewrites}}
[plugins."io.containerd.grpc.v1.cri".registry.mirrors."{{$k}}".rewrite]
{{range $pattern, $replace := $v.Rewrites}}
"{{$pattern}}" = "{{$replace}}"
{{end}}
{{end}}
{{end}}

{{range $k, $v := .PrivateRegistryConfig.Configs }}
{{ if $v.Auth }}
[plugins."io.containerd.grpc.v1.cri".registry.configs."{{$k}}".auth]
{{ if $v.Auth.Username }}username = {{ printf "%q" $v.Auth.Username }}{{end}}
{{ if $v.Auth.Password }}password = {{ printf "%q" $v.Auth.Password }}{{end}}
{{ if $v.Auth.Auth }}auth = {{ printf "%q" $v.Auth.Auth }}{{end}}
{{ if $v.Auth.IdentityToken }}identitytoken = {{ printf "%q" $v.Auth.IdentityToken }}{{end}}
{{end}}
{{ if $v.TLS }}
[plugins."io.containerd.grpc.v1.cri".registry.configs."{{$k}}".tls]
{{ if $v.TLS.CAFile }}ca_file = "{{ $v.TLS.CAFile }}"{{end}}
{{ if $v.TLS.CertFile }}cert_file = "{{ $v.TLS.CertFile }}"{{end}}
{{ if $v.TLS.KeyFile }}key_file = "{{ $v.TLS.KeyFile }}"{{end}}
{{ if $v.TLS.InsecureSkipVerify }}insecure_skip_verify = true{{end}}
{{end}}
{{end}}
{{end}}

{{range $k, $v := .ExtraRuntimes}}
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes."{{$k}}"]
runtime_type = "{{$v.RuntimeType}}"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes."{{$k}}".options]
BinaryName = "{{$v.BinaryName}}"
{{end}}
EOF

A permanent fix is only available in version 23.10. For already released versions, a fix is expected in upcoming cumulative update (CU) releases.