Installation in an Unrestricted (Proxied) Network Environment
Master node
First, set up Kubernetes:
```bash
sudo apt-get update
sudo apt-get install -y docker.io
sudo apt-get install -y apt-transport-https ca-certificates curl
mkdir -p /etc/apt/keyrings/
sudo curl -fsSLo /etc/apt/keyrings/kubernetes-archive-keyring.gpg https://packages.cloud.google.com/apt/doc/apt-key.gpg
echo "deb [signed-by=/etc/apt/keyrings/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee /etc/apt/sources.list.d/kubernetes.list
sudo apt-get update
sudo apt-get install -y kubelet=1.25.8-00 kubeadm=1.25.8-00 kubectl=1.25.8-00
```

Next, initialize the Kubernetes cluster and install the Flannel network add-on (Calico was problematic in this setup). To advertise the node's public IP, pass --apiserver-advertise-address:

```bash
sudo kubeadm init --pod-network-cidr=192.168.0.0/16 --apiserver-advertise-address=8.209.253.180
```

The full sequence on the master:

```bash
sudo kubeadm init --pod-network-cidr=192.168.0.0/16
mkdir -p $HOME/.kube
sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config
sudo chown $(id -u):$(id -g) $HOME/.kube/config
mkdir -p /opt/cni/bin
curl -O -L https://github.com/containernetworking/plugins/releases/download/v1.2.0/cni-plugins-linux-amd64-v1.2.0.tgz
tar -C /opt/cni/bin -xzf cni-plugins-linux-amd64-v1.2.0.tgz
wget https://github.com/flannel-io/flannel/releases/latest/download/kube-flannel.yml
# edit kube-flannel.yml: change the Pod CIDR from the default 10.244.0.0/16 to 192.168.0.0/16, or use the sed one-liner below
sed -i 's/10\.244/192.168/g' kube-flannel.yml
kubectl create -f kube-flannel.yml
kubectl taint nodes --all node-role.kubernetes.io/control-plane-
kubectl taint nodes --all node-role.kubernetes.io/master-
kubectl get nodes -o wide
```
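Before moving on, it helps to confirm that the Flannel and CoreDNS pods are healthy. A quick check (the kube-flannel namespace is what recent kube-flannel.yml manifests create):

```bash
# Flannel runs as a DaemonSet in its own namespace in recent manifests
kubectl get pods -n kube-flannel
# CoreDNS should move from Pending to Running once the CNI is up
kubectl get pods -n kube-system
```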
Once the node is Ready, install the Kubeflow prerequisites, kustomize and local-path-provisioner, and mark the resulting StorageClass as the default:
```bash
wget https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv5.0.0/kustomize_v5.0.0_linux_amd64.tar.gz
tar -zxvf kustomize_v5.0.0_linux_amd64.tar.gz
chmod +x kustomize
sudo cp kustomize /usr/local/bin/
kubectl apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.24/deploy/local-path-storage.yaml
kubectl patch storageclass local-path -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
kubectl get storageclass
```
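To sanity-check that local-path really is the default StorageClass, a throwaway PVC can be created without naming any StorageClass; test-pvc is just an illustrative name:

```bash
kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: test-pvc
spec:
  accessModes: ["ReadWriteOnce"]
  resources:
    requests:
      storage: 1Gi
EOF
# local-path uses WaitForFirstConsumer, so the PVC stays Pending until a
# pod mounts it; the point is that it was accepted with no explicit class.
kubectl get pvc test-pvc
kubectl delete pvc test-pvc
```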
Then download and install Kubeflow:
```bash
cd ~
git clone https://github.com/kubeflow/manifests.git
cd manifests
while ! kustomize build example | awk '!/well-defined/' | kubectl apply -f -; do echo "Retrying to apply resources"; sleep 10; done
kubectl get pods -A
```
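Once every pod is Running, the central dashboard can be reached by port-forwarding the Istio ingress gateway; the command and the default Dex credentials are the ones documented in the kubeflow/manifests README:

```bash
kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80
# then open http://localhost:8080 and log in as user@example.com / 12341234
```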
Download the training operator and its prerequisites. Before creating jobs, go to the worker and confirm its environment is set up (see the Worker node section below):
```bash
cd ~
git clone https://github.com/kubeflow/training-operator.git
cd training-operator/examples/pytorch/mnist/
sudo vim /etc/docker/daemon.json   # make it contain the following:
{
  "default-runtime": "nvidia",
  "runtimes": {
    "nvidia": {
      "path": "/usr/bin/nvidia-container-runtime",
      "runtimeArgs": []
    }
  }
}
sudo systemctl restart docker
kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.13.0/nvidia-device-plugin.yml
```
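After the device plugin DaemonSet starts, the GPU node should advertise the nvidia.com/gpu resource; one way to verify (<worker-node-name> is a placeholder):

```bash
kubectl describe node <worker-node-name> | grep -i "nvidia.com/gpu"
# both Capacity and Allocatable should list nvidia.com/gpu: 1 (or more)
```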
Save the following PyTorchJob manifest as 1.yaml:

```yaml
apiVersion: "kubeflow.org/v1"
kind: "PyTorchJob"
metadata:
  name: "pytorch-dist-mnist-gloo"
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            sidecar.istio.io/inject: "false"
        spec:
          containers:
            - name: pytorch
              image: litrane/test:latest
              args: ["--backend", "gloo"]
              # Comment out the below resources to use the CPU.
              #resources:
              #  limits:
              #    nvidia.com/gpu: 1
    Worker:
      replicas: 1
      restartPolicy: OnFailure
      template:
        metadata:
          annotations:
            sidecar.istio.io/inject: "false"
        spec:
          containers:
            - name: pytorch
              image: litrane/test:latest
              args: ["--backend", "gloo"]
              # Comment out the below resources to use the CPU.
              #resources:
              #  limits:
              #    nvidia.com/gpu: 1
```

Create the job:

```bash
kubectl create -f 1.yaml
```
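The job can then be monitored through the PyTorchJob resource and the master pod's logs. This assumes the naming and label conventions of recent training-operator releases (pods named <job-name>-master-0 and labeled with training.kubeflow.org/job-name):

```bash
kubectl get pytorchjobs
kubectl get pods -l training.kubeflow.org/job-name=pytorch-dist-mnist-gloo
kubectl logs -f pytorch-dist-mnist-gloo-master-0
```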
Worker node
```bash
sudo apt-get update
sudo apt install -y docker.io
sudo apt-get install -y apt-transport-https ca-certificates curl
mkdir -p /etc/apt/keyrings/
curl -fsSL https://mirrors.aliyun.com/kubernetes/apt/doc/apt-key.gpg | sudo apt-key add -
#sudo curl -fsSLo /etc/apt/keyrings/kubernetes-archive-keyring.gpg https://packages.cloud.google.com/apt/doc/apt-key.gpg
#echo "deb [signed-by=/etc/apt/keyrings/kubernetes-archive-keyring.gpg] https://apt.kubernetes.io/ kubernetes-xenial main" | sudo tee /etc/apt/sources.list.d/kubernetes.list
cat <<EOF | sudo tee /etc/apt/sources.list.d/kubernetes.list
deb https://mirrors.aliyun.com/kubernetes/apt/ kubernetes-xenial main
EOF
sudo apt-get update
sudo apt-get install -y kubelet=1.25.8-00 kubeadm=1.25.8-00 kubectl=1.25.8-00
```
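Optionally, hold the packages so a routine apt upgrade cannot move the worker off 1.25.8 and out of sync with the master:

```bash
sudo apt-mark hold kubelet kubeadm kubectl
```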
Install the NVIDIA plugin prerequisites on the worker: the NVIDIA container toolkit and nvidia-docker2, then test the runtime with an nvidia/cuda image:

```bash
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | sudo tee /etc/apt/sources.list.d/libnvidia-container.list
sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit nvidia-docker2
sudo vim /etc/docker/daemon.json   # make it contain the following:
{
  "default-runtime": "nvidia",
  "runtimes": {
    "nvidia": {
      "path": "/usr/bin/nvidia-container-runtime",
      "runtimeArgs": []
    }
  }
}
sudo systemctl restart docker
sudo vim /etc/containerd/config.toml   # make it contain the following:
version = 2
[plugins]
  [plugins."io.containerd.grpc.v1.cri"]
    [plugins."io.containerd.grpc.v1.cri".containerd]
      default_runtime_name = "nvidia"
      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
          privileged_without_host_devices = false
          runtime_engine = ""
          runtime_root = ""
          runtime_type = "io.containerd.runc.v2"
          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
            BinaryName = "/usr/bin/nvidia-container-runtime"
sudo systemctl restart containerd
## Test
sudo docker run --runtime=nvidia --rm nvidia/cuda:11.4.0-base-ubuntu20.04 nvidia-smi
```
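Beyond the Docker-level check, a short-lived pod can confirm that Kubernetes itself schedules onto the GPU. A minimal sketch using the same CUDA image as above:

```bash
kubectl apply -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
spec:
  restartPolicy: Never
  containers:
    - name: cuda
      image: nvidia/cuda:11.4.0-base-ubuntu20.04
      command: ["nvidia-smi"]
      resources:
        limits:
          nvidia.com/gpu: 1
EOF
kubectl logs gpu-smoke-test   # after the pod completes, prints the nvidia-smi table
kubectl delete pod gpu-smoke-test
```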
The same configuration as a single non-interactive script (handy for provisioning):

```bash
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/libnvidia-container/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/libnvidia-container/$distribution/libnvidia-container.list | sudo tee /etc/apt/sources.list.d/libnvidia-container.list
sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
echo '{
  "default-runtime": "nvidia",
  "runtimes": {
    "nvidia": {
      "path": "/usr/bin/nvidia-container-runtime",
      "runtimeArgs": []
    }
  }
}' | sudo tee /etc/docker/daemon.json
sudo systemctl restart docker
echo 'version = 2
[plugins]
  [plugins."io.containerd.grpc.v1.cri"]
    [plugins."io.containerd.grpc.v1.cri".containerd]
      default_runtime_name = "nvidia"
      [plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
        [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia]
          privileged_without_host_devices = false
          runtime_engine = ""
          runtime_root = ""
          runtime_type = "io.containerd.runc.v2"
          [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options]
            BinaryName = "/usr/bin/nvidia-container-runtime"' | sudo tee /etc/containerd/config.toml
sudo systemctl restart containerd
## Test
sudo docker run --runtime=nvidia --rm nvidia/cuda:11.4.0-base-ubuntu20.04 nvidia-smi
```

Back on the master, print the join command for the worker and, once it has joined, verify the node list:

```bash
kubeadm token create --print-join-command
kubectl get nodes
```
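On the worker, run the command that was printed; the token and hash below are placeholders, and 8.209.253.180 is the master's advertised address from earlier:

```bash
sudo kubeadm join 8.209.253.180:6443 --token <token> \
    --discovery-token-ca-cert-hash sha256:<hash>
```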
Other common commands

Enter a pod's container:

```bash
kubectl exec -ti <pod-name> -n <namespace> -- /bin/sh
```

Reset and clean up the environment with kubeadm reset:

```bash
kubeadm reset -f   # the reset itself, implied by the section heading
iptables -F && iptables -t nat -F && iptables -t mangle -F && iptables -X
systemctl stop kubelet
systemctl stop docker
modprobe -r ipip
lsmod
rm -rf ~/.kube/
ifconfig cni0 down
ifconfig flannel.1 down
ifconfig docker0 down
ip link delete cni0
ip link delete flannel.1
systemctl start docker
```

View logs / describe a pod:

```bash
kubectl logs <pod-name> -n <namespace>
kubectl describe pod <pod-name> -n <namespace>
```

Print the join command:

```bash
kubeadm token create --print-join-command
```

Restart containerd:

```bash
systemctl restart containerd
```

Bind the public IP as an alias on eth0 (RHEL-style network-scripts configuration):

```bash
cat <<EOF | sudo tee /etc/sysconfig/network-scripts/ifcfg-eth0:1
NAME=eth0:1
DEVICE=eth0:1
TYPE=Ethernet
ONBOOT=yes
BOOTPROTO=static
NETMASK=255.255.255.0
IPADDR=8.209.254.82
EOF
```
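On systems using the legacy network-scripts service, the alias can be brought up without a reboot (assuming that service manages eth0):

```bash
sudo ifup eth0:1
ip addr show eth0   # the 8.209.254.82 alias should now be listed
```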