创建开发机
本文为您详细介绍通过控制台和命令行创建开发机的步骤和参数配置。
前提条件
通过控制台创建
- 登录英博云控制台。
- 在页面左侧导航栏,选择 开发机。
- 在开发机列表页面,单击左上角 创建开发机,配置创建开发机所需的参数。
开发机配置
参数
说明
注意:
预置镜像列表
镜像名称
版本
适用资源


通过kubectl命令行创建
前提:
- 已安装kubectl工具到本地。详情请参考:安装和设置 kubectl。
- 已通过 kubectl 连接目标集群。具体操作详情请参考:连接集群
- 注意:目前通过 kubectl 命令行创建的开发机无法在控制台中管理。
- 创建开发机的 YAML 文件。此示例创建一台单卡 A800 开发机:挂载一个 NVMe 类型的 PVC、开启系统盘持久化、未开放公网 IP。示例文件 gpu-ssh-example.yaml 的代码如下:
########################################################
### Part 1: PersistentVolumeClaim backing the persistent system disk
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: gpu-system-volume  # PVC name; referenced by claimName in the Deployment
spec:
  accessModes:
    - ReadWriteMany  # access mode of the volume
  resources:
    requests:
      storage: 50Gi  # requested capacity
  storageClassName: shared-nvme-cn-beijing2  # StorageClass used to provision the volume
---
########################################################
### Part 2: PersistentVolumeClaim backing the persistent /data directory
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: gpu-data-volume  # PVC name; referenced by claimName in the Deployment
spec:
  accessModes:
    - ReadWriteMany  # access mode of the volume
  resources:
    requests:
      storage: 200Gi  # requested capacity
  storageClassName: shared-nvme-cn-beijing2  # StorageClass used to provision the volume
---
########################################################
### Part 3: root password of the dev machine
apiVersion: v1
kind: Secret
metadata:
  name: root-passwd-example  # referenced by secretKeyRef.name in the Deployment
stringData:
  # Root password used for SSH login. Example value only; replace it before applying.
  # Quoted so that special characters in a real password are never re-typed by the parser.
  root-password: 'myPaSsw8rd!'
---
########################################################
### Part 4: Deployment that runs the dev machine
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-ssh-server-example  # Deployment name
spec:
  replicas: 1  # number of replicas
  selector:
    matchLabels:
      app: gpu-ssh-example
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: gpu-ssh-example
    spec:
      affinity:
        nodeAffinity:  # node affinity used to schedule the Pod
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.ebtech.com/gpu  # node label
                    operator: In
                    values:
                      # GPU model; H800_NVLINK_80GB, RTX_4090D, etc. can also be requested
                      - A800_NVLINK_80GB
      containers:
        - command:
            - bash
            - -c
            # On first start only (guarded by a marker file under /etc, which is
            # mounted from the system volume), set the root password from
            # ROOT_PASSWORD; then run sshd in the foreground.
            - |-
              if [ ! -f /etc/systemd/inited-password ]; then
                echo "root:$ROOT_PASSWORD" | chpasswd &&
                echo 'Initialization password complete' &&
                touch /etc/systemd/inited-password
              fi &&
              /usr/sbin/sshd -D
          env:
            - name: ROOT_PASSWORD
              valueFrom:
                secretKeyRef:
                  key: root-password
                  name: root-passwd-example  # name of the root-password Secret
          # container image address and tag
          image: registry-cn-beijing2-internal.ebtech.com/ebsys/ssh_server:vllm0.5_torch2.4_python3.10_cuda12.2_ubuntu22.04_202410281901
          name: ssh-server  # container name
          ports:
            - containerPort: 22  # port opened by the container
          resources:
            limits:  # resource limits, e.g. CPU, memory, local storage, GPU count
              cpu: "10"
              ephemeral-storage: 100Gi
              memory: 100Gi
              nvidia.com/gpu: "1"
            requests:  # minimum resources requested, e.g. CPU, memory, local storage, GPU count
              cpu: "10"
              ephemeral-storage: 100Gi
              memory: 100Gi
              nvidia.com/gpu: "1"  # number of GPUs to request
          # Persist the key system directories under / (one subPath of the
          # system volume per directory). Paths are written as absolute paths.
          volumeMounts:
            - mountPath: /bin
              name: system-storage
              subPath: bin
            - mountPath: /boot
              name: system-storage
              subPath: boot
            - mountPath: /etc
              name: system-storage
              subPath: etc
            - mountPath: /home
              name: system-storage
              subPath: home
            - mountPath: /lib
              name: system-storage
              subPath: lib
            - mountPath: /lib64
              name: system-storage
              subPath: lib64
            - mountPath: /opt
              name: system-storage
              subPath: opt
            - mountPath: /root
              name: system-storage
              subPath: root
            - mountPath: /sbin
              name: system-storage
              subPath: sbin
            - mountPath: /srv
              name: system-storage
              subPath: srv
            - mountPath: /usr
              name: system-storage
              subPath: usr
            - mountPath: /var
              name: system-storage
              subPath: var
            - mountPath: /run/lock  # enlarged run-lock directory
              name: run-lock
            - mountPath: /data  # persistent data directory
              name: data-storage
            - mountPath: /dev/shm  # replaces the default shm
              name: shm-volume
            - mountPath: /public  # model and dataset directory
              name: hostpath-volume
      initContainers:  # init container
        - command:
            - /bin/bash
            - -c
          args:
            # First run only (guarded by /target/initialized): reconfigure
            # openssh-server, then copy the image's root filesystem onto the
            # system volume mounted at /target.
            - >-
              if [ ! -f /target/initialized ]; then
              dpkg-reconfigure openssh-server &&
              cp -avx / /target &&
              echo 'Initialization complete' &&
              touch /target/initialized;
              fi
          # container image
          image: registry-cn-beijing2-internal.ebtech.com/ebsys/ssh_server:vllm0.5_torch2.4_python3.10_cuda12.2_ubuntu22.04_202410281901
          name: init
          resources:  # resource requests and limits of the init container
            limits:
              cpu: "2"
              memory: 4Gi
            requests:
              cpu: "2"
              memory: 4Gi
          volumeMounts:  # mount the system volume as the copy target
            - mountPath: /target
              name: system-storage
      volumes:  # volumes referenced above
        - name: run-lock
          emptyDir:  # memory-backed temporary volume used to enlarge the lock directory
            medium: Memory
        - name: system-storage
          persistentVolumeClaim:
            claimName: gpu-system-volume  # PVC of the persistent system disk
        - name: data-storage
          persistentVolumeClaim:
            claimName: gpu-data-volume  # PVC of the persistent data directory
        - name: shm-volume
          emptyDir:
            medium: Memory
            sizeLimit: 50Gi  # shared-memory size
        - name: hostpath-volume
          hostPath:
            path: /public
            type: DirectoryOrCreate
---
########################################################
### Part 5: Service that exposes a public IP; uncomment this block to bind a public IP
#apiVersion: v1
#kind: Service
#metadata:
#  name: example-gpu-service  # Service name
#spec:
#  ports:
#    - name: ssh  # name of the Service port
#      port: 22  # Service port
#      protocol: TCP  # protocol of the exposed port
#      targetPort: 22  # port opened by the target container
#  selector:
#    app: gpu-ssh-example  # label of the dev machine Pods
#  type: LoadBalancer  # Service type; requests a public IP through a load balancer
- 执行以下命令,创建开发机。
kubectl apply -f gpu-ssh-example.yaml
- 执行以下命令,查看开发机是否创建成功。
kubectl get pod -n default
- 若需要删除开发机,执行以下命令。
kubectl delete -f gpu-ssh-example.yaml
配置示例一
- 创建CPU开发机、挂载一个NVMe的PVC、开启系统盘持久化、未开放公网IP
########################################################
### Part 1: PersistentVolumeClaim backing the persistent system disk
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: cpu-system-volume  # PVC name; referenced by claimName in the Deployment
spec:
  accessModes:
    - ReadWriteMany  # access mode of the volume
  resources:
    requests:
      storage: 50Gi  # requested capacity
  # StorageClass used to provision the volume; currently supported:
  # shared-nvme-cn-beijing1, shared-nvme-cn-beijing2
  storageClassName: shared-nvme-cn-beijing2
---
########################################################
### Part 2: PersistentVolumeClaim backing the persistent /data directory
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: cpu-data-volume  # PVC name; referenced by claimName in the Deployment
spec:
  accessModes:
    - ReadWriteMany  # access mode of the volume
  resources:
    requests:
      storage: 200Gi  # requested capacity
  storageClassName: shared-nvme-cn-beijing2  # StorageClass used to provision the volume
---
########################################################
### Part 3: root password of the dev machine
apiVersion: v1
kind: Secret
metadata:
  name: root-passwd-example  # referenced by secretKeyRef.name in the Deployment
stringData:
  # Root password used for SSH login. Example value only; replace it before applying.
  # Quoted so that special characters in a real password are never re-typed by the parser.
  root-password: 'myPaSsw8rd!'
---
# Part 4: Deployment that runs the dev machine
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cpu-ssh-example  # Deployment name
spec:
  replicas: 1  # number of replicas
  selector:
    matchLabels:
      app: cpu-ssh-example
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: cpu-ssh-example
    spec:
      affinity:
        nodeAffinity:  # node affinity used to schedule the Pod
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.ebtech.com/cpu  # node label
                    operator: In
                    values:
                      - amd-epyc-milan  # CPU model
      containers:
        - command:
            - bash
            - -c
            # On first start only (guarded by a marker file under /etc, which is
            # mounted from the system volume), set the root password from
            # ROOT_PASSWORD; then run sshd in the foreground.
            - |-
              if [ ! -f /etc/systemd/inited-password ]; then
                echo "root:$ROOT_PASSWORD" | chpasswd &&
                echo 'Initialization password complete' &&
                touch /etc/systemd/inited-password
              fi &&
              /usr/sbin/sshd -D
          env:
            - name: ROOT_PASSWORD
              valueFrom:
                secretKeyRef:
                  key: root-password
                  name: root-passwd-example  # name of the root-password Secret
          # container image address and tag
          image: registry-cn-beijing2-internal.ebtech.com/ebsys/ssh_server:python3.10_ubuntu22.04_202410291144
          name: ssh-server  # container name
          ports:
            - containerPort: 22  # port opened by the container
          resources:
            limits:  # resource limits, e.g. CPU, memory, local storage, GPU count
              cpu: "1"
              ephemeral-storage: 200Gi
              memory: 2Gi
            requests:  # minimum resources requested, e.g. CPU, memory, local storage, GPU count
              cpu: "1"
              ephemeral-storage: 200Gi
              memory: 2Gi
          # Persist the key system directories under / (one subPath of the
          # system volume per directory). Paths are written as absolute paths.
          volumeMounts:
            - mountPath: /bin
              name: system-storage
              subPath: bin
            - mountPath: /boot
              name: system-storage
              subPath: boot
            - mountPath: /etc
              name: system-storage
              subPath: etc
            - mountPath: /home
              name: system-storage
              subPath: home
            - mountPath: /lib
              name: system-storage
              subPath: lib
            - mountPath: /lib64
              name: system-storage
              subPath: lib64
            - mountPath: /opt
              name: system-storage
              subPath: opt
            - mountPath: /root
              name: system-storage
              subPath: root
            - mountPath: /sbin
              name: system-storage
              subPath: sbin
            - mountPath: /srv
              name: system-storage
              subPath: srv
            - mountPath: /usr
              name: system-storage
              subPath: usr
            - mountPath: /var
              name: system-storage
              subPath: var
            - mountPath: /run/lock  # enlarged run-lock directory
              name: run-lock
            - mountPath: /data  # persistent data directory
              name: data-storage
            - mountPath: /dev/shm  # replaces the default shm
              name: shm-volume
            - mountPath: /public  # model and dataset directory
              name: hostpath-volume
      initContainers:  # init container
        - command:
            - /bin/bash
            - -c
          args:
            # First run only (guarded by /target/initialized): reconfigure
            # openssh-server, then copy the image's root filesystem onto the
            # system volume mounted at /target.
            - >-
              if [ ! -f /target/initialized ]; then
              dpkg-reconfigure openssh-server &&
              cp -avx / /target &&
              echo 'Initialization complete' &&
              touch /target/initialized;
              fi
          # container image
          image: registry-cn-beijing2-internal.ebtech.com/ebsys/ssh_server:python3.10_ubuntu22.04_202410291144
          name: init
          resources:  # resource requests and limits of the init container
            limits:
              cpu: "1"
              memory: 2Gi
            requests:
              cpu: "1"
              memory: 2Gi
          volumeMounts:  # mount the system volume as the copy target
            - mountPath: /target
              name: system-storage
      volumes:  # volumes referenced above
        - name: run-lock
          emptyDir:  # memory-backed temporary volume used to enlarge the lock directory
            medium: Memory
        - name: system-storage
          persistentVolumeClaim:
            claimName: cpu-system-volume  # PVC of the persistent system disk
        - name: data-storage
          persistentVolumeClaim:
            claimName: cpu-data-volume  # PVC of the persistent data directory
        - name: shm-volume
          emptyDir:
            medium: Memory
            # shared-memory size
            # NOTE(review): this is far larger than the container memory limit
            # (2Gi above); memory-backed emptyDir usage is presumably charged to
            # the container's memory, so confirm the intended sizes.
            sizeLimit: 100Gi
        - name: hostpath-volume
          hostPath:
            path: /public
            type: DirectoryOrCreate
---
########################################################
### Part 5: Service that exposes a public IP; uncomment this block to bind a public IP
#apiVersion: v1
#kind: Service
#metadata:
#  name: example-cpu-service  # Service name
#spec:
#  ports:
#    - name: ssh  # name of the Service port
#      port: 22  # Service port
#      protocol: TCP  # protocol of the exposed port
#      targetPort: 22  # port opened by the target container
#  selector:
#    app: cpu-ssh-example  # label of the dev machine Pods
#  type: LoadBalancer  # Service type; requests a public IP through a load balancer
配置示例二
- 创建CPU开发机、挂载两个PVC(一个NVMe类型,一个HDD类型)、开启系统盘持久化、未开放公网IP
########################################################
### Part 1: PersistentVolumeClaim backing the persistent system disk
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: cpu-system-volume  # PVC name; referenced by claimName in the Deployment
spec:
  accessModes:
    - ReadWriteMany  # access mode of the volume
  resources:
    requests:
      storage: 50Gi  # requested capacity
  # StorageClass used to provision the volume; currently supported:
  # shared-nvme-cn-beijing1, shared-nvme-cn-beijing2
  storageClassName: shared-nvme-cn-beijing2
---
########################################################
### Part 2: PersistentVolumeClaim backing the persistent /data directory
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: nvme-data-volume  # PVC name; referenced by claimName in the Deployment
spec:
  accessModes:
    - ReadWriteMany  # access mode of the volume
  resources:
    requests:
      storage: 256Gi  # requested capacity
  storageClassName: shared-nvme-cn-beijing2  # NVMe-type StorageClass used to provision the volume
---
########################################################
### Part 3: PersistentVolumeClaim backing the persistent /data2 directory
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: hdd-data-volume  # PVC name; referenced by claimName in the Deployment
spec:
  accessModes:
    - ReadWriteMany  # access mode of the volume
  resources:
    requests:
      storage: 256Gi  # requested capacity
  storageClassName: shared-hdd-cn-beijing2  # HDD-type StorageClass used to provision the volume
---
########################################################
### Part 4: root password of the dev machine
apiVersion: v1
kind: Secret
metadata:
  name: root-passwd-example  # referenced by secretKeyRef.name in the Deployment
stringData:
  # Root password used for SSH login. Example value only; replace it before applying.
  # Quoted so that special characters in a real password are never re-typed by the parser.
  root-password: 'myPaSsw8rd!'
---
# Part 5: Deployment that runs the dev machine
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cpu-ssh-example  # Deployment name
spec:
  replicas: 1  # number of replicas
  selector:
    matchLabels:
      app: cpu-ssh-example
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: cpu-ssh-example
    spec:
      affinity:
        nodeAffinity:  # node affinity used to schedule the Pod
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
              - matchExpressions:
                  - key: cloud.ebtech.com/cpu  # node label
                    operator: In
                    values:
                      - amd-epyc-milan  # CPU model
      containers:
        - command:
            - bash
            - -c
            # On first start only (guarded by a marker file under /etc, which is
            # mounted from the system volume), set the root password from
            # ROOT_PASSWORD; then run sshd in the foreground.
            - |-
              if [ ! -f /etc/systemd/inited-password ]; then
                echo "root:$ROOT_PASSWORD" | chpasswd &&
                echo 'Initialization password complete' &&
                touch /etc/systemd/inited-password
              fi &&
              /usr/sbin/sshd -D
          env:
            - name: ROOT_PASSWORD
              valueFrom:
                secretKeyRef:
                  key: root-password
                  name: root-passwd-example  # name of the root-password Secret
          # container image address and tag
          image: registry-cn-beijing2-internal.ebtech.com/ebsys/ssh_server:python3.10_ubuntu22.04_202410291144
          name: ssh-server  # container name
          ports:
            - containerPort: 22  # port opened by the container
          resources:
            limits:  # resource limits, e.g. CPU, memory, local storage, GPU count
              cpu: "1"
              ephemeral-storage: 200Gi
              memory: 2Gi
            requests:  # minimum resources requested, e.g. CPU, memory, local storage, GPU count
              cpu: "1"
              ephemeral-storage: 200Gi
              memory: 2Gi
          # Persist the key system directories under / (one subPath of the
          # system volume per directory). Paths are written as absolute paths.
          volumeMounts:
            - mountPath: /bin
              name: system-storage
              subPath: bin
            - mountPath: /boot
              name: system-storage
              subPath: boot
            - mountPath: /etc
              name: system-storage
              subPath: etc
            - mountPath: /home
              name: system-storage
              subPath: home
            - mountPath: /lib
              name: system-storage
              subPath: lib
            - mountPath: /lib64
              name: system-storage
              subPath: lib64
            - mountPath: /opt
              name: system-storage
              subPath: opt
            - mountPath: /root
              name: system-storage
              subPath: root
            - mountPath: /sbin
              name: system-storage
              subPath: sbin
            - mountPath: /srv
              name: system-storage
              subPath: srv
            - mountPath: /usr
              name: system-storage
              subPath: usr
            - mountPath: /var
              name: system-storage
              subPath: var
            - mountPath: /run/lock  # enlarged run-lock directory
              name: run-lock
            - mountPath: /data  # persistent data directory (NVMe PVC)
              name: data-storage-0
            - mountPath: /data2  # persistent data2 directory (HDD PVC)
              name: data-storage-1
            - mountPath: /dev/shm  # replaces the default shm
              name: shm-volume
            - mountPath: /public  # model and dataset directory
              name: hostpath-volume
      initContainers:  # init container
        - command:
            - /bin/bash
            - -c
          args:
            # First run only (guarded by /target/initialized): reconfigure
            # openssh-server, then copy the image's root filesystem onto the
            # system volume mounted at /target.
            - >-
              if [ ! -f /target/initialized ]; then
              dpkg-reconfigure openssh-server &&
              cp -avx / /target &&
              echo 'Initialization complete' &&
              touch /target/initialized;
              fi
          # container image
          image: registry-cn-beijing2-internal.ebtech.com/ebsys/ssh_server:python3.10_ubuntu22.04_202410291144
          name: init
          resources:  # resource requests and limits of the init container
            limits:
              cpu: "1"
              memory: 2Gi
            requests:
              cpu: "1"
              memory: 2Gi
          volumeMounts:  # mount the system volume as the copy target
            - mountPath: /target
              name: system-storage
      volumes:  # volumes referenced above
        - name: run-lock
          emptyDir:  # memory-backed temporary volume used to enlarge the lock directory
            medium: Memory
        - name: system-storage
          persistentVolumeClaim:
            claimName: cpu-system-volume  # PVC of the persistent system disk
        - name: data-storage-0
          persistentVolumeClaim:
            claimName: nvme-data-volume  # NVMe PVC mounted at /data
        - name: data-storage-1
          persistentVolumeClaim:
            claimName: hdd-data-volume  # HDD PVC mounted at /data2
        - name: shm-volume
          emptyDir:
            medium: Memory
            # shared-memory size
            # NOTE(review): this is far larger than the container memory limit
            # (2Gi above); memory-backed emptyDir usage is presumably charged to
            # the container's memory, so confirm the intended sizes.
            sizeLimit: 100Gi
        - name: hostpath-volume
          hostPath:
            path: /public
            type: DirectoryOrCreate
---
########################################################
### Part 6: Service that exposes a public IP; uncomment this block to bind a public IP
#apiVersion: v1
#kind: Service
#metadata:
#  name: example-cpu-service  # Service name
#spec:
#  ports:
#    - name: ssh  # name of the Service port
#      port: 22  # Service port
#      protocol: TCP  # protocol of the exposed port
#      targetPort: 22  # port opened by the target container
#  selector:
#    app: cpu-ssh-example  # label of the dev machine Pods
#  type: LoadBalancer  # Service type; requests a public IP through a load balancer