https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/getting-started.html
## OS버전
>>root@user:~# cat /etc/*release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=22.04
DISTRIB_CODENAME=jammy
DISTRIB_DESCRIPTION="Ubuntu 22.04.3 LTS"
PRETTY_NAME="Ubuntu 22.04.3 LTS"
NAME="Ubuntu"
VERSION_ID="22.04"
VERSION="22.04.3 LTS (Jammy Jellyfish)"
VERSION_CODENAME=jammy
ID=ubuntu
ID_LIKE=debian
HOME_URL="https://www.ubuntu.com/"
SUPPORT_URL="https://help.ubuntu.com/"
BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
## 설치
>>root@user:/# apt-get install libnvidia-nscq-535
>>root@user:/# wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb
>>root@user:/# dpkg -i cuda-keyring_1.0-1_all.deb
>>root@user:/# add-apt-repository "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /"
>>root@user:/# apt-get update
>>root@user:/# apt-get install -y datacenter-gpu-manager
## DCGMI 버전확인
>>root@user:~# dcgmi -v
Version : 3.2.6
Build ID : 29
Build Date : 2023-09-27
Build Type : Release
Commit ID :
Branch Name : rel_dcgm_3_2
CPU Arch : x86_64
## DCGMI 실행(run level 3, 15분 소요)
>>root@user:~# dcgmi diag -v -r 3
## DCGMI group 설정
>>root@user:~# dcgmi group -n "egg"   # "egg" 그룹 생성
>>root@user:~# dcgmi group -l   # 그룹 생성 여부 및 그룹 ID 확인
>>root@user:~# dcgmi group -g 2 -a 0,1,2,3   # 위에서 확인한 그룹 ID("2")에 GPU 0,1,2,3 추가
※ 위 명령이 수행되지 않을 경우:
- nvidia-fabricmanager.service 설치 및 enable 확인.
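# 설치 (주의: Fabric Manager 패키지 버전은 설치된 NVIDIA 드라이버 브랜치와 일치해야 함. 아래는 550 드라이버 기준 예시이며, 535 드라이버 환경이라면 -535 패키지를 사용)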
>>apt install -y nvidia-fabricmanager-dev-550 nvidia-fabricmanager-550
#상태확인
>>systemctl status nvidia-fabricmanager.service
>>systemctl restart nvidia-fabricmanager.service
'Nvidia > TEST' 카테고리의 다른 글
nvidia-smi drain 명령어 (0) | 2024.09.01 |
---|---|
NVLINK 상태 확인 (0) | 2023.09.23 |
GPU-BURN (0) | 2022.11.14 |
Nvidia-bug-report.sh (0) | 2022.11.03 |