Skip to content

Commit

Permalink
Merge remote-tracking branch 'das/dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
程东明 committed Nov 7, 2024
2 parents 3cb58e0 + c06cc4e commit 3ad7dd4
Show file tree
Hide file tree
Showing 39 changed files with 16,655 additions and 262 deletions.
33 changes: 33 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# 1. Base image.
FROM ubuntu:22.04

# 2. Use /root as the working directory.
WORKDIR /root

# 3. Copy the prebuilt binary into /usr/local/bin.
COPY dcgm-dcu /usr/local/bin/dcgm-dcu

# 4. Copy the shared-library dependencies into /usr/local/bin/lib.
COPY pkg/dcgm/lib/librocm_smi64.so.2.8 /usr/local/bin/lib/librocm_smi64.so.2.8
COPY pkg/dcgm/lib/libhydmi.so.1.4 /usr/local/bin/lib/libhydmi.so.1.4

# 5. Add execute permission to the shared libraries.
RUN chmod +x /usr/local/bin/lib/librocm_smi64.so.2.8 /usr/local/bin/lib/libhydmi.so.1.4

# 6. Create the versioned and unversioned symlinks that point at the real
#    library files (lib.so -> lib.so.MAJOR -> lib.so.MAJOR.MINOR).
RUN ln -s /usr/local/bin/lib/librocm_smi64.so.2.8 /usr/local/bin/lib/librocm_smi64.so.2 \
&& ln -s /usr/local/bin/lib/librocm_smi64.so.2 /usr/local/bin/lib/librocm_smi64.so \
&& ln -s /usr/local/bin/lib/libhydmi.so.1.4 /usr/local/bin/lib/libhydmi.so.1 \
&& ln -s /usr/local/bin/lib/libhydmi.so.1 /usr/local/bin/lib/libhydmi.so

# 7. Make the dynamic loader search /usr/local/bin/lib.
#    The path is set outright instead of being appended to $LD_LIBRARY_PATH:
#    nothing earlier in this Dockerfile defines that variable, so appending
#    would yield ":/usr/local/bin/lib", and the loader treats the empty entry
#    before the colon as the current working directory.
#    NOTE(review): assumes the ubuntu:22.04 base image does not define
#    LD_LIBRARY_PATH; re-check if the base image changes.
ENV LD_LIBRARY_PATH=/usr/local/bin/lib

# 8. Make sure the binary is executable.
RUN chmod +x /usr/local/bin/dcgm-dcu

# 9. Expose service port 16081.
EXPOSE 16081

# 10. Start the service; stdout and stderr are redirected to
#     /usr/local/bin/dcgm.log.
CMD ["sh", "-c", "/usr/local/bin/dcgm-dcu -logtostderr -v=2 > /usr/local/bin/dcgm.log 2>&1"]
24 changes: 12 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,18 @@
DCU DCGM 为 DCU 管理提供 Golang 绑定接口,是管理和监控DCU的工具。包括健康状态监控、功率、时钟频率调控,以及资源使用情况统计等。

## 组件使用前置条件

组件部署主机上安装DCU驱动,或在系统默认动态链接库加载路径下存在DCU动态链接库libhydmi.so和librocm_smi64.so,下面以/usr/lib动态链接库加载路径为例说明动态链接库配置详情。
```bash
# ll /usr/lib | grep .so*
lrwxrwxrwx 1 root root 22 Aug 6 08:25 libhydmi.so -> /usr/lib/libhydmi.so.1
lrwxrwxrwx 1 root root 24 Aug 6 08:25 libhydmi.so.1 -> /usr/lib/libhydmi.so.1.4
-rw-rw-r-- 1 root root 834456 Aug 6 08:24 libhydmi.so.1.4
lrwxrwxrwx 1 root root 27 Aug 6 08:25 librocm_smi64.so -> /usr/lib/librocm_smi64.so.2
lrwxrwxrwx 1 root root 29 Aug 6 08:25 librocm_smi64.so.2 -> /usr/lib/librocm_smi64.so.2.8
-rw-rw-r-- 1 root root 789440 Aug 6 08:24 librocm_smi64.so.2.8
...
```
前置条件:DCGM运行依赖于DCU底层动态链接库libhydmi.so和librocm_smi64.so,这两个动态链接库的安装方式如下。
#### 安装方式一:
1. DCU驱动安装(libhydmi.so动态链接库包含在DCU驱动中)
2. DTK安装并运行source dtk_dir/env.sh使环境变量生效(librocm_smi64.so动态链接库包含在DTK中)

#### 安装方式二:
1. 将pkg/dcgm/lib目录下librocm_smi64.so.2.8和libhydmi.so.1.4动态链接库放置到物理机某个目录下(如/your/path/dcgm/lib)。
在/your/path/dcgm/lib目录创建指向librocm_smi64.so.2.8的软链接librocm_smi64.so.2和指向librocm_smi64.so.2的软链接librocm_smi64.so;
在/your/path/dcgm/lib目录创建指向libhydmi.so.1.4的软链接libhydmi.so.1和指向libhydmi.so.1的软链接libhydmi.so。
![img.png](liblink.png)
2. 将动态链接库所在目录添加到 LD_LIBRARY_PATH 环境变量:
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/your/path/dcgm/lib

## 使用流程

Expand Down
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ go 1.21
require (
github.com/gin-gonic/gin v1.10.0
github.com/golang/glog v1.2.2
github.com/spf13/cobra v1.8.1
github.com/swaggo/files v1.0.1
github.com/swaggo/gin-swagger v1.6.0
github.com/swaggo/swag v1.16.3
Expand All @@ -26,6 +27,7 @@ require (
github.com/go-playground/universal-translator v0.18.1 // indirect
github.com/go-playground/validator/v10 v10.22.0 // indirect
github.com/goccy/go-json v0.10.3 // indirect
github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/cpuid/v2 v2.2.8 // indirect
Expand All @@ -35,6 +37,7 @@ require (
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
github.com/spf13/pflag v1.0.5 // indirect
github.com/twitchyliquid64/golang-asm v0.15.1 // indirect
github.com/ugorji/go/codec v1.2.12 // indirect
golang.org/x/arch v0.9.0 // indirect
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ github.com/cloudwego/base64x v0.1.4 h1:jwCgWpFanWmN8xoIUHa2rtzmkd5J2plF/dnLS6Xd/
github.com/cloudwego/base64x v0.1.4/go.mod h1:0zlkT4Wn5C6NdauXdJRhSKRlJvmclQ1hhJgA0rcu/8w=
github.com/cloudwego/iasm v0.2.0 h1:1KNIy1I1H9hNNFEEH3DVnI4UujN+1zjpuk6gwHLTssg=
github.com/cloudwego/iasm v0.2.0/go.mod h1:8rXZaNYT2n95jn+zTI1sDr+IgcD2GVs0nlbbQPiEFhY=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
Expand Down Expand Up @@ -43,6 +44,8 @@ github.com/golang/glog v1.2.2/go.mod h1:6AhwSGph0fcJtXVM/PEHPqZlFeoLxhs7/t5UDAwm
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
Expand Down Expand Up @@ -72,6 +75,11 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M=
github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/spf13/cobra v1.8.1 h1:e5/vxKd/rZsfSJMUX1agtjeTDf+qv1/JdBF8gg5k9ZM=
github.com/spf13/cobra v1.8.1/go.mod h1:wHxEcudfqmLYa8iTfL+OuZPbBZkmvliBWKIezN3kD9Y=
github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
Expand Down
Binary file added liblink.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
47 changes: 47 additions & 0 deletions pkg/cmd/cli/process.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
package cli

import (
"fmt"
"os"

"github.com/spf13/cobra"

"g.sugon.com/das/dcgm-dcu/pkg/dcgm"
)

// pidListCmd implements the "pid-list" subcommand. It fetches the PIDs via
// dcgm.PidList and prints them one per line below a "PID List:" header. If
// the lookup fails, the error is printed and the process exits with status 1.
var pidListCmd = &cobra.Command{
	Use:   "pid-list",
	Short: "Get a list of PIDs",
	Long:  `Retrieve a list of process IDs (PIDs) managed by the system.`,
	Run: func(cmd *cobra.Command, args []string) {
		pids, listErr := dcgm.PidList()
		if listErr != nil {
			fmt.Println("Error fetching PID list:", listErr)
			os.Exit(1)
		}

		fmt.Println("PID List:")
		for _, p := range pids {
			fmt.Println(p)
		}
	},
}

// showPidsCmd implements the "show-pids" subcommand. It delegates to
// dcgm.ShowPids; if that call reports an error, the error is printed and the
// process exits with status 1.
var showPidsCmd = &cobra.Command{
	Use:   "show-pids",
	Short: "Show running KFD process information",
	Long:  `Retrieve and display detailed information about KFD processes currently running on the system.`,
	Run: func(cmd *cobra.Command, args []string) {
		if showErr := dcgm.ShowPids(); showErr != nil {
			fmt.Println("Error displaying KFD process information:", showErr)
			os.Exit(1)
		}
	},
}

// init attaches the process-related subcommands to the root command.
func init() {
	rootCmd.AddCommand(
		pidListCmd,
		showPidsCmd,
	)
}
43 changes: 43 additions & 0 deletions pkg/cmd/cli/root.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package cli

import (
"fmt"
"os"

"github.com/spf13/cobra"

"g.sugon.com/das/dcgm-dcu/pkg/dcgm"
)

var dcgmInitialized bool // tracks whether dcgm.Init completed successfully

// rootCmd is the top-level command of the CLI. Its PersistentPreRunE hook
// calls dcgm.Init before a command runs and records success in
// dcgmInitialized.
//
// NOTE(review): Use names the command "dcgm" while Long refers to
// "dcgm-cli"; confirm the intended binary name and align the two.
var rootCmd = &cobra.Command{
	Use:   "dcgm",
	Short: "DCGM CLI tool",
	Long:  "Command-line interface for managing and interacting with DCGM. Use dcgm-cli [command] --help for more information on a command.",
	PersistentPreRunE: func(cmd *cobra.Command, args []string) error {
		// Initialize DCGM before the command itself runs. The error is
		// wrapped with %w so callers can still inspect the underlying cause
		// with errors.Is / errors.As; the rendered message is unchanged.
		if err := dcgm.Init(); err != nil {
			return fmt.Errorf("initialization failed: %w", err)
		}
		dcgmInitialized = true // initialization succeeded
		return nil
	},
}

// Execute runs the root command. If the command returns an error, the error
// is printed and the process exits with status 1.
//
// DCGM is shut down (only when initialization succeeded) before the process
// exits, on both the success and the failure path. The command runs inside an
// inner function so that the deferred shutdown has already completed by the
// time os.Exit is called: os.Exit terminates the program immediately and does
// not run deferred functions of the calling function.
//
// Subcommands that call os.Exit directly from their own Run function still
// terminate the process without passing through this shutdown.
func Execute() {
	failed := func() bool {
		defer func() {
			// Call ShutDown only when DCGM was successfully initialized.
			if !dcgmInitialized {
				return
			}
			if err := dcgm.ShutDown(); err != nil {
				fmt.Println("Failed to shut down properly:", err)
			}
		}()

		if err := rootCmd.Execute(); err != nil {
			fmt.Println(err)
			return true
		}
		return false
	}()

	if failed {
		os.Exit(1)
	}
}
14 changes: 14 additions & 0 deletions pkg/cmd/cli/util.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package cli

import (
"encoding/json"
"fmt"
)

// dataToJson renders data as indented JSON text. If marshalling fails, the
// error is printed to standard output and the (empty) result is returned.
func dataToJson(data any) string {
	encoded, marshalErr := json.MarshalIndent(data, "", " ")
	if marshalErr == nil {
		return string(encoded)
	}
	fmt.Println("Error serializing to JSON:", marshalErr)
	return string(encoded)
}
76 changes: 76 additions & 0 deletions pkg/cmd/cli/vdevice.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
package cli

import (
"fmt"
"os"
"strconv"

"github.com/spf13/cobra"

"g.sugon.com/das/dcgm-dcu/pkg/dcgm"
)

// vDeviceInfoCmd implements the "vdevice-info" subcommand. It takes exactly
// one argument, parses it as an integer device index, looks the virtual
// device up via dcgm.VDeviceSingleInfo and prints the result. A parse or
// lookup failure is printed and ends the process with exit status 1.
var vDeviceInfoCmd = &cobra.Command{
	Use:   "vdevice-info [device-index]",
	Short: "Get virtual device information",
	Long:  `Retrieve detailed information about a virtual device using its device index.`,
	Args:  cobra.ExactArgs(1),
	Run: func(cmd *cobra.Command, args []string) {
		index, parseErr := strconv.Atoi(args[0])
		if parseErr != nil {
			fmt.Println("Invalid device index:", parseErr)
			os.Exit(1)
		}

		vdev, fetchErr := dcgm.VDeviceSingleInfo(index)
		if fetchErr != nil {
			fmt.Println("Error fetching virtual device info:", fetchErr)
			os.Exit(1)
		}

		fmt.Printf("Virtual Device Info: %+v\n", vdev)
	},
}

// destroyVDeviceCmd implements the "destroy-vdevice" subcommand. It takes
// exactly one argument, parses it as an integer virtual-device index and
// destroys that device via dcgm.DestroySingleVDevice. A parse or destroy
// failure is printed and ends the process with exit status 1.
var destroyVDeviceCmd = &cobra.Command{
	// cobra takes the command name from the first space-separated word of
	// Use, so the argument placeholder must be separated from the name by a
	// space; otherwise the placeholder becomes part of the command name.
	Use:   "destroy-vdevice <dvInd>",
	Short: "Destroy a single virtual device",
	Long:  `This command destroys a single virtual device by its index.`,
	Args:  cobra.ExactArgs(1),
	Run: func(cmd *cobra.Command, args []string) {
		vDvInd, err := strconv.Atoi(args[0])
		if err != nil {
			fmt.Println("Invalid virtual device index:", err)
			os.Exit(1)
		}

		err = dcgm.DestroySingleVDevice(vDvInd)
		if err != nil {
			fmt.Println("Error destroying virtual device:", err)
			os.Exit(1)
		}

		fmt.Printf("Virtual device %d destroyed successfully.\n", vDvInd)
	},
}
// allDeviceInfosCmd implements the "all-device-infos" subcommand. It fetches
// the information for all devices via dcgm.AllDeviceInfos and prints it as
// indented JSON below a header line. A fetch failure is printed and ends the
// process with exit status 1.
var allDeviceInfosCmd = &cobra.Command{
	Use:   "all-device-infos",
	Short: "Get information for all physical devices",
	Long:  `Retrieve detailed information about all physical devices.`,
	Run: func(cmd *cobra.Command, args []string) {
		infos, err := dcgm.AllDeviceInfos()
		if err != nil {
			fmt.Println("Error fetching all device infos:", err)
			os.Exit(1)
		}
		fmt.Println("==========allDevices==========")
		// Print the JSON as data, never as a format string: a '%' inside the
		// serialized output would otherwise be interpreted as a formatting
		// verb. Println also terminates the output with a newline.
		fmt.Println(dataToJson(infos))
	},
}

// init attaches the device-related subcommands to the root command.
func init() {
	rootCmd.AddCommand(
		vDeviceInfoCmd,
		destroyVDeviceCmd,
		allDeviceInfosCmd,
	)
}
7 changes: 7 additions & 0 deletions pkg/cmd/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package main

import "g.sugon.com/das/dcgm-dcu/pkg/cmd/cli"

// main is the entry point of the CLI binary; all work is delegated to the
// cli package's root command.
func main() {
	// Run the root command.
	cli.Execute()
}
4 changes: 2 additions & 2 deletions pkg/dcgm/admin.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ func rsmiInit() (err error) {
if err = errorString(ret); err != nil {
return fmt.Errorf("Error go_rsmi_init: %s", err)
}
return nil
return
}

// rsmiShutdown 关闭rocm_smi
Expand All @@ -32,5 +32,5 @@ func rsmiShutdown() (err error) {
if err = errorString(ret); err != nil {
return fmt.Errorf("Error rsmi_shutdown: %s", err)
}
return nil
return
}
Loading

0 comments on commit 3ad7dd4

Please sign in to comment.