From 37bdc54d67606e82a7c0789905164f532007361f Mon Sep 17 00:00:00 2001
From: Kevin Klues <kklues@nvidia.com>
Date: Tue, 23 Apr 2024 11:04:41 +0000
Subject: [PATCH] Add additional functionality to the mock dgxa100 server

Signed-off-by: Kevin Klues <kklues@nvidia.com>
---
 pkg/nvml/mock/dgxa100/dgxa100.go | 94 ++++++++++++++++++++++++++++----
 1 file changed, 82 insertions(+), 12 deletions(-)

diff --git a/pkg/nvml/mock/dgxa100/dgxa100.go b/pkg/nvml/mock/dgxa100/dgxa100.go
index 2388c3e..a042435 100644
--- a/pkg/nvml/mock/dgxa100/dgxa100.go
+++ b/pkg/nvml/mock/dgxa100/dgxa100.go
@@ -26,29 +26,44 @@ import (
 
 type Server struct {
 	mock.Interface
-	Devices [8]nvml.Device
+	Devices           [8]nvml.Device
+	DriverVersion     string
+	NvmlVersion       string
+	CudaDriverVersion int
 }
 type Device struct {
 	mock.Device
-	UUID               string
-	PciBusID           string
-	Index              int
-	MigMode            int
-	GpuInstances       map[*GpuInstance]struct{}
-	GpuInstanceCounter uint32
-	MemoryInfo         nvml.Memory
+	UUID                  string
+	Name                  string
+	Brand                 nvml.BrandType
+	Architecture          nvml.DeviceArchitecture
+	PciBusID              string
+	Minor                 int
+	Index                 int
+	CudaComputeCapability CudaComputeCapability
+	MigMode               int
+	GpuInstances          map[*GpuInstance]struct{}
+	GpuInstanceCounter    uint32
+	MemoryInfo            nvml.Memory
 }
+
 type GpuInstance struct {
 	mock.GpuInstance
 	Info                   nvml.GpuInstanceInfo
 	ComputeInstances       map[*ComputeInstance]struct{}
 	ComputeInstanceCounter uint32
 }
+
 type ComputeInstance struct {
 	mock.ComputeInstance
 	Info nvml.ComputeInstanceInfo
 }
 
+type CudaComputeCapability struct {
+	Major int
+	Minor int
+}
+
 var _ nvml.Interface = (*Server)(nil)
 var _ nvml.Device = (*Device)(nil)
 var _ nvml.GpuInstance = (*GpuInstance)(nil)
@@ -66,14 +81,25 @@ func New() nvml.Interface {
 			NewDevice(6),
 			NewDevice(7),
 		},
+		DriverVersion:     "550.54.15",
+		NvmlVersion:       "12.550.54.15",
+		CudaDriverVersion: 12040,
 	}
 }
 
 func NewDevice(index int) nvml.Device {
 	return &Device{
-		UUID:               "GPU-" + uuid.New().String(),
-		PciBusID:           fmt.Sprintf("0000:%02x:00.0", index),
-		Index:              index,
+		UUID:         "GPU-" + uuid.New().String(),
+		Name:         "Mock NVIDIA A100-SXM4-40GB",
+		Brand:        nvml.BRAND_NVIDIA,
+		Architecture: nvml.DEVICE_ARCH_AMPERE,
+		PciBusID:     fmt.Sprintf("0000:%02x:00.0", index),
+		Minor:        index,
+		Index:        index,
+		CudaComputeCapability: CudaComputeCapability{
+			Major: 8,
+			Minor: 0,
+		},
 		GpuInstances:       make(map[*GpuInstance]struct{}),
 		GpuInstanceCounter: 0,
 		MemoryInfo:         nvml.Memory{42949672960, 0, 0},
@@ -94,6 +120,14 @@ func NewComputeInstance(info nvml.ComputeInstanceInfo) nvml.ComputeInstance {
 	}
 }
 
+func (n *Server) Extensions() nvml.ExtendedInterface {
+	return n
+}
+
+func (n *Server) LookupSymbol(symbol string) error {
+	return nil
+}
+
 func (n *Server) Init() nvml.Return {
 	return nvml.SUCCESS
 }
@@ -102,8 +136,16 @@ func (n *Server) Shutdown() nvml.Return {
 	return nvml.SUCCESS
 }
 
+func (n *Server) SystemGetDriverVersion() (string, nvml.Return) {
+	return n.DriverVersion, nvml.SUCCESS
+}
+
 func (n *Server) SystemGetNVMLVersion() (string, nvml.Return) {
-	return "11.450.51", nvml.SUCCESS
+	return n.NvmlVersion, nvml.SUCCESS
+}
+
+func (n *Server) SystemGetCudaDriverVersion() (int, nvml.Return) {
+	return n.CudaDriverVersion, nvml.SUCCESS
 }
 
 func (n *Server) DeviceGetCount() (int, nvml.Return) {
@@ -135,14 +177,34 @@ func (n *Server) DeviceGetHandleByPciBusId(busID string) (nvml.Device, nvml.Retu
 	return nil, nvml.ERROR_INVALID_ARGUMENT
 }
 
+func (d *Device) GetMinorNumber() (int, nvml.Return) {
+	return d.Minor, nvml.SUCCESS
+}
+
 func (d *Device) GetIndex() (int, nvml.Return) {
 	return d.Index, nvml.SUCCESS
 }
 
+func (d *Device) GetCudaComputeCapability() (int, int, nvml.Return) {
+	return d.CudaComputeCapability.Major, d.CudaComputeCapability.Minor, nvml.SUCCESS
+}
+
 func (d *Device) GetUUID() (string, nvml.Return) {
 	return d.UUID, nvml.SUCCESS
 }
 
+func (d *Device) GetName() (string, nvml.Return) {
+	return d.Name, nvml.SUCCESS
+}
+
+func (d *Device) GetBrand() (nvml.BrandType, nvml.Return) {
+	return d.Brand, nvml.SUCCESS
+}
+
+func (d *Device) GetArchitecture() (nvml.DeviceArchitecture, nvml.Return) {
+	return d.Architecture, nvml.SUCCESS
+}
+
 func (d *Device) GetMemoryInfo() (nvml.Memory, nvml.Return) {
 	return d.MemoryInfo, nvml.SUCCESS
 }
@@ -175,6 +237,10 @@ func (d *Device) GetGpuInstanceProfileInfo(giProfileId int) (nvml.GpuInstancePro
 	return MIGProfiles.GpuInstanceProfiles[giProfileId], nvml.SUCCESS
 }
 
+func (d *Device) GetGpuInstancePossiblePlacements(info *nvml.GpuInstanceProfileInfo) ([]nvml.GpuInstancePlacement, nvml.Return) {
+	return MIGPlacements.GpuInstancePossiblePlacements[int(info.Id)], nvml.SUCCESS
+}
+
 func (d *Device) CreateGpuInstance(info *nvml.GpuInstanceProfileInfo) (nvml.GpuInstance, nvml.Return) {
 	giInfo := nvml.GpuInstanceInfo{
 		Device:    d,
@@ -236,6 +302,10 @@ func (gi *GpuInstance) GetComputeInstanceProfileInfo(ciProfileId int, ciEngProfi
 	return MIGProfiles.ComputeInstanceProfiles[giProfileId][ciProfileId], nvml.SUCCESS
 }
 
+func (gi *GpuInstance) GetComputeInstancePossiblePlacements(info *nvml.ComputeInstanceProfileInfo) ([]nvml.ComputeInstancePlacement, nvml.Return) {
+	return MIGPlacements.ComputeInstancePossiblePlacements[int(gi.Info.Id)][int(info.Id)], nvml.SUCCESS
+}
+
 func (gi *GpuInstance) CreateComputeInstance(info *nvml.ComputeInstanceProfileInfo) (nvml.ComputeInstance, nvml.Return) {
 	ciInfo := nvml.ComputeInstanceInfo{
 		Device:      gi.Info.Device,