From 826673a8406f1484d79cd8f6cc2895a81daaaf35 Mon Sep 17 00:00:00 2001 From: Din Music Date: Wed, 18 Dec 2024 17:38:46 +0000 Subject: [PATCH 01/14] lxd/storage/connectors: Introduce storage connector Signed-off-by: Din Music --- lxd/storage/connectors/connector.go | 34 ++++++++++++++++++++++ lxd/storage/connectors/connector_common.go | 5 ++++ 2 files changed, 39 insertions(+) create mode 100644 lxd/storage/connectors/connector.go create mode 100644 lxd/storage/connectors/connector_common.go diff --git a/lxd/storage/connectors/connector.go b/lxd/storage/connectors/connector.go new file mode 100644 index 000000000000..1055bde26848 --- /dev/null +++ b/lxd/storage/connectors/connector.go @@ -0,0 +1,34 @@ +package connectors + +import ( + "context" + + "github.com/canonical/lxd/shared/revert" +) + +// session represents a connector session that is established with a target. +type session struct { + // id is a unique identifier of the session. + id string + + // targetQN is the qualified name of the target. + targetQN string + + // addresses is a list of active addresses associated with the session. + addresses []string +} + +// Connector represents a storage connector that handles connections through +// appropriate storage subsystem. +type Connector interface { + Type() string + Version() (string, error) + QualifiedName() (string, error) + LoadModules() error + Connect(ctx context.Context, targetQN string, targetAddrs ...string) (revert.Hook, error) + ConnectAll(ctx context.Context, targetAddr string) error + Disconnect(targetQN string) error + DisconnectAll() error + SessionID(targetQN string) (string, error) + findSession(targetQN string) (*session, error) +} diff --git a/lxd/storage/connectors/connector_common.go b/lxd/storage/connectors/connector_common.go new file mode 100644 index 000000000000..3a2da30a3476 --- /dev/null +++ b/lxd/storage/connectors/connector_common.go @@ -0,0 +1,5 @@ +package connectors + +type common struct { + serverUUID string +} From 43aa88ccd4ff5b84a044a3edd2c034ab70b6edb6 Mon Sep 17 00:00:00 2001 From: Din Music Date: Mon, 16 Dec 2024 16:25:11 +0000 Subject: [PATCH 02/14] lxd/storage/connectors: Helper functions for handling disk device paths Signed-off-by: Din Music --- lxd/storage/connectors/utils.go | 129 ++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) create mode 100644 lxd/storage/connectors/utils.go diff --git a/lxd/storage/connectors/utils.go b/lxd/storage/connectors/utils.go new file mode 100644 index 000000000000..237ee4227d49 --- /dev/null +++ b/lxd/storage/connectors/utils.go @@ -0,0 +1,129 @@ +package connectors + +import ( + "context" + "errors" + "fmt" + "path/filepath" + "strings" + "time" + + "golang.org/x/sys/unix" + + "github.com/canonical/lxd/lxd/resources" + "github.com/canonical/lxd/shared" +) + +// devicePathFilterFunc is a function that accepts device path and returns true +// if the path matches the required criteria. +type devicePathFilterFunc func(devPath string) bool + +// GetDiskDevicePath checks whether the disk device with a given prefix and suffix +// exists in /dev/disk/by-id directory. A device path is returned if the device is +// found, otherwise an error is returned. +func GetDiskDevicePath(diskNamePrefix string, diskPathFilter devicePathFilterFunc) (string, error) { + devPath, err := findDiskDevicePath(diskNamePrefix, diskPathFilter) + if err != nil { + return "", err + } + + if devPath == "" { + return "", fmt.Errorf("Device not found") + } + + return devPath, nil +} + +// WaitDiskDevicePath waits for the disk device to appear in /dev/disk/by-id. +// It periodically checks for the device to appear and returns the device path +// once it is found. If the device does not appear within the timeout, an error +// is returned. +func WaitDiskDevicePath(ctx context.Context, diskNamePrefix string, diskPathFilter devicePathFilterFunc) (string, error) { + var err error + var diskPath string + + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + + for { + // Check if the device is already present. + diskPath, err = findDiskDevicePath(diskNamePrefix, diskPathFilter) + if err != nil && !errors.Is(err, unix.ENOENT) { + return "", err + } + + // If the device is found, return the device path. + if diskPath != "" { + break + } + + // Check if context is cancelled. + err := ctx.Err() + if err != nil { + return "", err + } + + time.Sleep(500 * time.Millisecond) + } + + return diskPath, nil +} + +// findDiskDevivePath iterates over device names in /dev/disk/by-id directory and +// returns the path to the disk device that matches the given prefix and suffix. +// Disk partitions are skipped, and an error is returned if the device is not found. +func findDiskDevicePath(diskNamePrefix string, diskPathFilter devicePathFilterFunc) (string, error) { + var diskPaths []string + + // If there are no other disks on the system by id, the directory might not + // even be there. Returns ENOENT in case the by-id/ directory does not exist. + diskPaths, err := resources.GetDisksByID(diskNamePrefix) + if err != nil { + return "", err + } + + for _, diskPath := range diskPaths { + // Skip the disk if it is only a partition of the actual volume. + if strings.Contains(diskPath, "-part") { + continue + } + + // Use custom disk path filter, if one is provided. + if diskPathFilter != nil && !diskPathFilter(diskPath) { + continue + } + + // The actual device might not already be created. + // Returns ENOENT in case the device does not exist. + devPath, err := filepath.EvalSymlinks(diskPath) + if err != nil { + return "", err + } + + return devPath, nil + } + + return "", nil +} + +// WaitDiskDeviceGone waits for the disk device to disappear from /dev/disk/by-id. +// It periodically checks for the device to disappear and returns once the device +// is gone. If the device does not disappear within the timeout, an error is returned. +func WaitDiskDeviceGone(ctx context.Context, diskPath string) bool { + // Set upper boundary for the timeout to ensure this function does not run + // indefinitely. The caller can set a shorter timeout if necessary. + ctx, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + + for { + if !shared.PathExists(diskPath) { + return true + } + + if ctx.Err() != nil { + return false + } + + time.Sleep(500 * time.Millisecond) + } +} From 6573a45844a53c49546efd6b3453187e146024dc Mon Sep 17 00:00:00 2001 From: Din Music Date: Tue, 21 Jan 2025 09:47:57 +0000 Subject: [PATCH 03/14] lxd/storage/connectors: Add generic connect func to handle multipath Signed-off-by: Din Music --- lxd/storage/connectors/utils.go | 131 ++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) diff --git a/lxd/storage/connectors/utils.go b/lxd/storage/connectors/utils.go index 237ee4227d49..133172fcc13f 100644 --- a/lxd/storage/connectors/utils.go +++ b/lxd/storage/connectors/utils.go @@ -6,12 +6,16 @@ import ( "fmt" "path/filepath" "strings" + "sync" "time" "golang.org/x/sys/unix" + "github.com/canonical/lxd/lxd/locking" "github.com/canonical/lxd/lxd/resources" "github.com/canonical/lxd/shared" + "github.com/canonical/lxd/shared/logger" + "github.com/canonical/lxd/shared/revert" ) // devicePathFilterFunc is a function that accepts device path and returns true @@ -127,3 +131,130 @@ func WaitDiskDeviceGone(ctx context.Context, diskPath string) bool { time.Sleep(500 * time.Millisecond) } } + +// connectFunc is invoked by "connect" for each provided address. +// It receives a session and a target address. A non-nil session indicates +// an existing session for the target. +// +// The function is responsible for establishing new connections or handling +// necessary actions for already connected target addresses. +type connectFunc func(ctx context.Context, s *session, addr string) error + +// connect attempts to establish connections to all provided addresses, +// succeeding if at least one connection is successful. +// +// If all connection attempts fail, an error is returned, and the function +// ensures the session is cleanup if one was created during this call. +// +// IMPORTANT: +// If at least one connection succeeds, no error is returned. In this case, +// the caller is responsible for disconnection by calling "connectors.Disconnect" +// when safe. The returned reverter will only cancel ongoing connection attempts +// but will **not** attempt disconnection. +func connect(ctx context.Context, c Connector, targetQN string, targetAddrs []string, connectFunc connectFunc) (revert.Hook, error) { + // Acquire a lock to prevent concurrent connection attempts to the same + // target. + // + // The unlock is not deferred here because it must remain held until all + // connection attempts are complete. Releasing the lock prematurely after + // the first successful connection (when this function exits) could lead + // to race conditions if other connection attempts are still ongoing. + // For the same reason, relying on a higher-level lock from the caller + // (e.g., the storage driver) is insufficient. + unlock, err := locking.Lock(ctx, targetQN) + if err != nil { + return nil, err + } + + // Once the lock is obtained, search for an existing session. + session, err := c.findSession(targetQN) + if err != nil { + return nil, err + } + + // Set a maximum timeout of 30 seconds for connection attempts. + // The caller can override this with a shorter timeout if needed. + // + // Context cancellation is not deferred here to ensure connection attempts + // continue even if the function exits before all attempts are completed. + timeoutCtx, cancel := context.WithTimeout(ctx, 30*time.Second) + + var wg sync.WaitGroup + resChan := make(chan bool, len(targetAddrs)) + + var successLock sync.Mutex + isSuccess := false + + go func() { + // Connect to all target addresses. + for _, addr := range targetAddrs { + wg.Add(1) + + go func(addr string) { + defer wg.Done() + + err := connectFunc(timeoutCtx, session, addr) + if err != nil { + // Log warning for each failed connection attempt. + logger.Warn("Failed connecting to target", logger.Ctx{"target_qualified_name": targetQN, "target_address": addr, "err": err}) + } else { + successLock.Lock() + isSuccess = true + successLock.Unlock() + } + + resChan <- (err == nil) + }(addr) + } + + // Wait for all connection attempts to complete. + wg.Wait() + + // Cleanup. + close(resChan) + cancel() + + // Ensure the session is removed if no successful connection was + // established and no session existed before. + // + // If at least one connection succeeded, the caller is responsible + // for handling disconnection to avoid inadvertently disconnecting + // subsequent operations that may have reused the session after + // this function releases the lock. The lock being released is also + // the reason why disconnect is not returned in the outer reverter. + // + // Additionally, do not disconnect a session that existed once + // this function has obtained a lock. Even if no connection was + // successful, retaining the session allows other devices using + // it to recover. For example, the remote storage may have become + // inaccessible due to power loss. Removing the session would prevent + // existing devices from reconnecting once the remote storage becomes + // accessible again. + if !isSuccess && session == nil { + _ = c.Disconnect(targetQN) + } + + unlock() + }() + + // Wait until either a successful connection is established + // or all connection attempts fail. + for success := range resChan { + if success { + // At least one connection succeeded. + // + // Return a reverter that cancels any ongoing connection + // attempts and waits for them to complete. + outerReverter := revert.New() + outerReverter.Add(func() { + cancel() + wg.Wait() + }) + + return outerReverter.Fail, nil + } + } + + // All connections attempts have failed. + return nil, fmt.Errorf("Failed to connect to any address on target %q", targetQN) +} From 17c97c39fca5c12c15d7de3f8c92109b6cae639b Mon Sep 17 00:00:00 2001 From: Din Music Date: Wed, 18 Dec 2024 18:07:32 +0000 Subject: [PATCH 04/14] lxd/storage/connectors: Add NVMe/TCP connector Signed-off-by: Din Music --- lxd/storage/connectors/connector.go | 30 +++ lxd/storage/connectors/connector_nvme.go | 258 +++++++++++++++++++++++ 2 files changed, 288 insertions(+) create mode 100644 lxd/storage/connectors/connector_nvme.go diff --git a/lxd/storage/connectors/connector.go b/lxd/storage/connectors/connector.go index 1055bde26848..1b040e28d173 100644 --- a/lxd/storage/connectors/connector.go +++ b/lxd/storage/connectors/connector.go @@ -2,10 +2,19 @@ package connectors import ( "context" + "fmt" "github.com/canonical/lxd/shared/revert" ) +const ( + // TypeUnknown represents an unknown storage connector. + TypeUnknown string = "unknown" + + // TypeNVME represents an NVMe/TCP storage connector. + TypeNVME string = "nvme" +) + // session represents a connector session that is established with a target. type session struct { // id is a unique identifier of the session. @@ -32,3 +41,24 @@ type Connector interface { SessionID(targetQN string) (string, error) findSession(targetQN string) (*session, error) } + +// NewConnector instantiates a new connector of the given type. +// The caller needs to ensure connector type is validated before calling this +// function, as common (empty) connector is returned for unknown type. +func NewConnector(connectorType string, serverUUID string) (Connector, error) { + common := common{ + serverUUID: serverUUID, + } + + switch connectorType { + case TypeNVME: + return &connectorNVMe{ + common: common, + }, nil + + default: + // Return common connector if the type is unknown. This removes + // the need to check for nil or handle the error in the caller. + return nil, fmt.Errorf("Unknown storage connector type %q", connectorType) + } +} diff --git a/lxd/storage/connectors/connector_nvme.go b/lxd/storage/connectors/connector_nvme.go new file mode 100644 index 000000000000..995a3a3f40db --- /dev/null +++ b/lxd/storage/connectors/connector_nvme.go @@ -0,0 +1,258 @@ +package connectors + +import ( + "context" + "errors" + "fmt" + "io/fs" + "os" + "path/filepath" + "slices" + "strings" + + "github.com/canonical/lxd/lxd/util" + "github.com/canonical/lxd/shared" + "github.com/canonical/lxd/shared/revert" +) + +var _ Connector = &connectorNVMe{} + +type connectorNVMe struct { + common +} + +// Type returns the type of the connector. +func (c *connectorNVMe) Type() string { + return TypeNVME +} + +// Version returns the version of the NVMe CLI. +func (c *connectorNVMe) Version() (string, error) { + // Detect and record the version of the NVMe CLI. + out, err := shared.RunCommand("nvme", "version") + if err != nil { + return "", fmt.Errorf("Failed to get nvme-cli version: %w", err) + } + + fields := strings.Split(strings.TrimSpace(out), " ") + if strings.HasPrefix(out, "nvme version ") && len(fields) > 2 { + return fmt.Sprintf("%s (nvme-cli)", fields[2]), nil + } + + return "", fmt.Errorf("Failed to get nvme-cli version: Unexpected output %q", out) +} + +// LoadModules loads the NVMe/TCP kernel modules. +// Returns true if the modules can be loaded. +func (c *connectorNVMe) LoadModules() error { + err := util.LoadModule("nvme_fabrics") + if err != nil { + return err + } + + return util.LoadModule("nvme_tcp") +} + +// QualifiedName returns a custom NQN generated from the server UUID. +// Getting the NQN from /etc/nvme/hostnqn would require the nvme-cli +// package to be installed on the host. +func (c *connectorNVMe) QualifiedName() (string, error) { + return fmt.Sprintf("nqn.2014-08.org.nvmexpress:uuid:%s", c.serverUUID), nil +} + +// Connect establishes a connection with the target on the given address. +func (c *connectorNVMe) Connect(ctx context.Context, targetQN string, targetAddresses ...string) (revert.Hook, error) { + // Connects to the provided target address, if the connection is not yet established. + connectFunc := func(ctx context.Context, session *session, targetAddr string) error { + if session != nil && slices.Contains(session.addresses, targetAddr) { + // Already connected. + return nil + } + + hostNQN, err := c.QualifiedName() + if err != nil { + return err + } + + _, err = shared.RunCommandContext(ctx, "nvme", "connect", "--transport", "tcp", "--traddr", targetAddr, "--nqn", targetQN, "--hostnqn", hostNQN, "--hostid", c.serverUUID) + if err != nil { + return fmt.Errorf("Failed to connect to target %q on %q via NVMe: %w", targetQN, targetAddr, err) + } + + return nil + } + + return connect(ctx, c, targetQN, targetAddresses, connectFunc) +} + +// ConnectAll establishes a connection with all targets available on the given address. +func (c *connectorNVMe) ConnectAll(ctx context.Context, targetAddr string) error { + hostNQN, err := c.QualifiedName() + if err != nil { + return err + } + + _, err = shared.RunCommandContext(ctx, "nvme", "connect-all", "--transport", "tcp", "--traddr", targetAddr, "--hostnqn", hostNQN, "--hostid", c.serverUUID) + if err != nil { + return fmt.Errorf("Failed to connect to any target on %q via NVMe: %w", targetAddr, err) + } + + return nil +} + +// Disconnect terminates a connection with the target. +func (c *connectorNVMe) Disconnect(targetQN string) error { + // Find an existing NVMe session. + session, err := c.findSession(targetQN) + if err != nil { + return err + } + + // Disconnect from the NVMe target if there is an existing session. + if session != nil { + // Do not restrict the context as the operation is relatively short + // and most importantly we do not want to "partially" disconnect from + // the target, potentially leaving some unclosed sessions. + _, err := shared.RunCommandContext(context.Background(), "nvme", "disconnect", "--nqn", targetQN) + if err != nil { + return fmt.Errorf("Failed disconnecting from NVMe target %q: %w", targetQN, err) + } + } + + return nil +} + +// DisconnectAll terminates all connections with all targets. +func (c *connectorNVMe) DisconnectAll() error { + _, err := shared.RunCommand("nvme", "disconnect-all") + if err != nil { + return fmt.Errorf("Failed disconnecting from NVMe targets: %w", err) + } + + return nil +} + +// SessionID returns the identifier of a session that matches the targetQN. +// If no session is found, an empty string is returned. +func (c *connectorNVMe) SessionID(targetQN string) (string, error) { + session, err := c.findSession(targetQN) + if err != nil || session == nil { + return "", err + } + + return session.id, nil +} + +// findSession returns an active NVMe subsystem (referred to as session for +// consistency across connectors) that matches the given targetQN. +// If the session is not found, nil is returned. +// +// This function handles the distinction between an "inactive" session (with no +// active controllers/connections) and a completely "non-existent" session. While +// checking "/sys/class/nvme" for active controllers is sufficient to identify if +// the session is currently in use, it does not account for cases where a session +// exists but is temporarily inactive (e.g., due to network issues). Removing +// such a session during this state would prevent it from automatically +// recovering once the connection is restored. +// +// To ensure we detect "existing" sessions, we first check for the session's +// presence in "/sys/class/nvme-subsystem", which tracks all associated NVMe +// subsystems regardless of their current connection state. If such session is +// found the function determines addresses of the active connections by checking +// "/sys/class/nvme", and returns a non-nil result (except if an error occurs). +func (c *connectorNVMe) findSession(targetQN string) (*session, error) { + // Base path for NVMe sessions/subsystems. + subsysBasePath := "/sys/class/nvme-subsystem" + + // Retrieve the list of existing NVMe subsystems on this host. + subsystems, err := os.ReadDir(subsysBasePath) + if err != nil { + if errors.Is(err, fs.ErrNotExist) { + // If NVMe subsystems directory does not exist, + // there is no sessions. + return nil, nil + } + + return nil, fmt.Errorf("Failed getting a list of existing NVMe subsystems: %w", err) + } + + sessionID := "" + for _, subsys := range subsystems { + // Get the target NQN. + nqnBytes, err := os.ReadFile(filepath.Join(subsysBasePath, subsys.Name(), "subsysnqn")) + if err != nil { + return nil, fmt.Errorf("Failed getting the target NQN for subystem %q: %w", subsys.Name(), err) + } + + // Compare using contains, as targetQN may not be the entire NQN. + // For example, PowerFlex targetQN is a substring of the full NQN. + if strings.Contains(string(nqnBytes), targetQN) { + // Found matching session. + sessionID = strings.TrimPrefix(subsys.Name(), "nvme-subsys") + break + } + } + + if sessionID == "" { + // No matching session found. + return nil, nil + } + + session := &session{ + id: sessionID, + targetQN: targetQN, + } + + basePath := "/sys/class/nvme" + + // Retrieve the list of currently active (operational) NVMe controllers. + controllers, err := os.ReadDir(basePath) + if err != nil { + if errors.Is(err, fs.ErrNotExist) { + // No active connections for any session. + return session, nil + } + + return nil, fmt.Errorf("Failed getting a list of existing NVMe subsystems: %w", err) + } + + // Iterate over active NVMe devices and extract addresses from those + // that correspond to the targetQN. + for _, c := range controllers { + // Get device's target NQN. + nqnBytes, err := os.ReadFile(filepath.Join(basePath, c.Name(), "subsysnqn")) + if err != nil { + return nil, fmt.Errorf("Failed getting the target NQN for controller %q: %w", c.Name(), err) + } + + // Compare using contains, as targetQN may not be the entire NQN. + // For example, PowerFlex targetQN is a substring of the full NQN. + if !strings.Contains(string(nqnBytes), targetQN) { + // Subsystem does not belong to the targetQN. + continue + } + + // Read address file of an active NVMe connection. + filePath := filepath.Join(basePath, c.Name(), "address") + fileBytes, err := os.ReadFile(filePath) + if err != nil { + return nil, fmt.Errorf("Failed getting connection address of controller %q for target %q: %w", c.Name(), targetQN, err) + } + + // Extract the addresses from the file. + // The "address" file contains one line per connection, + // each in format "traddr=,trsvcid=,...". + for _, line := range strings.Split(string(fileBytes), "\n") { + parts := strings.Split(strings.TrimSpace(line), ",") + for _, part := range parts { + addr, ok := strings.CutPrefix(part, "traddr=") + if ok { + session.addresses = append(session.addresses, addr) + break + } + } + } + } + + return session, nil +} From b9044796ba17173e4ca33beee4a13e72b7568051 Mon Sep 17 00:00:00 2001 From: Din Music Date: Thu, 19 Dec 2024 09:50:16 +0000 Subject: [PATCH 05/14] lxd/storage/connectors: Add SDC connector Signed-off-by: Din Music --- lxd/storage/connectors/connector.go | 8 ++++ lxd/storage/connectors/connector_sdc.go | 64 +++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 lxd/storage/connectors/connector_sdc.go diff --git a/lxd/storage/connectors/connector.go b/lxd/storage/connectors/connector.go index 1b040e28d173..ac8ff4cf2388 100644 --- a/lxd/storage/connectors/connector.go +++ b/lxd/storage/connectors/connector.go @@ -13,6 +13,9 @@ const ( // TypeNVME represents an NVMe/TCP storage connector. TypeNVME string = "nvme" + + // TypeSDC represents Dell SDC storage connector. + TypeSDC string = "sdc" ) // session represents a connector session that is established with a target. @@ -56,6 +59,11 @@ func NewConnector(connectorType string, serverUUID string) (Connector, error) { common: common, }, nil + case TypeSDC: + return &connectorSDC{ + common: common, + }, nil + default: // Return common connector if the type is unknown. This removes // the need to check for nil or handle the error in the caller. diff --git a/lxd/storage/connectors/connector_sdc.go b/lxd/storage/connectors/connector_sdc.go new file mode 100644 index 000000000000..9393bfc9ac81 --- /dev/null +++ b/lxd/storage/connectors/connector_sdc.go @@ -0,0 +1,64 @@ +package connectors + +import ( + "context" + + "github.com/canonical/lxd/shared/revert" +) + +var _ Connector = &connectorSDC{} + +type connectorSDC struct { + common +} + +// Type returns the type of the connector. +func (c *connectorSDC) Type() string { + return TypeSDC +} + +// Version returns an empty string and no error. +func (c *connectorSDC) Version() (string, error) { + return "", nil +} + +// LoadModules returns true. SDC does not require any kernel modules to be loaded. +func (c *connectorSDC) LoadModules() error { + return nil +} + +// QualifiedName returns an empty string and no error. SDC has no qualified name. +func (c *connectorSDC) QualifiedName() (string, error) { + return "", nil +} + +// SessionID returns an empty string and no error, as connections are handled by SDC. +func (c *connectorSDC) SessionID(targetQN string) (string, error) { + return "", nil +} + +// Connect does nothing. Connections are fully handled by SDC. +func (c *connectorSDC) Connect(ctx context.Context, targetQN string, targetAddresses ...string) (revert.Hook, error) { + // Nothing to do. Connection is handled by Dell SDC. + return revert.New().Fail, nil +} + +// ConnectAll does nothing. Connections are fully handled by SDC. +func (c *connectorSDC) ConnectAll(ctx context.Context, targetAddr string) error { + // Nothing to do. Connection is handled by Dell SDC. + return nil +} + +// Disconnect does nothing. Connections are fully handled by SDC. +func (c *connectorSDC) Disconnect(targetQN string) error { + return nil +} + +// DisconnectAll does nothing. Connections are fully handled by SDC. +func (c *connectorSDC) DisconnectAll() error { + return nil +} + +func (c *connectorSDC) findSession(targetQN string) (*session, error) { + return nil, nil +} From 80dc79103aab1d437fd4882e6d60e4817cee16a3 Mon Sep 17 00:00:00 2001 From: Din Music Date: Thu, 19 Dec 2024 09:49:38 +0000 Subject: [PATCH 06/14] lxd/storage/connectors/connector: Get versions for supported connectors Signed-off-by: Din Music --- lxd/storage/connectors/connector.go | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/lxd/storage/connectors/connector.go b/lxd/storage/connectors/connector.go index ac8ff4cf2388..5225c4749469 100644 --- a/lxd/storage/connectors/connector.go +++ b/lxd/storage/connectors/connector.go @@ -70,3 +70,26 @@ func NewConnector(connectorType string, serverUUID string) (Connector, error) { return nil, fmt.Errorf("Unknown storage connector type %q", connectorType) } } + +// GetSupportedVersions returns the versions for the given connector types +// ignoring those that produce an error when version is being retrieved +// (e.g. due to a missing required tools). +func GetSupportedVersions(connectorTypes []string) []string { + versions := make([]string, 0, len(connectorTypes)) + + // Iterate over the provided connector types, and extract + // their versions. + for _, connectorType := range connectorTypes { + connector, err := NewConnector(connectorType, "") + if err != nil { + continue + } + + version, _ := connector.Version() + if version != "" { + versions = append(versions, version) + } + } + + return versions +} From 0184d9ff1b26f9280fd217a21fe574164d98429d Mon Sep 17 00:00:00 2001 From: Din Music Date: Wed, 18 Dec 2024 18:51:18 +0000 Subject: [PATCH 07/14] lxd/storage/drivers/utils: Helper function to resolve server name Signed-off-by: Din Music --- lxd/storage/drivers/utils.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/lxd/storage/drivers/utils.go b/lxd/storage/drivers/utils.go index c125c8438f7b..594bf4f6bf0e 100644 --- a/lxd/storage/drivers/utils.go +++ b/lxd/storage/drivers/utils.go @@ -896,3 +896,18 @@ func roundAbove(above, val int64) int64 { return rounded } + +// ResolveServerName returns the given server name if it is not "none". +// If the server name is "none", it retrieves and returns the server's hostname. +func ResolveServerName(serverName string) (string, error) { + if serverName != "none" { + return serverName, nil + } + + hostname, err := os.Hostname() + if err != nil { + return "", fmt.Errorf("Failed to get hostname: %w", err) + } + + return hostname, nil +} From 61ec719f6f3886ec2824d6a0a019324e50acd473 Mon Sep 17 00:00:00 2001 From: Din Music Date: Fri, 20 Dec 2024 18:18:29 +0000 Subject: [PATCH 08/14] lxd/storage/drivers/utils: Helper function to acquire storage connector lock Signed-off-by: Din Music --- lxd/storage/drivers/utils.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/lxd/storage/drivers/utils.go b/lxd/storage/drivers/utils.go index 594bf4f6bf0e..314c30dd29a6 100644 --- a/lxd/storage/drivers/utils.go +++ b/lxd/storage/drivers/utils.go @@ -16,6 +16,7 @@ import ( "golang.org/x/sys/unix" "github.com/canonical/lxd/lxd/idmap" + "github.com/canonical/lxd/lxd/locking" "github.com/canonical/lxd/lxd/operations" "github.com/canonical/lxd/lxd/storage/filesystem" "github.com/canonical/lxd/shared" @@ -911,3 +912,15 @@ func ResolveServerName(serverName string) (string, error) { return hostname, nil } + +// remoteVolumeMapLock acquires a lock used when mapping or unmapping remote +// storage volumes. This lock prevents conflicts between operations trying to +// associate or disassociate volumes with the LXD host. If the lock is +// successfully acquired, unlock function is returned. +func remoteVolumeMapLock(connectorName string, driverName string) (locking.UnlockFunc, error) { + l := logger.AddContext(logger.Ctx{"connector": connectorName, "driver": driverName}) + l.Debug("Acquiring lock for remote volume map") + defer l.Debug("Lock acquired for remote volume map") + + return locking.Lock(context.TODO(), fmt.Sprintf("RemoteVolumeMap_%s_%s", connectorName, driverName)) +} From 89dd43a062173f7d146e852fa4e3d24699fd4972 Mon Sep 17 00:00:00 2001 From: Din Music Date: Thu, 19 Dec 2024 15:54:17 +0000 Subject: [PATCH 09/14] lxd/storage/drivers/powerflex: Prevent changing powerflex.mode Signed-off-by: Din Music --- lxd/storage/drivers/driver_powerflex.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lxd/storage/drivers/driver_powerflex.go b/lxd/storage/drivers/driver_powerflex.go index b523302e41ad..4bcb82575ac8 100644 --- a/lxd/storage/drivers/driver_powerflex.go +++ b/lxd/storage/drivers/driver_powerflex.go @@ -280,6 +280,15 @@ func (d *powerflex) Validate(config map[string]string) error { return err } + newMode := config["powerflex.mode"] + oldMode := d.config["powerflex.mode"] + + // Ensure powerflex.mode cannot be changed to avoid leaving volume mappings + // and to prevent disturbing running instances. + if oldMode != "" && oldMode != newMode { + return fmt.Errorf("PowerFlex mode cannot be changed") + } + // Check if the selected PowerFlex mode is supported on this node. // Also when forming the storage pool on a LXD cluster, the mode // that got discovered on the creating machine needs to be validated From 2e6d5608e5aa51c85d9abb40a4a5f268aa6704cc Mon Sep 17 00:00:00 2001 From: Din Music Date: Thu, 19 Dec 2024 09:04:51 +0000 Subject: [PATCH 10/14] lxd/storage/drivers/powerflex: Use shared function to resolve server name Signed-off-by: Din Music --- lxd/storage/drivers/driver_powerflex_utils.go | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/lxd/storage/drivers/driver_powerflex_utils.go b/lxd/storage/drivers/driver_powerflex_utils.go index e047532d733e..261667eccacb 100644 --- a/lxd/storage/drivers/driver_powerflex_utils.go +++ b/lxd/storage/drivers/driver_powerflex_utils.go @@ -792,21 +792,6 @@ func (d *powerflex) getHostGUID() (string, error) { return d.sdcGUID, nil } -// getServerName returns the hostname of this host. -// It prefers the value from the daemons state in case LXD is clustered. -func (d *powerflex) getServerName() (string, error) { - if d.state.ServerName != "none" { - return d.state.ServerName, nil - } - - hostname, err := os.Hostname() - if err != nil { - return "", fmt.Errorf("Failed to get hostname: %w", err) - } - - return hostname, nil -} - // getVolumeType returns the selected provisioning type of the volume. // As a default it returns type thin. func (d *powerflex) getVolumeType(vol Volume) powerFlexVolumeType { @@ -837,7 +822,7 @@ func (d *powerflex) createNVMeHost() (string, revert.Hook, error) { return "", nil, err } - hostname, err := d.getServerName() + hostname, err := ResolveServerName(d.state.ServerName) if err != nil { return "", nil, err } From e4b9029f3c236e9fb283fd1e1878ca3093fc9c39 Mon Sep 17 00:00:00 2001 From: Din Music Date: Thu, 19 Dec 2024 09:50:56 +0000 Subject: [PATCH 11/14] lxd/storage/drivers/powerflex: Use connector for handling storage subsystem Signed-off-by: Din Music --- lxd/storage/drivers/driver_powerflex.go | 80 +++-- lxd/storage/drivers/driver_powerflex_utils.go | 296 +++++------------- 2 files changed, 138 insertions(+), 238 deletions(-) diff --git a/lxd/storage/drivers/driver_powerflex.go b/lxd/storage/drivers/driver_powerflex.go index 4bcb82575ac8..1c0482f701fe 100644 --- a/lxd/storage/drivers/driver_powerflex.go +++ b/lxd/storage/drivers/driver_powerflex.go @@ -8,6 +8,7 @@ import ( "github.com/canonical/lxd/lxd/migration" "github.com/canonical/lxd/lxd/operations" + "github.com/canonical/lxd/lxd/storage/connectors" "github.com/canonical/lxd/shared" "github.com/canonical/lxd/shared/api" "github.com/canonical/lxd/shared/validate" @@ -19,10 +20,10 @@ const powerFlexDefaultUser = "admin" // powerFlexDefaultSize represents the default PowerFlex volume size. const powerFlexDefaultSize = "8GiB" -const ( - powerFlexModeNVMe = "nvme" - powerFlexModeSDC = "sdc" -) +var powerflexSupportedConnectors = []string{ + connectors.TypeNVME, + connectors.TypeSDC, +} var powerFlexLoaded bool var powerFlexVersion string @@ -30,6 +31,10 @@ var powerFlexVersion string type powerflex struct { common + // Holds the low level connector for the PowerFlex driver. + // Use powerflex.connector() to retrieve the initialized connector. + storageConnector connectors.Connector + // Holds the low level HTTP client for the PowerFlex API. // Use powerflex.client() to retrieve the client struct. httpClient *powerFlexClient @@ -46,28 +51,37 @@ func (d *powerflex) load() error { return nil } - // Detect and record the version. - // The NVMe CLI is shipped with the snap. - out, err := shared.RunCommand("nvme", "version") - if err != nil { - return fmt.Errorf("Failed to get nvme-cli version: %w", err) - } - - fields := strings.Split(strings.TrimSpace(out), " ") - if strings.HasPrefix(out, "nvme version ") && len(fields) > 2 { - powerFlexVersion = fmt.Sprintf("%s (nvme-cli)", fields[2]) - } + versions := connectors.GetSupportedVersions(powerflexSupportedConnectors) + powerFlexVersion = strings.Join(versions, " / ") + powerFlexLoaded = true - // Load the NVMe/TCP kernel modules. + // Load the kernel modules of the respective connector. // Ignore if the modules cannot be loaded. - // Support for the NVMe/TCP mode is checked during pool creation. + // Support for a specific connector is checked during pool creation. // When a LXD host gets rebooted this ensures that the kernel modules are still loaded. - _ = d.loadNVMeModules() + connector, err := d.connector() + if err == nil { + _ = connector.LoadModules() + } - powerFlexLoaded = true return nil } +// connector retrieves an initialized storage connector based on the configured +// PowerFlex mode. The connector is cached in the driver struct. +func (d *powerflex) connector() (connectors.Connector, error) { + if d.storageConnector == nil { + connector, err := connectors.NewConnector(d.config["powerflex.mode"], d.state.ServerUUID) + if err != nil { + return nil, err + } + + d.storageConnector = connector + } + + return d.storageConnector, nil +} + // isRemote returns true indicating this driver uses remote storage. func (d *powerflex) isRemote() bool { return true @@ -102,10 +116,16 @@ func (d *powerflex) FillConfig() error { // First try if the NVMe/TCP kernel modules can be loaed. // Second try if the SDC kernel module is setup. if d.config["powerflex.mode"] == "" { - if d.loadNVMeModules() { - d.config["powerflex.mode"] = powerFlexModeNVMe + // Create temporary connector to check if NVMe/TCP kernel modules can be loaded. + nvmeConnector, err := connectors.NewConnector(connectors.TypeNVME, "") + if err != nil { + return err + } + + if nvmeConnector.LoadModules() == nil { + d.config["powerflex.mode"] = connectors.TypeNVME } else if goscaleio.DrvCfgIsSDCInstalled() { - d.config["powerflex.mode"] = powerFlexModeSDC + d.config["powerflex.mode"] = connectors.TypeSDC } } @@ -139,7 +159,7 @@ func (d *powerflex) Create() error { client := d.client() switch d.config["powerflex.mode"] { - case powerFlexModeNVMe: + case connectors.TypeNVME: // Discover one of the storage pools SDT services. if d.config["powerflex.sdt"] == "" { pool, err := d.resolvePool() @@ -163,7 +183,7 @@ func (d *powerflex) Create() error { d.config["powerflex.sdt"] = relations[0].IPList[0].IP } - case powerFlexModeSDC: + case connectors.TypeSDC: if d.config["powerflex.sdt"] != "" { return fmt.Errorf("The powerflex.sdt config key is specific to the NVMe/TCP mode") } @@ -295,8 +315,16 @@ func (d *powerflex) Validate(config map[string]string) error { // on the other cluster members too. This can be done here since Validate // gets executed on every cluster member when receiving the cluster // notification to finally create the pool. - if d.config["powerflex.mode"] == powerFlexModeNVMe && !d.loadNVMeModules() { - return fmt.Errorf("NVMe/TCP is not supported") + if newMode != "" { + connector, err := connectors.NewConnector(newMode, "") + if err != nil { + return fmt.Errorf("PowerFlex mode %q is not supported: %w", newMode, err) + } + + err = connector.LoadModules() + if err != nil { + return fmt.Errorf("PowerFlex mode %q is not supported due to missing kernel modules: %w", newMode, err) + } } return nil diff --git a/lxd/storage/drivers/driver_powerflex_utils.go b/lxd/storage/drivers/driver_powerflex_utils.go index 261667eccacb..cdb1541ad319 100644 --- a/lxd/storage/drivers/driver_powerflex_utils.go +++ b/lxd/storage/drivers/driver_powerflex_utils.go @@ -6,23 +6,17 @@ import ( "crypto/tls" "encoding/base64" "encoding/json" - "errors" "fmt" "io" "net/http" - "os" - "path/filepath" "strconv" "strings" "time" "github.com/dell/goscaleio" "github.com/google/uuid" - "golang.org/x/sys/unix" - "github.com/canonical/lxd/lxd/locking" - "github.com/canonical/lxd/lxd/resources" - "github.com/canonical/lxd/lxd/util" + "github.com/canonical/lxd/lxd/storage/connectors" "github.com/canonical/lxd/shared" "github.com/canonical/lxd/shared/api" "github.com/canonical/lxd/shared/revert" @@ -747,18 +741,6 @@ func (p *powerFlexClient) getHostVolumeMappings(hostID string) ([]powerFlexVolum return actualResponse, nil } -// loadNVMeModules loads the NVMe/TCP kernel modules. -// Returns true if the modules can be loaded. -func (d *powerflex) loadNVMeModules() bool { - err := util.LoadModule("nvme_fabrics") - if err != nil { - return false - } - - err = util.LoadModule("nvme_tcp") - return err == nil -} - // client returns the drivers PowerFlex client. // A new client gets created if it not yet exists. func (d *powerflex) client() *powerFlexClient { @@ -769,13 +751,6 @@ func (d *powerflex) client() *powerFlexClient { return d.httpClient } -// getHostNQN returns the unique NVMe nqn for the current host. -// A custom one is generated from the servers UUID since getting the nqn from /etc/nvme/hostnqn -// requires the nvme-cli package to be installed on the host. -func (d *powerflex) getHostNQN() string { - return "nqn.2014-08.org.nvmexpress:uuid:" + d.state.ServerUUID -} - // getHostGUID returns the SDC GUID. // The GUID is unique for a single host. // Cache the GUID as it never changes for a single host. @@ -810,13 +785,22 @@ func (d *powerflex) getVolumeType(vol Volume) powerFlexVolumeType { // createNVMeHost creates this NVMe host in PowerFlex. func (d *powerflex) createNVMeHost() (string, revert.Hook, error) { var hostID string - nqn := d.getHostNQN() + + connector, err := d.connector() + if err != nil { + return "", nil, err + } + + hostNQN, err := connector.QualifiedName() + if err != nil { + return "", nil, err + } revert := revert.New() defer revert.Fail() client := d.client() - host, err := client.getNVMeHostByNQN(nqn) + host, err := client.getNVMeHostByNQN(hostNQN) if err != nil { if !api.StatusErrorCheck(err, http.StatusNotFound) { return "", nil, err @@ -827,7 +811,7 @@ func (d *powerflex) createNVMeHost() (string, revert.Hook, error) { return "", nil, err } - hostID, err = client.createHost(hostname, nqn) + hostID, err = client.createHost(hostname, hostNQN) if err != nil { return "", nil, err } @@ -847,8 +831,18 @@ func (d *powerflex) createNVMeHost() (string, revert.Hook, error) { // deleteNVMeHost deletes this NVMe host in PowerFlex. func (d *powerflex) deleteNVMeHost() error { client := d.client() - nqn := d.getHostNQN() - host, err := client.getNVMeHostByNQN(nqn) + + connector, err := d.connector() + if err != nil { + return err + } + + hostNQN, err := connector.QualifiedName() + if err != nil { + return err + } + + host, err := client.getNVMeHostByNQN(hostNQN) if err != nil { // Skip the deletion if the host doesn't exist anymore. if api.StatusErrorCheck(err, http.StatusNotFound) { @@ -863,14 +857,19 @@ func (d *powerflex) deleteNVMeHost() error { // mapVolume maps the given volume onto this host. func (d *powerflex) mapVolume(vol Volume) (revert.Hook, error) { + var hostID string + reverter := revert.New() defer reverter.Fail() - var hostID string + connector, err := d.connector() + if err != nil { + return nil, err + } switch d.config["powerflex.mode"] { - case powerFlexModeNVMe: - unlock, err := locking.Lock(d.state.ShutdownCtx, "nvme") + case connectors.TypeNVME: + unlock, err := remoteVolumeMapLock(connector.Type(), "powerflex") if err != nil { return nil, err } @@ -884,7 +883,7 @@ func (d *powerflex) mapVolume(vol Volume) (revert.Hook, error) { } reverter.Add(cleanup) - case powerFlexModeSDC: + case connectors.TypeSDC: hostGUID, err := d.getHostGUID() if err != nil { return nil, err @@ -931,19 +930,19 @@ func (d *powerflex) mapVolume(vol Volume) (revert.Hook, error) { reverter.Add(func() { _ = client.deleteHostVolumeMapping(hostID, volumeID) }) } - if d.config["powerflex.mode"] == powerFlexModeNVMe { - // Connect to the NVMe/TCP subsystem. - // We have to connect after the first mapping was established. - // PowerFlex does not offer any discovery log entries until a volume gets mapped to the host. - // This action is idempotent. - cleanup, err := d.connectNVMeSubsys() - if err != nil { - return nil, err - } + targetAddr := d.config["powerflex.sdt"] - reverter.Add(cleanup) + // Connect to the storage subsystem. + // In case of NVMe/TCP, we have to connect after the first mapping was established, + // as PowerFlex does not offer any discovery log entries until a volume gets mapped + // to the host. + err = connector.ConnectAll(d.state.ShutdownCtx, targetAddr) + if err != nil { + return nil, err } + reverter.Add(func() { _ = connector.DisconnectAll() }) + cleanup := reverter.Clone().Fail reverter.Success() return cleanup, nil @@ -964,44 +963,6 @@ func (d *powerflex) getMappedDevPath(vol Volume, mapVolume bool) (string, revert revert.Add(cleanup) } - powerFlexVolumes := make(map[string]string) - - // discoverFunc has to be called in a loop with a set timeout to ensure - // all the necessary directories and devices can be discovered. - discoverFunc := func(volumeID string, diskPrefix string) error { - var diskPaths []string - - // If there are no other disks on the system by id, the directory might not even be there. - // Returns ENOENT in case the by-id/ directory does not exist. - diskPaths, err := resources.GetDisksByID(diskPrefix) - if err != nil { - return err - } - - for _, diskPath := range diskPaths { - // Skip the disk if it is only a partition of the actual PowerFlex volume. - if strings.Contains(diskPath, "-part") { - continue - } - - // Skip other volume's that don't match the PowerFlex volume's ID. - if !strings.Contains(diskPath, volumeID) { - continue - } - - // The actual device might not already be created. - // Returns ENOENT in case the device does not exist. - devPath, err := filepath.EvalSymlinks(diskPath) - if err != nil { - return err - } - - powerFlexVolumes[volumeID] = devPath - } - - return nil - } - volumeName, err := d.getVolumeName(vol) if err != nil { return "", nil, err @@ -1012,63 +973,43 @@ func (d *powerflex) getMappedDevPath(vol Volume, mapVolume bool) (string, revert return "", nil, err } - timeout := time.Now().Add(5 * time.Second) - // It might take a while to create the local disk. - // Retry until it can be found. - for { - if time.Now().After(timeout) { - return "", nil, fmt.Errorf("Timeout exceeded for PowerFlex volume discovery: %q", volumeName) - } - - var prefix string - switch d.config["powerflex.mode"] { - case powerFlexModeNVMe: - prefix = "nvme-eui." - case powerFlexModeSDC: - prefix = "emc-vol-" - } - - err := discoverFunc(powerFlexVolumeID, prefix) - if err != nil { - // Try again if on of the directories cannot be found. - if errors.Is(err, unix.ENOENT) { - continue - } - - return "", nil, err - } - - // Exit if the volume got discovered. - _, ok := powerFlexVolumes[powerFlexVolumeID] - if ok { - break - } - - // Exit if the volume wasn't explicitly mapped. - // Doing a retry would run into the timeout when the device isn't mapped. - if !mapVolume { - break - } + var prefix string + switch d.config["powerflex.mode"] { + case connectors.TypeNVME: + prefix = "nvme-eui." + case connectors.TypeSDC: + prefix = "emc-vol-" + } - time.Sleep(10 * time.Millisecond) + devicePathFilter := func(path string) bool { + return strings.Contains(path, powerFlexVolumeID) } - if len(powerFlexVolumes) == 0 { - return "", nil, fmt.Errorf("Failed to discover any PowerFlex volume") + var devicePath string + if mapVolume { + // Wait for the device path to appear as the volume has been just mapped to the host. + devicePath, err = connectors.WaitDiskDevicePath(d.state.ShutdownCtx, prefix, devicePathFilter) + } else { + // Get the the device path without waiting. + devicePath, err = connectors.GetDiskDevicePath(prefix, devicePathFilter) } - powerFlexVolumePath, ok := powerFlexVolumes[powerFlexVolumeID] - if !ok { - return "", nil, fmt.Errorf("PowerFlex volume not found: %q", volumeName) + if err != nil { + return "", nil, fmt.Errorf("Failed to locate device for volume %q: %w", vol.name, err) } cleanup := revert.Clone().Fail revert.Success() - return powerFlexVolumePath, cleanup, nil + return devicePath, cleanup, nil } // unmapVolume unmaps the given volume from this host. func (d *powerflex) unmapVolume(vol Volume) error { + connector, err := d.connector() + if err != nil { + return err + } + volumeName, err := d.getVolumeName(vol) if err != nil { return err @@ -1082,20 +1023,24 @@ func (d *powerflex) unmapVolume(vol Volume) error { var host *powerFlexSDC switch d.config["powerflex.mode"] { - case powerFlexModeNVMe: - nqn := d.getHostNQN() - host, err = client.getNVMeHostByNQN(nqn) + case connectors.TypeNVME: + hostNQN, err := connector.QualifiedName() + if err != nil { + return err + } + + host, err = client.getNVMeHostByNQN(hostNQN) if err != nil { return err } - unlock, err := locking.Lock(d.state.ShutdownCtx, "nvme") + unlock, err := remoteVolumeMapLock(connector.Type(), "powerflex") if err != nil { return err } defer unlock() - case powerFlexModeSDC: + case connectors.TypeSDC: hostGUID, err := d.getHostGUID() if err != nil { return err @@ -1113,20 +1058,18 @@ func (d *powerflex) unmapVolume(vol Volume) error { } // Wait until the volume has disappeared. - volumePath, _, _ := d.getMappedDevPath(vol, false) - if volumePath != "" { - ctx, cancel := context.WithTimeout(d.state.ShutdownCtx, 10*time.Second) - defer cancel() + ctx, cancel := context.WithTimeout(d.state.ShutdownCtx, 10*time.Second) + defer cancel() - if !waitGone(ctx, volumePath) { - return fmt.Errorf("Timeout whilst waiting for PowerFlex volume to disappear: %q", vol.name) - } + volumePath, _, _ := d.getMappedDevPath(vol, false) + if volumePath != "" && !connectors.WaitDiskDeviceGone(ctx, volumePath) { + return fmt.Errorf("Timeout whilst waiting for PowerFlex volume to disappear: %q", vol.name) } // In case of SDC the driver doesn't manage the underlying connection to PowerFlex. // Therefore if this was the last volume being unmapped from this system // LXD will not try to cleanup the connection. - if d.config["powerflex.mode"] == powerFlexModeNVMe { + if d.config["powerflex.mode"] == connectors.TypeNVME { mappings, err := client.getHostVolumeMappings(host.ID) if err != nil { return err @@ -1135,7 +1078,7 @@ func (d *powerflex) unmapVolume(vol Volume) error { if len(mappings) == 0 { // Disconnect from the NVMe subsystem. // Do this first before removing the host from PowerFlex. - err := d.disconnectNVMeSubsys() + err = connector.DisconnectAll() if err != nil { return err } @@ -1152,77 +1095,6 @@ func (d *powerflex) unmapVolume(vol Volume) error { return nil } -// connectNVMeSubsys connects this host to the NVMe subsystem configured in the storage pool. -// The connection can only be established after the first volume is mapped to this host. -// The operation is idempotent and returns nil if already connected to the subsystem. -func (d *powerflex) connectNVMeSubsys() (revert.Hook, error) { - basePath := "/sys/devices/virtual/nvme-subsystem" - - // Retrieve list of existing NVMe subsystems on this host. - directories, err := os.ReadDir(basePath) - if err != nil { - return nil, fmt.Errorf("Failed getting a list of NVMe subsystems: %w", err) - } - - revert := revert.New() - defer revert.Fail() - - pool, err := d.resolvePool() - if err != nil { - return nil, err - } - - domain, err := d.client().getProtectionDomain(pool.ProtectionDomainID) - if err != nil { - return nil, err - } - - for _, directory := range directories { - subsystemName := directory.Name() - - // Get the subsystem's NQN. - nqnBytes, err := os.ReadFile(filepath.Join(basePath, subsystemName, "subsysnqn")) - if err != nil { - return nil, fmt.Errorf("Failed getting the NQN of subystem %q: %w", subsystemName, err) - } - - if strings.Contains(string(nqnBytes), domain.SystemID) { - cleanup := revert.Clone().Fail - revert.Success() - - // Already connected to the NVMe subsystem for the respective PowerFlex system. - return cleanup, nil - } - } - - nqn := d.getHostNQN() - serverUUID := d.state.ServerUUID - _, stderr, err := shared.RunCommandSplit(d.state.ShutdownCtx, nil, nil, "nvme", "connect-all", "-t", "tcp", "-a", d.config["powerflex.sdt"], "-q", nqn, "-I", serverUUID) - if err != nil { - return nil, fmt.Errorf("Failed nvme connect-all: %w", err) - } - - if stderr != "" { - return nil, fmt.Errorf("Failed connecting to PowerFlex NVMe/TCP subsystem: %s", stderr) - } - - revert.Add(func() { _ = d.disconnectNVMeSubsys() }) - - cleanup := revert.Clone().Fail - revert.Success() - return cleanup, nil -} - -// disconnectNVMeSubsys disconnects this host from the NVMe subsystem. -func (d *powerflex) disconnectNVMeSubsys() error { - _, err := shared.RunCommand("nvme", "disconnect-all") - if err != nil { - return fmt.Errorf("Failed disconnecting from PowerFlex NVMe/TCP subsystem: %w", err) - } - - return nil -} - // resolvePool looks up the selected storage pool. // If only the pool is provided, it's expected to be the ID of the pool. // In case both pool and domain are set, the pool will get looked up From 6e2e37f1711c958ed4bbc322bd6cb33da37f4031 Mon Sep 17 00:00:00 2001 From: Din Music Date: Thu, 16 Jan 2025 15:22:42 +0000 Subject: [PATCH 12/14] lxd/storage/drivers/powerflex: Ensure mapVolume returns safe reverter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the mapVolume returned a reverter that disconnected NVMe session in case of an error. Since mapVolume acquires a lock, calling disconnect outside of the mapVolume or unmapVolume functions may result in a race condition because another operation could establish a connection before the reverter is called. In such case, the reverter would carelessly just terminate the newly estabished connection. Therefore, the outerReverter ensures that instead of disconnecting from the volume the unmapVolume is called, which acquires the lock and ensures the disconnect is called only if the session is not in used. Co-authored-by: Julian Pelizäus Co-authored-by: Din Music Signed-off-by: Din Music --- lxd/storage/drivers/driver_powerflex_utils.go | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/lxd/storage/drivers/driver_powerflex_utils.go b/lxd/storage/drivers/driver_powerflex_utils.go index cdb1541ad319..57d514133652 100644 --- a/lxd/storage/drivers/driver_powerflex_utils.go +++ b/lxd/storage/drivers/driver_powerflex_utils.go @@ -941,11 +941,18 @@ func (d *powerflex) mapVolume(vol Volume) (revert.Hook, error) { return nil, err } - reverter.Add(func() { _ = connector.DisconnectAll() }) + // Reverting mapping or connection outside mapVolume function + // could conflict with other ongoing operations as lock will + // already be released. Therefore, use unmapVolume instead + // because it ensures the lock is acquired and accounts for + // an existing session before unmapping a volume. + outerReverter := revert.New() + if !mapped { + outerReverter.Add(func() { _ = d.unmapVolume(vol) }) + } - cleanup := reverter.Clone().Fail reverter.Success() - return cleanup, nil + return outerReverter.Fail, nil } // getMappedDevPath returns the local device path for the given volume. From c5b9df79f97ffca674b051c960cc65857f4ba617 Mon Sep 17 00:00:00 2001 From: Din Music Date: Fri, 17 Jan 2025 12:19:31 +0000 Subject: [PATCH 13/14] lxd/storage/drivers/powerflex: Pre-allocate slices (linter) Signed-off-by: Din Music --- lxd/storage/drivers/driver_powerflex_volumes.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lxd/storage/drivers/driver_powerflex_volumes.go b/lxd/storage/drivers/driver_powerflex_volumes.go index 2970f46e4059..674fa5db7763 100644 --- a/lxd/storage/drivers/driver_powerflex_volumes.go +++ b/lxd/storage/drivers/driver_powerflex_volumes.go @@ -241,7 +241,7 @@ func (d *powerflex) CreateVolumeFromCopy(vol VolumeCopy, srcVol VolumeCopy, allo return nil } - var srcVolumeSnapshots []string + srcVolumeSnapshots := make([]string, 0, len(vol.Snapshots)) for _, snapshot := range vol.Snapshots { _, snapshotName, _ := api.GetParentAndSnapshotName(snapshot.name) srcVolumeSnapshots = append(srcVolumeSnapshots, snapshotName) @@ -984,7 +984,7 @@ func (d *powerflex) VolumeSnapshots(vol Volume, op *operations.Operation) ([]str return nil, err } - var snapshotNames []string + snapshotNames := make([]string, 0, len(volumeSnapshots)) for _, snapshot := range volumeSnapshots { snapshotNames = append(snapshotNames, snapshot.Name) } From 343edef6fcc36b5dce0680af859a3857806ccc1e Mon Sep 17 00:00:00 2001 From: Din Music Date: Fri, 20 Dec 2024 14:47:00 +0000 Subject: [PATCH 14/14] lxd/storage/drivers/utils: Remove no longer used waitGone function (linter) Signed-off-by: Din Music --- lxd/storage/drivers/utils.go | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/lxd/storage/drivers/utils.go b/lxd/storage/drivers/utils.go index 314c30dd29a6..ac8337370c17 100644 --- a/lxd/storage/drivers/utils.go +++ b/lxd/storage/drivers/utils.go @@ -220,23 +220,6 @@ func tryExists(ctx context.Context, path string) bool { } } -// waitGone waits for a file to not exist anymore or the context being cancelled. -// The probe happens at intervals of 500 milliseconds. -func waitGone(ctx context.Context, path string) bool { - for { - select { - case <-ctx.Done(): - return false - default: - if !shared.PathExists(path) { - return true - } - } - - time.Sleep(500 * time.Millisecond) - } -} - // fsUUID returns the filesystem UUID for the given block path. // error is returned if the given block device exists but has no UUID. func fsUUID(path string) (string, error) {