cli: add set-identity command

firedancer-io · Feb 13, 2025 · 4deaf03 · 4deaf03
1 parent fbe571f
commit 4deaf03
Show file tree

Hide file tree

Showing 37 changed files with 1,262 additions and 194 deletions.
diff --git a/agave b/agave
diff --git a/book/api/cli.md b/book/api/cli.md
@@ -10,22 +10,22 @@ but it is suggested to run it as `sudo`. The command writes an
 abbreviated log output to `stderr` and nothing will be written to
 `stdout`.
 
-| Arguments | Description |
-|----------|-------------|
-| `--config` | Path to a configuration TOML file to run the validator with |
+| Arguments         | Description |
+|-------------------|-------------|
+| `--config <path>` | Path to a configuration TOML file to run the validator with |
 
 ::: details Capabilities
 
-| Capability | Reason |
-|------------|--------|
-| `CAP_NET_RAW` | call `socket(2)` to bind to a raw socket for use by XDP |
-| `CAP_SYS_ADMIN` | call `bpf(2)` with the `BPF_OBJ_GET` command to initialize XDP |
-| `CAP_SYS_ADMIN` | call `unshare(2)` with `CLONE_NEWUSER` to sandbox the process in a user namespace. Only required on kernels which restrict unprivileged user namespaces |
-| `CAP_SETUID` | call `setresuid(2)` to switch uid to the sandbox user. Not required if the UID is already the same as the sandbox UID |
-| `CAP_SETGID` | call `setresgid(2)` to switch gid to the sandbox user. Not required if the GID is already the same as the sandbox GID |
-| `CAP_SYS_RESOURCE` | call `rlimit(2)` to increase `RLIMIT_MEMLOCK` so all memory can be locked with `mlock(2)`. Not required if the process already has a high enough limit |
-| `CAP_SYS_RESOURCE` | call `setpriority(2)` to increase thread priorities. Not required if the process already has a nice value of -19 |
-| `CAP_SYS_RESOURCE` | call `rlimit(2)  to increase `RLIMIT_NOFILE` to allow more open files for Agave. Not required if the resource limit is already high enough |
+| Capability             | Reason |
+|------------------------|--------|
+| `CAP_NET_RAW`          | call `socket(2)` to bind to a raw socket for use by XDP |
+| `CAP_SYS_ADMIN`        | call `bpf(2)` with the `BPF_OBJ_GET` command to initialize XDP |
+| `CAP_SYS_ADMIN`        | call `unshare(2)` with `CLONE_NEWUSER` to sandbox the process in a user namespace. Only required on kernels which restrict unprivileged user namespaces |
+| `CAP_SETUID`           | call `setresuid(2)` to switch uid to the sandbox user. Not required if the UID is already the same as the sandbox UID |
+| `CAP_SETGID`           | call `setresgid(2)` to switch gid to the sandbox user. Not required if the GID is already the same as the sandbox GID |
+| `CAP_SYS_RESOURCE`     | call `rlimit(2)` to increase `RLIMIT_MEMLOCK` so all memory can be locked with `mlock(2)`. Not required if the process already has a high enough limit |
+| `CAP_SYS_RESOURCE`     | call `setpriority(2)` to increase thread priorities. Not required if the process already has a nice value of -19 |
+| `CAP_SYS_RESOURCE`     | call `rlimit(2)` to increase `RLIMIT_NOFILE` to allow more open files for Agave. Not required if the resource limit is already high enough |
 | `CAP_NET_BIND_SERVICE` | call `bind(2)` to bind to a privileged port for serving metrics. Only required if the bind port is below 1024 |
 
 :::
@@ -39,9 +39,9 @@ issues. The monitor takes over the controlling terminal and refreshes it
 many times a second with up to date information. You can exit the
 monitor by sending Ctrl+C or `SIGINT`.
 
-| Arguments | Description |
-|----------|-------------|
-| `--config` | Path to a configuation TOML file to run the monitor with. This must be the same configuration file the validator was started with |
+| Arguments         | Description |
+|-------------------|-------------|
+| `--config <path>` | Path to a configuration TOML file to run the monitor with. This must be the same configuration file the validator was started with |
 
 ::: details Capabilities
 
@@ -71,6 +71,10 @@ following stages to each configure command:
     device.
  - `ethtool-loopback` Disables UDP segmentation on the loopback device.
 
+| Arguments         | Description |
+|-------------------|-------------|
+| `--config <path>` | Path to a configuration TOML file to configure the validator with. This must be the same configuration file the validator will be started with |
+
 ::: code-group
 
 ```toml [config.toml]
@@ -92,12 +96,12 @@ and configure the number of combined channels on the network device.
 
 ::: details Capabilities
 
-| Capability | Reason |
-|------------|--------|
-| `root` | increase `/proc/sys/vm/nr_hugepages` and mount hugetblfs filesystems. Only applies for the `hugetlbfs` stage |
-| `root` | increase network device channels with `ethtool --set-channels`. Only applies for the `ethtool-channels` stage |
-| `root` | disable network device generic-receive-offload (gro) with `ethtool --offload IFACE generic-receive-offload off`. Only applies for the `ethtool-gro` stage |
-| `root` | disable network device tx-udp-segmentation with `ethtool --offload lo tx-udp-segmentation off`. Only applies for the `ethtool-loopback` stage |
+| Capability      | Reason |
+|-----------------|--------|
+| `root`          | increase `/proc/sys/vm/nr_hugepages` and mount hugetlbfs filesystems. Only applies for the `hugetlbfs` stage |
+| `root`          | increase network device channels with `ethtool --set-channels`. Only applies for the `ethtool-channels` stage |
+| `root`          | disable network device generic-receive-offload (gro) with `ethtool --offload IFACE generic-receive-offload off`. Only applies for the `ethtool-gro` stage |
+| `root`          | disable network device tx-udp-segmentation with `ethtool --offload lo tx-udp-segmentation off`. Only applies for the `ethtool-loopback` stage |
 | `CAP_SYS_ADMIN` | set kernel parameters in `/proc/sys`. Only applies for the `sysctl` stage |
 
 :::
@@ -122,7 +126,7 @@ back as we no longer know what the original value was.
 
 | Capability | Reason |
 |------------|--------|
-| `root` | remove directories from `/mnt`, unmount hugetlbfs. Only applies for the `hugetlbfs` stage |
+| `root`     | remove directories from `/mnt`, unmount hugetlbfs. Only applies for the `hugetlbfs` stage |
 
 :::
 
@@ -137,6 +141,46 @@ $ fdctl version
 0.101.11814
 ```
 
+## `set-identity`
+Changes the identity key of a running validator. The `<keypair>`
+argument is required and must be the path to an Agave style
+`identity.json` keypair file. If the path is specified as `-` the key
+will instead be read from `stdin`.
+
+It is not generally safe to call `set-identity`, as another validator
+might be running with the same identity, and if they both produce a
+block or vote concurrently, the validator may violate consensus and be
+subject to (future) slashing.
+
+Best practice requires copying the `tower.bin` file from the prior
+to the new validator, to ensure that vote lockouts are respected.
+
+The validator will not change identity in the middle of a leader slot,
+and will wait until any in-progress leader slot completes before
+switching to the new identity. It is safe to call during or near a
+leader slot because of this wait.
+
+The command exits successfully (with an exit code of 0) if the identity
+key was changed, otherwise it will fail and print diagnostic messages to
+`stderr`. Reasons for failure include the validator being unable to open
+or load the tower when `--require-tower` is specified, or being unable
+to load or verify the provided identity key.
+
+Currently due to implementation limitations, the key can be partially
+changed if the `set-identity` command is cancelled (for example with
+Ctrl+C) while running. The next call to `set-identity` might need to
+provide the `--force` argument to succeed if this occurs, to reset this
+partial state and proceed with setting a new key.
+
+| Arguments         | Description |
+|-------------------|-------------|
+| `<keypair>`       | Path to an `identity.json` keypair file, or `-` to read the JSON formatted key from `stdin` |
+| `--config <path>` | Path to a configuration TOML file of the validator to change identity for. This must be the same configuration file the validator was started with |
+| `--require-tower` | If specified, refuse to set the validator identity if saved tower state is not found |
+| `--force`         | If a `set-identity` operation is abandoned part way through, you will need to specify `--force` to reset the validator key state when trying again |
+
+<<< @/snippets/commands/set-identity.ansi
+
 ## `keys`
 
 ### `keys pubkey <PATH>`
@@ -155,8 +199,8 @@ Creates a new keypair from the kernel random number generator and writes
 it to the identity key path, or vote key path. The key path is retrieved
 from the configuration TOML file
 
-| Arguments | Description |
-|----------|-------------|
+| Arguments  | Description |
+|------------|-------------|
 | `--config` | Path to a configuration TOML file which determines where the key is written. Either `[consensus.identity_path]` or `[consensus.vote_account_path]` for `identity` or `vote` arguments respectively |
 
 ::: code-group

diff --git a/book/api/websocket.md b/book/api/websocket.md
@@ -190,13 +190,25 @@ snapshot.
 The commit hash used to build the validator.
 
 #### `summary.identity_key`
-| frequency | type     | example        |
-|-----------|----------|----------------|
-| *Once*    | `string` | `"Fe4StcZSQ228dKK2hni7aCP7ZprNhj8QKWzFe5usGFYF"` |
+| frequency       | type     | example        |
+|-----------------|----------|----------------|
+| *Once* + *Live* | `string` | `"Fe4StcZSQ228dKK2hni7aCP7ZprNhj8QKWzFe5usGFYF"` |
 
 The public identity key assigned to the running validator, encoded in
-base58. Firedancer does not support changing the identity key of the
-validator while it is running and this value does not change.
+base58. Firedancer supports changing the identity key of the validator
+while it is running through a `set-identity` command, and if this
+happens a new `identity_key` will be published.
+
+Summary information in this API is tied to the validator instance and
+not the identity key, for example, the skip rate is the skip rate of all
+blocks produced by this validator, regardless of what identity key they
+were published with. The `mine` field of blocks similarly indicates if
+this validator published the block, not whether it had the same identity
+key as the validator has now.
+
+Because of this, when changing identity key, no other information will
+be republished. It will simply continue counting for blocks published
+with the new key.
 
 #### `summary.vote_state`
 | frequency       | type     | example  |
@@ -428,8 +440,13 @@ The skip rate of an epoch is the ratio of `skipped_slots/total_slots`
 for our leader slots in that epoch.  The skip rate is only known for
 slots that have happened since the validator was started, and we do
 not incorporate slots from before boot, as we cannot know if they were
-skipped or not.  If no slot has happened since boot, i.e.
-total_slots==0, skip_rate is 0.
+skipped or not.  If this validator has not had any leader slots since it
+was booted, the skip rate reported will be zero.
+
+The skip rate is specific to this running validator, and not any
+given identity key. If the validator identity is changed with
+`set-identity`, the skip rate will remain the same at first, and then
+start incorporating skips for the new identity key.
 
 **`SkipRate`**
 | Field     | Type     | Description |
@@ -924,7 +941,13 @@ in which case it would be both `incomplete` and `skipped`.
 Slots are either `mine` (created by this validator), or not, in which
 case we are replaying a block from another validator. Slots that are
 `mine` contain additional information about our performance creating the
-block for that slot.
+block for that slot. The `mine` field means that this specific validator
+published the block. It might happen that a block is published by a
+leader with our identity key, but not this specific validator (for
+example, if the block was published by another computer, and then this
+validator took over the identity key with a `set-identity` operation)
+in which case the `mine` field will be set to false, even though the
+block has our key.
 
 Some information is only known for blocks that have been replayed
 successfully (reached the `completed` state), for example the number of
@@ -941,7 +964,7 @@ initially replay one but the cluster votes on the other one.
 | Field      | Type      | Description |
 |------------|-----------|-------------|
 | slot       | `number`  | Identity of the slot, counting up from zero for the first slot in the chain |
-| mine       | `boolean` | True if this validator was the leader for this slot. This will never change for a slot once it has been published, and will be aligned with the epoch information |
+| mine       | `boolean` | True if this validator was the leader for this slot. This will never change for a slot once it has been published, and will be aligned with the epoch information, except in cases where the validator identity is changed while the validator is running |
 | skipped    | `boolean` | True if the slot was skipped. The skipped state is the state in the currently active fork of the validator. The skipped state can change if the validator switches active fork |
 | duration_nanos | `number\|null` | A duration in nanoseconds of how long it took us to receive and replay the slot. This is the time as measured since we completed replay of the parent slot locally on this validator, til the time we replayed this slot locally on this validator |
 | completed_time_nanos | `number\|null` |  UNIX timestamp in nanoseconds of when this validator finished replaying the slot locally. If the slot was skipped, this may be `null` which indicates the block for this slot did not finish replaying on this validator. In some cases, a skipped slot will still have a completed time, if we received the data for the block, replayed it, and then decided to use a different fork |
@@ -955,9 +978,9 @@ initially replay one but the cluster votes on the other one.
 | tips            | `number\|null`     | Total amount of tips that this slot collects in lamports, across all block builders, after any commission to the block builder is subtracted |
 
 #### `slot.skipped_history`
-| frequency | type | example |
-|-----------|------|---------|
- *Once*     | `number[]` | `[286576808, 286576809, 286576810, 286576811, 286625025, 286625026, 286625027]` |
+| frequency      | type       | example |
+|----------------|------------|---------|
+| *Once* + *Live* | `number[]` | `[286576808, 286576809, 286576810, 286576811, 286625025, 286625026, 286625027]` |
 
 A list of all of the recent leader slots of the validator which were
 skipped. Only two epochs of leader slots are tracked, and skips prior
@@ -966,6 +989,10 @@ to this are not retrieved.
 The skipped slots include unrooted and unconfirmed slots of ours which
 are skipped on the currently active fork.
 
+If the validator identity is changed with a `set-identity` operation,
+the skipped history is republished with a list of skipped slots for the
+new validator identity.
+
 #### `slot.update`
 | frequency   | type          | example |
 |-------------|---------------|---------|

diff --git a/book/snippets/commands/set-identity.ansi b/book/snippets/commands/set-identity.ansi
@@ -0,0 +1,2 @@
+$ sudo fdctl set-identity ~/keys/validator-keypair.json
+[32mNOTICE [0m Validator identity key switched to `4UCZB7zfquCVN7GafWETFVLTceNH3nm2mndyHeDuSggC`
diff --git a/plugin/bundle/src/auth.rs b/plugin/bundle/src/auth.rs
@@ -10,7 +10,9 @@ use {
     log::*,
     std::{ffi::CString, time::Duration},
     tokio::time::timeout,
+    tokio::sync::watch,
     tonic::{service::Interceptor, transport::Channel, Code, Request, Status},
+    std::ops::Deref,
 };
 
 /// Interceptor responsible for adding the access token to request headers.
@@ -47,16 +49,18 @@ extern "C" {
 pub async fn generate_auth_tokens(
     auth_service_client: &mut AuthServiceClient<Channel>,
     // used to sign challenges
-    identity_pubkey: &[u8; 32],
+    identity_pubkey: &mut watch::Receiver<[u8; 32]>,
 ) -> crate::Result<(
     Token, /* access_token */
     Token, /* refresh_token */
 )> {
+    let pubkey = identity_pubkey.borrow_and_update().deref().to_vec();
+
     debug!("generate_auth_challenge");
     let challenge_response = auth_service_client
         .generate_auth_challenge(GenerateAuthChallengeRequest {
             role: Role::Validator as i32,
-            pubkey: identity_pubkey.to_vec(),
+            pubkey: pubkey.clone(),
         })
         .await
         .map_err(|e: Status| {
@@ -80,7 +84,7 @@ pub async fn generate_auth_tokens(
 
     let formatted_challenge = format!(
         "{}-{}",
-        bs58::encode(identity_pubkey).into_string(),
+        bs58::encode(pubkey.clone()).into_string(),
         challenge,
     );
 
@@ -93,7 +97,7 @@ pub async fn generate_auth_tokens(
     let auth_tokens = auth_service_client
         .generate_auth_tokens(GenerateAuthTokensRequest {
             challenge: formatted_challenge,
-            client_pubkey: identity_pubkey.to_vec(),
+            client_pubkey: pubkey,
             signed_challenge: Vec::from(signed_challenge),
         })
         .await
@@ -108,7 +112,7 @@ pub async fn generate_auth_tokens(
 
 /// Tries to refresh the access token or run full-reauth if needed.
 pub async fn maybe_refresh_auth_tokens(
-    pubkey: &[u8; 32],
+    pubkey: &mut watch::Receiver<[u8; 32]>,
     auth_service_client: &mut AuthServiceClient<Channel>,
     access_token: &Token,
     refresh_token: &Token,
@@ -133,7 +137,7 @@ pub async fn maybe_refresh_auth_tokens(
 
     let should_refresh_access =
         access_token_expiry.checked_sub(now).unwrap_or_default() <= refresh_within_s;
-    let should_generate_new_tokens =
+    let should_generate_new_tokens = pubkey.has_changed().unwrap() ||
         refresh_token_expiry.checked_sub(now).unwrap_or_default() <= refresh_within_s;
 
     if should_generate_new_tokens {
+9 −0		cli/src/main.rs
+4 −4		core/src/banking_stage/committer.rs
+9 −0		ledger-tool/src/main.rs
+32 −0		validator/src/admin_rpc_service.rs
+5 −0		validator/src/main.rs
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		$ sudo fdctl set-identity ~/keys/validator-keypair.json
		[32mNOTICE [0m Validator identity key switched to `4UCZB7zfquCVN7GafWETFVLTceNH3nm2mndyHeDuSggC`