diff --git a/K8S.md b/K8S.md index a916625df4..83eadfc5c7 100644 --- a/K8S.md +++ b/K8S.md @@ -48,7 +48,7 @@ make cluster-up # Attach to node01 console docker exec -it ${KUBEVIRT_PROVIDER}-node01 screen /dev/pts/0 ``` -Use `vagrant:vagrant` to login. +Use `vagrant:vagrant` on x86 or `cloud-user:cloud-user` on s390x to log in. Note: it is sometimes `/dev/pts/1` or `/dev/pts/2`, try them in case you don't get a prompt. Make sure you don't leave open screens, else the next screen will be messed up. diff --git a/KUBEVIRTCI_LOCAL_TESTING.md b/KUBEVIRTCI_LOCAL_TESTING.md index e78c33adec..7cdea0d5a7 100644 --- a/KUBEVIRTCI_LOCAL_TESTING.md +++ b/KUBEVIRTCI_LOCAL_TESTING.md @@ -21,7 +21,7 @@ cd $KUBEVIRTCI_DIR ```bash # Build a provider. This includes starting it with cluster-up for verification and shutting it down for cleanup. -(cd cluster-provision/k8s/1.27; ../provision.sh) +(cd cluster-provision/k8s/1.28; ../provision.sh) ``` Note: @@ -34,7 +34,7 @@ please use `export BYPASS_PMAN_CHANGE_CHECK=true` to bypass provision-manager ch # set local provision test flag (mandatory) export KUBEVIRTCI_PROVISION_CHECK=1 ``` - +This flag sets the container registry to `quay.io` and the container suffix to `:latest`. If `KUBEVIRTCI_PROVISION_CHECK` is not used, you can set `KUBEVIRTCI_CONTAINER_REGISTRY` (default: `quay.io`), `KUBEVIRTCI_CONTAINER_ORG` (default: `kubevirtci`) and `KUBEVIRTCI_CONTAINER_SUFFIX` (default: according gocli tag), in order to use a custom image. 
@@ -48,7 +48,7 @@ export KUBEVIRTCI_GOCLI_CONTAINER=quay.io/kubevirtci/gocli:latest ### start cluster ```bash -export KUBEVIRT_PROVIDER=k8s-1.30 +export KUBEVIRT_PROVIDER=k8s-1.28 export KUBECONFIG=$(./cluster-up/kubeconfig.sh) export KUBEVIRT_NUM_NODES=2 @@ -59,7 +59,7 @@ make cluster-up #### start cluster with prometheus, alertmanager and grafana To enable prometheus, please also export the following variables before running `make cluster-up`: ```bash -export KUBEVIRT_PROVIDER=k8s-1.30 +export KUBEVIRT_PROVIDER=k8s-1.28 export KUBEVIRT_DEPLOY_PROMETHEUS=true export KUBEVIRT_DEPLOY_PROMETHEUS_ALERTMANAGER=true export KUBEVIRT_DEPLOY_GRAFANA=true @@ -134,12 +134,16 @@ For that we have phased mode. Usage: export the required mode, i.e `export PHASES=linux` or `export PHASES=k8s` and then run the provision. the full flow will be: -`export PHASES=linux; (cd cluster-provision/k8s/1.21; ../provision.sh)` -`export PHASES=k8s; (cd cluster-provision/k8s/1.21; ../provision.sh)` +`export PHASES=linux; (cd cluster-provision/k8s/1.28; ../provision.sh)` +`export PHASES=k8s; (cd cluster-provision/k8s/1.28; ../provision.sh)` Run the `k8s` step as much as needed. It reuses the intermediate image that was created by the `linux` phase. +Note: +1. By default, when you run the `k8s` phase alone, it uses the centos9 image specified in cluster-provision/k8s/base-image, not the one built locally in the `linux` phase. To make the `k8s` phase use the locally built centos9 image, update cluster-provision/k8s/base-image with the locally built image name and tag (default: quay.io/kubevirtci/centos9:latest). +2. If you run both phases together (`linux,k8s`), the intermediate container image produced by the `linux` phase is not saved. To obtain the centos9 image required for the `k8s` phase, you have to run the `linux` phase alone. 
+ Once you are done, either check the cluster manually, or use: -`export PHASES=k8s; export CHECK_CLUSTER=true; (cd cluster-provision/k8s/1.21; ../provision.sh)` +`export PHASES=k8s; export CHECK_CLUSTER=true; (cd cluster-provision/k8s/1.28; ../provision.sh)` ### provision without pre-pulling images diff --git a/cluster-provision/centos9/Dockerfile b/cluster-provision/centos9/Dockerfile index 9b51767ae7..7ce17a4c5f 100644 --- a/cluster-provision/centos9/Dockerfile +++ b/cluster-provision/centos9/Dockerfile @@ -1,33 +1,56 @@ +FROM quay.io/fedora/fedora:39 AS base -FROM quay.io/kubevirtci/fedora@sha256:e3a6087f62f288571db14defb7e0e10ad7fe6f973f567b0488d3aac5e927035a +RUN dnf -y install jq iptables iproute dnsmasq qemu socat openssh-clients screen bind-utils tcpdump iputils libguestfs-tools-c && dnf clean all -ARG centos_version +FROM base AS imageartifactdownload + +ARG BUILDARCH -RUN dnf -y install jq iptables iproute dnsmasq qemu openssh-clients screen bind-utils tcpdump iputils && dnf clean all +ARG centos_version WORKDIR / -COPY vagrant.key /vagrant.key +RUN echo "Centos9 version $centos_version" -RUN chmod 700 vagrant.key +COPY scripts/download_box.sh / -ENV DOCKERIZE_VERSION v0.6.1 +RUN if test "$BUILDARCH" != "s390x"; then \ + /download_box.sh https://cloud.centos.org/centos/9-stream/x86_64/images/CentOS-Stream-Vagrant-9-$centos_version.x86_64.vagrant-libvirt.box && \ + curl -L -o /initramfs-amd64.img http://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/images/pxeboot/initrd.img && \ + curl -L -o /vmlinuz-amd64 http://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/images/pxeboot/vmlinuz; \ + else \ + /download_box.sh https://cloud.centos.org/centos/9-stream/s390x/images/CentOS-Stream-GenericCloud-9-$centos_version.s390x.qcow2 && \ + # Access virtual machine disk images directly by using LIBGUESTFS_BACKEND=direct, instead of libvirt + export LIBGUESTFS_BACKEND=direct && \ + guestfish --ro --add box.qcow2 --mount /dev/sda1:/ ls /boot/ | grep -E 
'^vmlinuz-|^initramfs-' | xargs -I {} guestfish --ro --add box.qcow2 -i copy-out /boot/{} / ; \ + fi -RUN curl -LO https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \ - && tar -xzvf dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \ - && rm dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz \ - && chmod u+x dockerize \ - && mv dockerize /usr/local/bin/ -COPY scripts/download_box.sh / +FROM base AS nodecontainer -RUN echo "Centos9 version $centos_version" +ARG BUILDARCH + +WORKDIR / -ENV CENTOS_URL https://cloud.centos.org/centos/9-stream/x86_64/images/CentOS-Stream-Vagrant-9-$centos_version.x86_64.vagrant-libvirt.box +COPY vagrant.key /vagrant.key -RUN /download_box.sh ${CENTOS_URL} +RUN chmod 700 vagrant.key -RUN curl -L -o /initrd.img http://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/images/pxeboot/initrd.img -RUN curl -L -o /vmlinuz http://mirror.stream.centos.org/9-stream/BaseOS/x86_64/os/images/pxeboot/vmlinuz +ENV DOCKERIZE_VERSION=v0.6.1 + +RUN if test "$BUILDARCH" != "s390x"; then \ + curl -L -o dockerize-linux-$BUILDARCH.tar.gz https://github.com/jwilder/dockerize/releases/download/$DOCKERIZE_VERSION/dockerize-linux-amd64-$DOCKERIZE_VERSION.tar.gz; \ + else \ + # Temporary till s390x support is upstreamed to dockerize (https://github.com/jwilder/dockerize/pull/200) + curl -L -o dockerize-linux-$BUILDARCH.tar.gz https://github.com/ibm-jitendra/kubevirt_pkgs/raw/main/dockerize-linux-s390x.tar.gz; \ + fi && \ + tar -xzvf dockerize-linux-$BUILDARCH.tar.gz && \ + rm dockerize-linux-$BUILDARCH.tar.gz && \ + chmod u+x dockerize && \ + mv dockerize /usr/local/bin/ + +COPY --from=imageartifactdownload /box.qcow2 box.qcow2 +COPY --from=imageartifactdownload /vmlinuz-* /vmlinuz +COPY --from=imageartifactdownload /initramfs-* /initrd.img COPY scripts/* / diff --git a/cluster-provision/centos9/build.sh b/cluster-provision/centos9/build.sh index c1f404aca5..677cdd2fab 100755 --- 
a/cluster-provision/centos9/build.sh +++ b/cluster-provision/centos9/build.sh @@ -4,4 +4,4 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" centos_version="$(cat $DIR/version | tr -d '\n')" -docker build --build-arg centos_version=$centos_version . -t quay.io/kubevirtci/centos9 +docker build --build-arg BUILDARCH=$(uname -m) --build-arg centos_version=$centos_version . -t quay.io/kubevirtci/centos9 diff --git a/cluster-provision/centos9/scripts/download_box.sh b/cluster-provision/centos9/scripts/download_box.sh index 76b79921ff..5a19b58523 100755 --- a/cluster-provision/centos9/scripts/download_box.sh +++ b/cluster-provision/centos9/scripts/download_box.sh @@ -3,6 +3,14 @@ set -e set -o pipefail -curl -L $1 | tar -zxvf - box.img -qemu-img convert -O qcow2 box.img box.qcow2 -rm box.img + +ARCH=$(uname -m) + +#For the s390x architecture, instead of vagrant box image, generic cloud (qcow2) image is used directly. +if [ "$ARCH" == "s390x" ]; then + curl -L $1 -o box.qcow2 +else + curl -L $1 | tar -zxvf - box.img + qemu-img convert -O qcow2 box.img box.qcow2 + rm box.img +fi diff --git a/cluster-provision/centos9/scripts/kernel.s390x.args b/cluster-provision/centos9/scripts/kernel.s390x.args new file mode 100644 index 0000000000..aa20cd6d0e --- /dev/null +++ b/cluster-provision/centos9/scripts/kernel.s390x.args @@ -0,0 +1 @@ +root=/dev/vda1 ro no_timer_check console=tty0 console=ttyS0,115200n8 net.ifnames=0 biosdevname=0 crashkernel=1G-4G:192M,4G-64G:256M,64G-:512M \ No newline at end of file diff --git a/cluster-provision/centos9/scripts/vm.sh b/cluster-provision/centos9/scripts/vm.sh index c26006af47..3ccb605bf9 100755 --- a/cluster-provision/centos9/scripts/vm.sh +++ b/cluster-provision/centos9/scripts/vm.sh @@ -11,6 +11,9 @@ KERNEL_ARGS="" NEXT_DISK="" BLOCK_DEV="" BLOCK_DEV_SIZE="" +#TODO: Check other places where vagrant as username is used +VM_USER=$( [ "$(uname -m)" = "s390x" ] && echo "cloud-user" || echo "vagrant" ) +VM_USER_SSH_KEY="vagrant.key" 
while true; do case "$1" in @@ -38,6 +41,12 @@ function calc_next_disk { if [ -n "$NEXT_DISK" ]; then next=${NEXT_DISK}; fi if [ "$last" = "00" ]; then last="box.qcow2" + # Customize qcow2 image using virt-sysprep (with KVM accelerator) + if [ "$(uname -m)" = "s390x" ]; then + export LIBGUESTFS_BACKEND=direct + export LIBGUESTFS_BACKEND_SETTINGS=force_kvm + virt-sysprep -a box.qcow2 --run-command 'useradd -m cloud-user' --append '/etc/cloud/cloud.cfg:runcmd:' --append '/etc/cloud/cloud.cfg: - hostnamectl set-hostname ""' --root-password password:Zxc@123 --ssh-inject cloud-user:string:"ssh-rsa AAAAB3NzaC1yc2EAAAABIwAAAQEA6NF8iallvQVp22WDkTkyrtvp9eWW6A8YVr+kz4TjGYe7gHzIw+niNltGEFHzD8+v1I2YJ6oXevct1YeS0o9HZyN1Q9qgCgzUFtdOKLv6IedplqoPkcmF0aYet2PkEDo3MlTBckFXPITAMzF8dJSIFo9D8HfdOV0IAdx4O7PtixWKn5y2hMNG0zQPyUecp4pzC6kivAIhyfHilFR61RGL+GPXQ2MWZWFYbAGjyiYJnAmCP3NOTd0jMZEnDkbUvxhMmBYSdETk1rRgm+R4LOzFUGaHqHDLKLX+FIPKcF96hrucXzcWyLbIbEgE98OHlnVYCzRdK8jlqm8tehUc9c9WhQ== vagrant insecure public key" + fi else last=$(printf "/disk%02d.qcow2" $last) fi @@ -50,7 +59,7 @@ cat >/usr/local/bin/ssh.sh </dev/null -ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no vagrant@192.168.66.1${n} -i vagrant.key -p 22 -q \$@ +ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ${VM_USER}@192.168.66.1${n} -i ${VM_USER_SSH_KEY} -p 22 -q \$@ EOL chmod u+x /usr/local/bin/ssh.sh echo "done" >/ssh_ready @@ -184,15 +193,100 @@ if [ "${NUMA}" -gt 1 ]; then done fi -exec qemu-system-x86_64 -enable-kvm -drive format=qcow2,file=${next},if=virtio,cache=unsafe ${block_dev_arg} \ - -device virtio-net-pci,netdev=network0,mac=52:55:00:d1:55:${n} \ - -netdev tap,id=network0,ifname=tap${n},script=no,downscript=no \ - -device virtio-rng-pci \ - -initrd /initrd.img \ - -kernel /vmlinuz \ - -append "$(cat /kernel.args) $(cat /additional.kernel.args) ${KERNEL_ARGS}" \ - -vnc :${n} -cpu host,migratable=no,+invtsc -m ${MEMORY} -smp ${CPU} ${numa_arg} \ - -serial pty -M 
q35,accel=kvm,kernel_irqchip=split \ - -device intel-iommu,intremap=on,caching-mode=on -device intel-hda -device hda-duplex -device AC97 \ - -uuid $(cat /proc/sys/kernel/random/uuid) \ - ${QEMU_ARGS} +if [ "$(uname -m)" != "s390x" ]; then + #Docs: https://www.qemu.org/docs/master/system/invocation.html + qemu_system_cmd="qemu-system-x86_64 \ + -enable-kvm \ + -drive format=qcow2,file=${next},if=virtio,cache=unsafe ${block_dev_arg} \ + -device virtio-net-pci,netdev=network0,mac=52:55:00:d1:55:${n} \ + -netdev tap,id=network0,ifname=tap${n},script=no,downscript=no \ + -device virtio-rng-pci \ + -initrd /initrd.img \ + -kernel /vmlinuz \ + -append \"$(cat /kernel.args) $(cat /additional.kernel.args) ${KERNEL_ARGS}\" \ + -vnc :${n} \ + -cpu host,migratable=no,+invtsc \ + -m ${MEMORY} \ + -smp ${CPU} ${numa_arg} \ + -serial pty \ + -machine q35,accel=kvm,kernel_irqchip=split \ + -device intel-iommu,intremap=on,caching-mode=on \ + -device intel-hda \ + -device hda-duplex \ + -device AC97 \ + -uuid $(cat /proc/sys/kernel/random/uuid) \ + ${QEMU_ARGS}" +else + # As per https://www.qemu.org/docs/master/system/s390x/bootdevices.html#booting-without-bootindex-parameter -drive if=virtio can't be specified with bootindex for s390x + qemu_system_cmd="qemu-system-s390x \ + -enable-kvm \ + -drive format=qcow2,file=${next},if=none,cache=unsafe,id=drive1 ${block_dev_arg} \ + -device virtio-blk,drive=drive1,bootindex=1 \ + -device virtio-net-ccw,netdev=network0,mac=52:55:00:d1:55:${n} \ + -netdev tap,id=network0,ifname=tap${n},script=no,downscript=no \ + -device virtio-rng \ + -initrd /initrd.img \ + -kernel /vmlinuz \ + -append \"$(cat /kernel.s390x.args) $(cat /additional.kernel.args) ${KERNEL_ARGS}\" \ + -vnc :${n} \ + -cpu host \ + -m ${MEMORY} \ + -smp ${CPU} ${numa_arg} \ + -serial pty \ + -machine s390-ccw-virtio,accel=kvm \ + -uuid $(cat /proc/sys/kernel/random/uuid) \ + ${QEMU_ARGS}" +fi + +# Remove secondary network devices from qemu_system_cmd and move them to 
qemu_monitor_cmds, so +# that those devices are later added after VM is started using qemu monitor to avoid +# primary network interface to be named other than eth0. This is mainly required for s390x, as +# otherwise if primary interface is other than eth0, it is not getting the IP from dhcp server. +qemu_monitor_cmds=() +IFS=' ' read -r -a qemu_parts <<< "$qemu_system_cmd" +for ((i = 0; i < ${#qemu_parts[@]}; i++)); do + part="${qemu_parts[$i]}" + nxtpart="${qemu_parts[$i + 1]}" + # Check for secondary network devices and move them to qemu_monitor_cmds + if { [ "$part" == "-netdev" ] && [[ "$nxtpart" == *"secondarynet"* ]]; } || \ + { [ "$part" == "-device" ] && [[ "$nxtpart" == *"virtio-net-ccw"* ]] && [[ "$nxtpart" == *"secondarynet"* ]]; }; then + qemu_system_cmd=$(echo "$qemu_system_cmd" | sed "s/ -$part $nxtpart//") + qemu_monitor_cmds+=("${part}_add $nxtpart") + fi +done + +qemu_system_cmd+=" -monitor unix:/tmp/qemu-monitor.sock,server,nowait" +echo "qemu_system_cmd is ${qemu_system_cmd}" +echo "qemu_monitor_cmds is ${qemu_monitor_cmds}" + +PID=0 +eval "nohup $qemu_system_cmd &" +PID=$! + +# Function to check if QEMU monitor socket is ready +is_qemu_monitor_ready() { + socat - UNIX-CONNECT:/tmp/qemu-monitor.sock < /dev/null > /dev/null 2>&1 +} + +# Wait for the QEMU monitor socket to be ready +elapsed=0 +while ! is_qemu_monitor_ready; do + if [ $elapsed -ge 60 ]; then + echo "QEMU monitor socket did not become available within 60 seconds." + exit 1 + fi + sleep 1 + elapsed=$((elapsed + 1)) +done +echo "QEMU monitor socket is ready." 
+ +# Send commands to QEMU monitor +if [ "${#qemu_monitor_cmds[@]}" -gt 0 ]; then + # Sort commands in reverse alphabetical order so that -netdev are passed first then -dev + IFS=$'\t' qemu_monitor_cmds_sorted=($(printf "%s\n" "${qemu_monitor_cmds[@]}" | sort -r)) + for qemu_monitor_cmd in "${qemu_monitor_cmds_sorted[@]}"; do + echo "$qemu_monitor_cmd" | socat - UNIX-CONNECT:/tmp/qemu-monitor.sock + done +fi + +wait $PID \ No newline at end of file diff --git a/cluster-provision/gocli/Makefile b/cluster-provision/gocli/Makefile index 1a60ccb8cd..e87e53001d 100644 --- a/cluster-provision/gocli/Makefile +++ b/cluster-provision/gocli/Makefile @@ -2,6 +2,7 @@ SHELL := /bin/bash IMAGES_FILE ?= images.json KUBEVIRTCI_IMAGE_REPO ?= quay.io/kubevirtci +GOARCH ?= $$(uname -m | grep -q s390x && echo s390x || echo amd64) export GO111MODULE=on export GOPROXY=direct @@ -19,7 +20,7 @@ test: .PHONY: gocli cli: - CGO_ENABLED=0 GOOS=linux GOARCH=amd64 $(GO) build -ldflags "-X 'kubevirt.io/kubevirtci/cluster-provision/gocli/images.SUFFIX=:$(KUBEVIRTCI_TAG)'" -o $(BIN_DIR)/cli ./cmd/cli + CGO_ENABLED=0 GOOS=linux GOARCH=${GOARCH} $(GO) build -ldflags "-X 'kubevirt.io/kubevirtci/cluster-provision/gocli/images.SUFFIX=:$(KUBEVIRTCI_TAG)'" -o $(BIN_DIR)/cli ./cmd/cli .PHONY: fmt fmt: $(GO) fmt ./cmd/... 
diff --git a/cluster-provision/gocli/cmd/provision.go b/cluster-provision/gocli/cmd/provision.go index c656c0f4d5..16540ce434 100644 --- a/cluster-provision/gocli/cmd/provision.go +++ b/cluster-provision/gocli/cmd/provision.go @@ -6,6 +6,7 @@ import ( "os" "os/signal" "path/filepath" + "runtime" "strconv" "strings" @@ -51,6 +52,7 @@ func NewProvisionCommand() *cobra.Command { func provisionCluster(cmd *cobra.Command, args []string) (retErr error) { var base string + sshUser := utils.GetSSHUserByArchitecture(runtime.GOARCH) packagePath := args[0] versionBytes, err := os.ReadFile(filepath.Join(packagePath, "version")) if err != nil { @@ -228,13 +230,14 @@ func provisionCluster(cmd *cobra.Command, args []string) (retErr error) { } // Wait for ssh.sh script to exist + logrus.Info("Wait for ssh.sh script to exist") err = _cmd(cli, nodeContainer(prefix, nodeName), "while [ ! -f /ssh_ready ] ; do sleep 1; done", "checking for ssh.sh script") if err != nil { + logrus.Info("Error: Wait for ssh.sh script to exist") return err } - // Wait for the VM to be up - err = _cmd(cli, nodeContainer(prefix, nodeName), "ssh.sh echo VM is up", "waiting for node to come up") + err = waitForVMToBeUp(cli, prefix, nodeName) if err != nil { return err } @@ -252,21 +255,21 @@ func provisionCluster(cmd *cobra.Command, args []string) (retErr error) { if err != nil { return err } - err = _cmd(cli, nodeContainer(prefix, nodeName), "if [ -f /scripts/extra-pre-pull-images ]; then scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i vagrant.key -P 22 /scripts/extra-pre-pull-images vagrant@192.168.66.101:/tmp/extra-pre-pull-images; fi", "copying /scripts/extra-pre-pull-images if existing") + err = _cmd(cli, nodeContainer(prefix, nodeName), fmt.Sprintf("if [ -f /scripts/extra-pre-pull-images ]; then scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i vagrant.key -P 22 /scripts/extra-pre-pull-images %s@192.168.66.101:/tmp/extra-pre-pull-images; fi", sshUser), "copying 
/scripts/extra-pre-pull-images if existing") if err != nil { return err } - err = _cmd(cli, nodeContainer(prefix, nodeName), "if [ -f /scripts/fetch-images.sh ]; then scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i vagrant.key -P 22 /scripts/fetch-images.sh vagrant@192.168.66.101:/tmp/fetch-images.sh; fi", "copying /scripts/fetch-images.sh if existing") + err = _cmd(cli, nodeContainer(prefix, nodeName), fmt.Sprintf("if [ -f /scripts/fetch-images.sh ]; then scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i vagrant.key -P 22 /scripts/fetch-images.sh %s@192.168.66.101:/tmp/fetch-images.sh; fi", sshUser), "copying /scripts/fetch-images.sh if existing") if err != nil { return err } - err = _cmd(cli, nodeContainer(prefix, nodeName), "ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i vagrant.key vagrant@192.168.66.101 'mkdir -p /tmp/ceph /tmp/cnao /tmp/nfs-csi /tmp/nodeports /tmp/prometheus /tmp/whereabouts /tmp/kwok'", "Create required manifest directories before copy") + err = _cmd(cli, nodeContainer(prefix, nodeName), fmt.Sprintf("ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i vagrant.key %s@192.168.66.101 'mkdir -p /tmp/ceph /tmp/cnao /tmp/nfs-csi /tmp/nodeports /tmp/prometheus /tmp/whereabouts /tmp/kwok'", sshUser), "Create required manifest directories before copy") if err != nil { return err } // Copy manifests to the VM - err = _cmd(cli, nodeContainer(prefix, nodeName), "scp -r -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i vagrant.key -P 22 /scripts/manifests/* vagrant@192.168.66.101:/tmp", "copying manifests to the VM") + err = _cmd(cli, nodeContainer(prefix, nodeName), fmt.Sprintf("scp -r -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i vagrant.key -P 22 /scripts/manifests/* %s@192.168.66.101:/tmp", sshUser), "copying manifests to the VM") if err != nil { return err } diff --git a/cluster-provision/gocli/cmd/run.go b/cluster-provision/gocli/cmd/run.go index 
be59736fff..bd44514951 100644 --- a/cluster-provision/gocli/cmd/run.go +++ b/cluster-provision/gocli/cmd/run.go @@ -9,6 +9,7 @@ import ( "os/signal" "path" "path/filepath" + "runtime" "strconv" "strings" "sync" @@ -125,6 +126,9 @@ func NewRunCommand() *cobra.Command { } func run(cmd *cobra.Command, args []string) (retErr error) { + + sshUser := utils.GetSSHUserByArchitecture(runtime.GOARCH) + prefix, err := cmd.Flags().GetString("prefix") if err != nil { return err @@ -433,6 +437,15 @@ func run(cmd *cobra.Command, args []string) (retErr error) { // the VM console from the container without ssh qemuArgs += " -serial pty" + arch := runtime.GOARCH + var qemuDevice string + + // Use virtio-net-ccw device incase of s390x Architecture. + if arch == "s390x" { + qemuDevice = "virtio-net-ccw" + } else { + qemuDevice = "virtio-net-pci" + } wg := sync.WaitGroup{} wg.Add(int(nodes)) // start one vm after each other @@ -445,7 +458,7 @@ func run(cmd *cobra.Command, args []string) (retErr error) { netSuffix := fmt.Sprintf("%d-%d", x, i) macSuffix := fmt.Sprintf("%02x", macCounter) macCounter++ - nodeQemuArgs = fmt.Sprintf("%s -device virtio-net-pci,netdev=secondarynet%s,mac=52:55:00:d1:56:%s -netdev tap,id=secondarynet%s,ifname=stap%s,script=no,downscript=no", nodeQemuArgs, netSuffix, macSuffix, netSuffix, netSuffix) + nodeQemuArgs = fmt.Sprintf("%s -device %s,netdev=secondarynet%s,mac=52:55:00:d1:56:%s -netdev tap,id=secondarynet%s,ifname=stap%s,script=no,downscript=no", nodeQemuArgs, qemuDevice, netSuffix, macSuffix, netSuffix, netSuffix) } nodeName := nodeNameFromIndex(x + 1) @@ -600,19 +613,19 @@ func run(cmd *cobra.Command, args []string) (retErr error) { return fmt.Errorf("checking for ssh.sh script for node %s failed", nodeName) } - err = waitForVMToBeUp(prefix, nodeName) + err = waitForVMToBeUp(cli, prefix, nodeName) if err != nil { return err } // the scripts required for running the provider exist in the node container, in order to execute them directly on the node they 
must be copied to the node first - success, err = docker.Exec(cli, nodeContainer(prefix, nodeName), []string{"/bin/bash", "-c", "scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i vagrant.key " + fmt.Sprintf("-r /scripts vagrant@192.168.66.10%d:/home/vagrant/scripts", x+1)}, io.Discard) + success, err = docker.Exec(cli, nodeContainer(prefix, nodeName), []string{"/bin/bash", "-c", "scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i vagrant.key " + fmt.Sprintf("-r /scripts %s@192.168.66.10%d:/home/%s/scripts", sshUser, x+1, sshUser)}, io.Discard) if err != nil { return err } // move the scripts to the same location they were in in the nodecontainer to enable command abstraction - for _, cmd := range []string{"sudo mkdir /scripts", "sudo cp -r /home/vagrant/scripts/* /scripts"} { + for _, cmd := range []string{"sudo mkdir /scripts", fmt.Sprintf("sudo cp -r /home/%s/scripts/* /scripts", sshUser)} { if err = sshClient.Command(cmd); err != nil { return err } @@ -640,7 +653,7 @@ func run(cmd *cobra.Command, args []string) (retErr error) { // clean up scripts directory for i := 0; i < int(nodes); i++ { sshClient, _ := libssh.NewSSHClient(sshPort, i+1, false) - if err = sshClient.Command("rm -rf /home/vagrant/scripts"); err != nil { + if err = sshClient.Command(fmt.Sprintf("rm -rf /home/%s/scripts", sshUser)); err != nil { return fmt.Errorf("Cleaning up scripts dir failed: %s", err) } } @@ -691,6 +704,7 @@ func provisionK8sOptions(sshClient libssh.Client, n *nodesconfig.NodeK8sConfig) } func provisionNode(sshClient libssh.Client, n *nodesconfig.NodeLinuxConfig) error { + sshUser := utils.GetSSHUserByArchitecture(runtime.GOARCH) nodeName := nodeNameFromIndex(n.NodeIdx) if n.FipsEnabled { for _, cmd := range []string{"sudo fips-mode-setup --enable", "sudo reboot"} { @@ -698,7 +712,7 @@ func provisionNode(sshClient libssh.Client, n *nodesconfig.NodeLinuxConfig) erro return fmt.Errorf("Starting fips mode failed: %s", err) } } - err := 
waitForVMToBeUp(n.K8sVersion, nodeName) + err := waitForVMToBeUp(cli, n.K8sVersion, nodeName) if err != nil { return err } @@ -731,21 +745,24 @@ func provisionNode(sshClient libssh.Client, n *nodesconfig.NodeLinuxConfig) erro } } - for _, s := range soundcardPCIIDs { - // move the VM sound cards to a vfio-pci driver to prepare for assignment - if err := sshClient.Command(fmt.Sprintf("-s -- --vendor %s < /scripts/bind_device_to_vfio.sh", s)); err != nil { - return fmt.Errorf("Provisioning soundcard failed: %s", err) + // sound cards are not supported on s390x. + if runtime.GOARCH != "s390x" { + for _, s := range soundcardPCIIDs { + // move the VM sound cards to a vfio-pci driver to prepare for assignment + if err := sshClient.Command(fmt.Sprintf("-s -- --vendor %s < /scripts/bind_device_to_vfio.sh", s)); err != nil { + return fmt.Errorf("Provisioning soundcard failed: %s", err) + } } } if n.SingleStack { - if err := sshClient.Command("touch /home/vagrant/single_stack"); err != nil { + if err := sshClient.Command(fmt.Sprint("touch /home/%s/single_stack", sshUser)); err != nil { return fmt.Errorf("provisioning node %d failed (setting singleStack phase): %s", n.NodeIdx, err) } } if n.EnableAudit { - if err := sshClient.Command("touch /home/vagrant/enable_audit"); err != nil { + if err := sshClient.Command(fmt.Sprint("touch /home/%s/enable_audit", sshUser)); err != nil { return fmt.Errorf("provisioning node %d failed (setting enableAudit phase): %s", n.NodeIdx, err) } } @@ -776,7 +793,7 @@ func provisionNode(sshClient libssh.Client, n *nodesconfig.NodeLinuxConfig) erro return nil } -func waitForVMToBeUp(prefix string, nodeName string) error { +func waitForVMToBeUp(cli *client.Client, prefix string, nodeName string) error { var err error // Wait for the VM to be up for x := 0; x < 10; x++ { diff --git a/cluster-provision/gocli/cmd/scp.go b/cluster-provision/gocli/cmd/scp.go index 01fae70a4c..04184646f8 100644 --- a/cluster-provision/gocli/cmd/scp.go +++ 
b/cluster-provision/gocli/cmd/scp.go @@ -3,6 +3,7 @@ package cmd import ( "context" "os" + "runtime" "github.com/docker/docker/client" "github.com/spf13/cobra" @@ -55,9 +56,11 @@ func NewSCPCommand() *cobra.Command { RunE: scp, Args: cobra.MinimumNArgs(2), } + + sshUser := utils.GetSSHUserByArchitecture(runtime.GOARCH) ssh.Flags().String("container-name", "dnsmasq", "the container name to SSH copy from") - ssh.Flags().String("ssh-user", "vagrant", "the user that used to connect via SSH to the node") + ssh.Flags().String("ssh-user", sshUser, "the user that used to connect via SSH to the node") return ssh } diff --git a/cluster-provision/gocli/cmd/utils/images.go b/cluster-provision/gocli/cmd/utils/images.go index c779a380e1..86d8f18f2a 100644 --- a/cluster-provision/gocli/cmd/utils/images.go +++ b/cluster-provision/gocli/cmd/utils/images.go @@ -4,5 +4,5 @@ const ( // NFSGaneshaImage contains the reference to NFS docker image NFSGaneshaImage = "docker.io/janeczku/nfs-ganesha@sha256:17fe1813fd20d9fdfa497a26c8a2e39dd49748cd39dbb0559df7627d9bcf4c53" // DockerRegistryImage contains the reference to docker registry docker image - DockerRegistryImage = "quay.io/libpod/registry:2.7" + DockerRegistryImage = "quay.io/libpod/registry:2.8.2" ) diff --git a/cluster-provision/gocli/cmd/utils/utils.go b/cluster-provision/gocli/cmd/utils/utils.go index d5d8340f6f..b4aa213a10 100644 --- a/cluster-provision/gocli/cmd/utils/utils.go +++ b/cluster-provision/gocli/cmd/utils/utils.go @@ -34,3 +34,12 @@ func appendIfExplicit(ports nat.PortMap, exposedPort int, flagSet *pflag.FlagSet } return nil } + +// GetSSHUserByArchitecture returns the SSH user +// based on the system architecture. 
+func GetSSHUserByArchitecture(arch string) string { + if arch == "s390x" { + return "cloud-user" + } + return "vagrant" +} \ No newline at end of file diff --git a/cluster-provision/gocli/pkg/libssh/ssh.go b/cluster-provision/gocli/pkg/libssh/ssh.go index f411c488ae..3a526ca27a 100644 --- a/cluster-provision/gocli/pkg/libssh/ssh.go +++ b/cluster-provision/gocli/pkg/libssh/ssh.go @@ -5,8 +5,10 @@ import ( "fmt" "net" "os" + "runtime" "golang.org/x/crypto/ssh" + "kubevirt.io/kubevirtci/cluster-provision/gocli/cmd/utils" ) //go:embed key.pem @@ -30,7 +32,9 @@ func NewSSHClient(port uint16, idx int, root bool) (*SSHClientImpl, error) { if err != nil { return nil, err } - u := "vagrant" + + u := utils.GetSSHUserByArchitecture(runtime.GOARCH) + if root { u = "root" } diff --git a/cluster-provision/k8s/1.28/extra-pre-pull-images b/cluster-provision/k8s/1.28/extra-pre-pull-images index dc6aca5ef2..e84b21fd6e 100644 --- a/cluster-provision/k8s/1.28/extra-pre-pull-images +++ b/cluster-provision/k8s/1.28/extra-pre-pull-images @@ -4,7 +4,6 @@ quay.io/kubevirtci/operator:1.15.0 quay.io/kubevirtci/pilot:1.15.0 quay.io/kubevirtci/proxyv2:1.15.0 quay.io/prometheus-operator/prometheus-config-reloader:v0.47.0 -quay.io/calico/cni:v3.18.0 -quay.io/calico/kube-controllers:v3.18.0 -quay.io/calico/node:v3.18.0 -quay.io/calico/pod2daemon-flexvol:v3.18.0 +quay.io/calico/cni:v3.27.2 +quay.io/calico/kube-controllers:v3.27.2 +quay.io/calico/node:v3.27.2 diff --git a/cluster-provision/k8s/1.28/k8s_provision.sh b/cluster-provision/k8s/1.28/k8s_provision.sh index bc927e2322..086b9db149 100755 --- a/cluster-provision/k8s/1.28/k8s_provision.sh +++ b/cluster-provision/k8s/1.28/k8s_provision.sh @@ -2,6 +2,8 @@ set -ex +arch=$(uname -m) + source /var/lib/kubevirtci/shared_vars.sh function getKubernetesClosestStableVersion() { @@ -66,14 +68,23 @@ fi export CRIO_VERSION=1.28 -cat << EOF >/etc/yum.repos.d/devel_kubic_libcontainers_stable_cri-o_${CRIO_VERSION}.repo +if [ "$arch" == "s390x" ]; then + #As 
crio version available in kubevirtci-crio-mirror/isv_kubernetes_addons_cri-o_stable_v1.28 is broken with issue https://github.com/containers/crun/issues/1494, which is fixed, but yet to released. + BASEURL="https://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable:/cri-o:/${CRIO_VERSION}:/${CRIO_VERSION}.4/CentOS_9_Stream/" +else + BASEURL="https://storage.googleapis.com/kubevirtci-crio-mirror/isv_kubernetes_addons_cri-o_stable_v${CRIO_VERSION}" +fi + +REPO_CONTENT=$(cat << EOF [isv_kubernetes_addons_cri-o_stable_v${CRIO_VERSION}] name=CRI-O v${CRIO_VERSION} (Stable) (rpm) type=rpm-md -baseurl=https://storage.googleapis.com/kubevirtci-crio-mirror/isv_kubernetes_addons_cri-o_stable_v${CRIO_VERSION} +baseurl=${BASEURL} gpgcheck=0 enabled=1 EOF +) +echo "$REPO_CONTENT" | tee /etc/yum.repos.d/devel_kubic_libcontainers_stable_cri-o_${CRIO_VERSION}.repo > /dev/null dnf install -y cri-o @@ -206,10 +217,13 @@ sysctl --system systemctl restart NetworkManager +# No need to modify the ethernet connection incase of s390x Architecture. +if [ "$arch" != "s390x" ]; then nmcli connection modify "System eth0" \ ipv6.method auto \ ipv6.addr-gen-mode eui64 nmcli connection up "System eth0" +fi kubeadmn_patches_path="/provision/kubeadm-patches" mkdir -p $kubeadmn_patches_path @@ -301,6 +315,11 @@ kubeadm_raw_ipv6=/tmp/kubeadm_ipv6.conf kubeadm_manifest="/etc/kubernetes/kubeadm.conf" kubeadm_manifest_ipv6="/etc/kubernetes/kubeadm_ipv6.conf" +# envsubst pkg is not available by default in s390x Architecture. 
+if [ "$arch" == "s390x" ]; then + dnf install -y gettext +fi + envsubst < $kubeadm_raw > $kubeadm_manifest envsubst < $kubeadm_raw_ipv6 > $kubeadm_manifest_ipv6 diff --git a/cluster-provision/k8s/1.28/manifests/cni.diff b/cluster-provision/k8s/1.28/manifests/cni.diff index 9074ef4de7..f864b33cf2 100644 --- a/cluster-provision/k8s/1.28/manifests/cni.diff +++ b/cluster-provision/k8s/1.28/manifests/cni.diff @@ -1,6 +1,6 @@ ---- a/cluster-provision/k8s/1.19/manifests/cni.do-not-change.yaml -+++ b/cluster-provision/k8s/1.19/manifests/cni.do-not-change.yaml -@@ -32,7 +32,12 @@ data: +--- a/cluster-provision/k8s/1.28/manifests/cni.do-not-change.yaml ++++ b/cluster-provision/k8s/1.28/manifests/cni.do-not-change.yaml +@@ -69,8 +69,13 @@ data: "nodename": "__KUBERNETES_NODE_NAME__", "mtu": __CNI_MTU__, "ipam": { @@ -8,66 +8,73 @@ + "type": "calico-ipam", + "assign_ipv4": "true", + "assign_ipv6": "true" -+ }, + }, + "container_settings": { + "allow_ip_forwarding": true - }, ++ }, "policy": { "type": "k8s" -@@ -3533,7 +3538,7 @@ spec: + }, +@@ -4777,7 +4782,7 @@ spec: # It can be deleted if this is a fresh installation, or if you have already # upgraded to use calico-ipam. - name: upgrade-ipam -- image: docker.io/calico/cni:v3.18.0 -+ image: quay.io/calico/cni:v3.18.0 +- image: docker.io/calico/cni:v3.27.2 ++ image: quay.io/calico/cni:v3.27.2 + imagePullPolicy: IfNotPresent command: ["/opt/cni/bin/calico-ipam", "-upgrade"] envFrom: - - configMapRef: -@@ -3560,7 +3565,7 @@ spec: +@@ -4805,7 +4810,7 @@ spec: # This container installs the CNI binaries # and CNI network config file on each node. 
- name: install-cni -- image: docker.io/calico/cni:v3.18.0 -+ image: quay.io/calico/cni:v3.18.0 +- image: docker.io/calico/cni:v3.27.2 ++ image: quay.io/calico/cni:v3.27.2 + imagePullPolicy: IfNotPresent command: ["/opt/cni/bin/install"] envFrom: - - configMapRef: -@@ -3601,7 +3606,7 @@ spec: - # Adds a Flex Volume Driver that creates a per-pod Unix Domain Socket to allow Dikastes - # to communicate with Felix over the Policy Sync API. - - name: flexvol-driver -- image: docker.io/calico/pod2daemon-flexvol:v3.18.0 -+ image: quay.io/calico/pod2daemon-flexvol:v3.18.0 +@@ -4848,7 +4853,7 @@ spec: + # i.e. bpf at /sys/fs/bpf and cgroup2 at /run/calico/cgroup. Calico-node initialisation is executed + # in best effort fashion, i.e. no failure for errors, to not disrupt pod creation in iptable mode. + - name: "mount-bpffs" +- image: docker.io/calico/node:v3.27.2 ++ image: quay.io/calico/node:v3.27.2 + imagePullPolicy: IfNotPresent + command: ["calico-node", "-init", "-best-effort"] volumeMounts: - - name: flexvol-driver-host - mountPath: /host/driver -@@ -3612,7 +3617,7 @@ spec: +@@ -4874,7 +4879,7 @@ spec: # container programs network policy and routes on each # host. - name: calico-node -- image: docker.io/calico/node:v3.18.0 -+ image: quay.io/calico/node:v3.18.0 +- image: docker.io/calico/node:v3.27.2 ++ image: quay.io/calico/node:v3.27.2 + imagePullPolicy: IfNotPresent envFrom: - configMapRef: - # Allow KUBERNETES_SERVICE_HOST and KUBERNETES_SERVICE_PORT to be overridden for eBPF mode. -@@ -3671,6 +3676,8 @@ spec: - # no effect. This should fall within `--cluster-cidr`. +@@ -4902,6 +4907,8 @@ spec: + # Cluster type to identify the deployment type + - name: CLUSTER_TYPE + value: "k8s,bgp" ++ - name: IP_AUTODETECTION_METHOD ++ value: "interface=eth.*" + # Auto-detect the BGP IP address. + - name: IP + value: "autodetect" +@@ -4938,6 +4945,8 @@ spec: # - name: CALICO_IPV4POOL_CIDR # value: "192.168.0.0/16" + # Disable file logging so `kubectl logs` works. 
+ - name: IP6 + value: "autodetect" - # Disable file logging so `kubectl logs` works. - name: CALICO_DISABLE_FILE_LOGGING value: "true" -@@ -3679,12 +3686,14 @@ spec: + # Set Felix endpoint to host default action to ACCEPT. +@@ -4945,9 +4954,11 @@ spec: value: "ACCEPT" # Disable IPv6 on Kubernetes. - name: FELIX_IPV6SUPPORT - value: "false" + value: "true" - # Set Felix logging to "info" - - name: FELIX_LOGSEVERITYSCREEN - value: "info" - name: FELIX_HEALTHENABLED value: "true" + - name: CALICO_IPV6POOL_NAT_OUTGOING @@ -75,16 +82,7 @@ securityContext: privileged: true resources: -@@ -3818,6 +3818,8 @@ spec: - operator: Exists - - key: node-role.kubernetes.io/master - effect: NoSchedule -+ - key: node-role.kubernetes.io/control-plane -+ effect: NoSchedule - serviceAccountName: calico-kube-controllers - priorityClassName: system-cluster-critical - containers: -@@ -3820,9 +3829,12 @@ spec: +@@ -5092,9 +5100,12 @@ spec: effect: NoSchedule serviceAccountName: calico-kube-controllers priorityClassName: system-cluster-critical @@ -93,17 +91,8 @@ + type: spc_t containers: - name: calico-kube-controllers -- image: docker.io/calico/kube-controllers:v3.18.0 -+ image: quay.io/calico/kube-controllers:v3.18.0 +- image: docker.io/calico/kube-controllers:v3.27.2 ++ image: quay.io/calico/kube-controllers:v3.27.2 + imagePullPolicy: IfNotPresent env: - # Choose which controllers to run. - - name: ENABLED_CONTROLLERS -@@ -3847,7 +3859,7 @@ - - # This manifest creates a Pod Disruption Budget for Controller to allow K8s Cluster Autoscaler to evict - --apiVersion: policy/v1beta1 -+apiVersion: policy/v1 - kind: PodDisruptionBudget - metadata: - name: calico-kube-controllers + # Choose which controllers to run. 
\ No newline at end of file diff --git a/cluster-provision/k8s/1.28/manifests/cni.do-not-change.yaml b/cluster-provision/k8s/1.28/manifests/cni.do-not-change.yaml index bcc4ee0c02..107d8f232d 100644 --- a/cluster-provision/k8s/1.28/manifests/cni.do-not-change.yaml +++ b/cluster-provision/k8s/1.28/manifests/cni.do-not-change.yaml @@ -1,4 +1,41 @@ --- +# Source: calico/templates/calico-kube-controllers.yaml +# This manifest creates a Pod Disruption Budget for Controller to allow K8s Cluster Autoscaler to evict + +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: calico-kube-controllers + namespace: kube-system + labels: + k8s-app: calico-kube-controllers +spec: + maxUnavailable: 1 + selector: + matchLabels: + k8s-app: calico-kube-controllers +--- +# Source: calico/templates/calico-kube-controllers.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: calico-kube-controllers + namespace: kube-system +--- +# Source: calico/templates/calico-node.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: calico-node + namespace: kube-system +--- +# Source: calico/templates/calico-node.yaml +apiVersion: v1 +kind: ServiceAccount +metadata: + name: calico-cni-plugin + namespace: kube-system +--- # Source: calico/templates/calico-config.yaml # This ConfigMap is used to configure a self-hosted Calico installation. kind: ConfigMap @@ -52,10 +89,8 @@ data: } ] } - --- # Source: calico/templates/kdd-crds.yaml - apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -67,6 +102,7 @@ spec: listKind: BGPConfigurationList plural: bgpconfigurations singular: bgpconfiguration + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -94,6 +130,12 @@ spec: 64512]' format: int32 type: integer + bindMode: + description: BindMode indicates whether to listen for BGP connections + on all addresses (None) or only on the node's canonical IP address + Node.Spec.BGP.IPvXAddress (NodeIP). 
Default behaviour is to listen + for BGP connections on all addresses. + type: string communities: description: Communities is a list of BGP community values and their arbitrary names for tagging routes. @@ -114,6 +156,12 @@ spec: type: string type: object type: array + ignoredInterfaces: + description: IgnoredInterfaces indicates the network interfaces that + needs to be excluded when reading device routes. + items: + type: string + type: array listenPort: description: ListenPort is the port where BGP protocol should listen. Defaults to 179 @@ -124,6 +172,37 @@ spec: description: 'LogSeverityScreen is the log severity above which logs are sent to the stdout. [Default: INFO]' type: string + nodeMeshMaxRestartTime: + description: Time to allow for software restart for node-to-mesh peerings. When + specified, this is configured as the graceful restart timeout. When + not specified, the BIRD default of 120s is used. This field can + only be set on the default BGPConfiguration instance and requires + that NodeMesh is enabled + type: string + nodeMeshPassword: + description: Optional BGP password for full node-to-mesh peerings. + This field can only be set on the default BGPConfiguration instance + and requires that NodeMesh is enabled + properties: + secretKeyRef: + description: Selects a key of a secret in the node pod's namespace. + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, uid?' + type: string + optional: + description: Specify whether the Secret or its key must be + defined + type: boolean + required: + - key + type: object + type: object nodeToNodeMeshEnabled: description: 'NodeToNodeMeshEnabled sets whether full node to node BGP mesh is enabled. 
[Default: true]' @@ -197,8 +276,140 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: (devel) + creationTimestamp: null + name: bgpfilters.crd.projectcalico.org +spec: + group: crd.projectcalico.org + names: + kind: BGPFilter + listKind: BGPFilterList + plural: bgpfilters + singular: bgpfilter + scope: Cluster + versions: + - name: v1 + schema: + openAPIV3Schema: + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: BGPFilterSpec contains the IPv4 and IPv6 filter rules of + the BGP Filter. + properties: + exportV4: + description: The ordered set of IPv4 BGPFilter rules acting on exporting + routes to a peer. + items: + description: BGPFilterRuleV4 defines a BGP filter rule consisting + a single IPv4 CIDR block and a filter action for this CIDR. + properties: + action: + type: string + cidr: + type: string + interface: + type: string + matchOperator: + type: string + source: + type: string + required: + - action + type: object + type: array + exportV6: + description: The ordered set of IPv6 BGPFilter rules acting on exporting + routes to a peer. 
+ items: + description: BGPFilterRuleV6 defines a BGP filter rule consisting + a single IPv6 CIDR block and a filter action for this CIDR. + properties: + action: + type: string + cidr: + type: string + interface: + type: string + matchOperator: + type: string + source: + type: string + required: + - action + type: object + type: array + importV4: + description: The ordered set of IPv4 BGPFilter rules acting on importing + routes from a peer. + items: + description: BGPFilterRuleV4 defines a BGP filter rule consisting + a single IPv4 CIDR block and a filter action for this CIDR. + properties: + action: + type: string + cidr: + type: string + interface: + type: string + matchOperator: + type: string + source: + type: string + required: + - action + type: object + type: array + importV6: + description: The ordered set of IPv6 BGPFilter rules acting on importing + routes from a peer. + items: + description: BGPFilterRuleV6 defines a BGP filter rule consisting + a single IPv6 CIDR block and a filter action for this CIDR. + properties: + action: + type: string + cidr: + type: string + interface: + type: string + matchOperator: + type: string + source: + type: string + required: + - action + type: object + type: array + type: object + type: object + served: true + storage: true +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] +--- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -210,6 +421,7 @@ spec: listKind: BGPPeerList plural: bgppeers singular: bgppeer + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -235,12 +447,22 @@ spec: description: The AS Number of the peer. format: int32 type: integer + filters: + description: The ordered set of BGPFilters applied on this BGP peer. + items: + type: string + type: array keepOriginalNextHop: description: Option to keep the original nexthop field when routes are sent to a BGP Peer. 
Setting "true" configures the selected BGP Peers node to use the "next hop keep;" instead of "next hop self;"(default) in the specific branch of the Node on "bird.cfg". type: boolean + maxRestartTime: + description: Time to allow for software restart. When specified, + this is configured as the graceful restart timeout. When not specified, + the BIRD default of 120s is used. + type: string node: description: The node name identifying the Calico node instance that is targeted by this peer. If this is not set, and no nodeSelector @@ -250,6 +472,12 @@ spec: description: Selector for the nodes that should have this peering. When this is set, the Node field must be empty. type: string + numAllowedLocalASNumbers: + description: Maximum number of local AS numbers that are allowed in + the AS path for received routes. This removes BGP loop prevention + and should only be used if absolutely necesssary. + format: int32 + type: integer password: description: Optional BGP password for the peerings generated by this BGPPeer resource. @@ -289,12 +517,23 @@ spec: remote AS number comes from the remote node's NodeBGPSpec.ASNumber, or the global default if that is not set. type: string + reachableBy: + description: Add an exact, i.e. /32, static route toward peer IP in + order to prevent route flapping. ReachableBy contains the address + of the gateway which peer can be reached by. + type: string sourceAddress: description: Specifies whether and how to configure a source address for the peerings generated by this BGPPeer resource. Default value "UseNodeIP" means to configure the node IP as the source address. "None" means not to configure a source address. type: string + ttlSecurity: + description: TTLSecurity enables the generalized TTL security mechanism + (GTSM) which protects against spoofed packets by ignoring received + packets with a smaller than expected TTL value. The provided value + is the number of hops (edges) between the peers. 
+ type: integer type: object type: object served: true @@ -305,8 +544,8 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -318,6 +557,7 @@ spec: listKind: BlockAffinityList plural: blockaffinities singular: blockaffinity + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -366,8 +606,272 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: (devel) + creationTimestamp: null + name: caliconodestatuses.crd.projectcalico.org +spec: + group: crd.projectcalico.org + names: + kind: CalicoNodeStatus + listKind: CalicoNodeStatusList + plural: caliconodestatuses + singular: caliconodestatus + preserveUnknownFields: false + scope: Cluster + versions: + - name: v1 + schema: + openAPIV3Schema: + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: CalicoNodeStatusSpec contains the specification for a CalicoNodeStatus + resource. 
+ properties: + classes: + description: Classes declares the types of information to monitor + for this calico/node, and allows for selective status reporting + about certain subsets of information. + items: + type: string + type: array + node: + description: The node name identifies the Calico node instance for + node status. + type: string + updatePeriodSeconds: + description: UpdatePeriodSeconds is the period at which CalicoNodeStatus + should be updated. Set to 0 to disable CalicoNodeStatus refresh. + Maximum update period is one day. + format: int32 + type: integer + type: object + status: + description: CalicoNodeStatusStatus defines the observed state of CalicoNodeStatus. + No validation needed for status since it is updated by Calico. + properties: + agent: + description: Agent holds agent status on the node. + properties: + birdV4: + description: BIRDV4 represents the latest observed status of bird4. + properties: + lastBootTime: + description: LastBootTime holds the value of lastBootTime + from bird.ctl output. + type: string + lastReconfigurationTime: + description: LastReconfigurationTime holds the value of lastReconfigTime + from bird.ctl output. + type: string + routerID: + description: Router ID used by bird. + type: string + state: + description: The state of the BGP Daemon. + type: string + version: + description: Version of the BGP daemon + type: string + type: object + birdV6: + description: BIRDV6 represents the latest observed status of bird6. + properties: + lastBootTime: + description: LastBootTime holds the value of lastBootTime + from bird.ctl output. + type: string + lastReconfigurationTime: + description: LastReconfigurationTime holds the value of lastReconfigTime + from bird.ctl output. + type: string + routerID: + description: Router ID used by bird. + type: string + state: + description: The state of the BGP Daemon. 
+ type: string + version: + description: Version of the BGP daemon + type: string + type: object + type: object + bgp: + description: BGP holds node BGP status. + properties: + numberEstablishedV4: + description: The total number of IPv4 established bgp sessions. + type: integer + numberEstablishedV6: + description: The total number of IPv6 established bgp sessions. + type: integer + numberNotEstablishedV4: + description: The total number of IPv4 non-established bgp sessions. + type: integer + numberNotEstablishedV6: + description: The total number of IPv6 non-established bgp sessions. + type: integer + peersV4: + description: PeersV4 represents IPv4 BGP peers status on the node. + items: + description: CalicoNodePeer contains the status of BGP peers + on the node. + properties: + peerIP: + description: IP address of the peer whose condition we are + reporting. + type: string + since: + description: Since the state or reason last changed. + type: string + state: + description: State is the BGP session state. + type: string + type: + description: Type indicates whether this peer is configured + via the node-to-node mesh, or via en explicit global or + per-node BGPPeer object. + type: string + type: object + type: array + peersV6: + description: PeersV6 represents IPv6 BGP peers status on the node. + items: + description: CalicoNodePeer contains the status of BGP peers + on the node. + properties: + peerIP: + description: IP address of the peer whose condition we are + reporting. + type: string + since: + description: Since the state or reason last changed. + type: string + state: + description: State is the BGP session state. + type: string + type: + description: Type indicates whether this peer is configured + via the node-to-node mesh, or via en explicit global or + per-node BGPPeer object. 
+ type: string + type: object + type: array + required: + - numberEstablishedV4 + - numberEstablishedV6 + - numberNotEstablishedV4 + - numberNotEstablishedV6 + type: object + lastUpdated: + description: LastUpdated is a timestamp representing the server time + when CalicoNodeStatus object last updated. It is represented in + RFC3339 form and is in UTC. + format: date-time + nullable: true + type: string + routes: + description: Routes reports routes known to the Calico BGP daemon + on the node. + properties: + routesV4: + description: RoutesV4 represents IPv4 routes on the node. + items: + description: CalicoNodeRoute contains the status of BGP routes + on the node. + properties: + destination: + description: Destination of the route. + type: string + gateway: + description: Gateway for the destination. + type: string + interface: + description: Interface for the destination + type: string + learnedFrom: + description: LearnedFrom contains information regarding + where this route originated. + properties: + peerIP: + description: If sourceType is NodeMesh or BGPPeer, IP + address of the router that sent us this route. + type: string + sourceType: + description: Type of the source where a route is learned + from. + type: string + type: object + type: + description: Type indicates if the route is being used for + forwarding or not. + type: string + type: object + type: array + routesV6: + description: RoutesV6 represents IPv6 routes on the node. + items: + description: CalicoNodeRoute contains the status of BGP routes + on the node. + properties: + destination: + description: Destination of the route. + type: string + gateway: + description: Gateway for the destination. + type: string + interface: + description: Interface for the destination + type: string + learnedFrom: + description: LearnedFrom contains information regarding + where this route originated. 
+ properties: + peerIP: + description: If sourceType is NodeMesh or BGPPeer, IP + address of the router that sent us this route. + type: string + sourceType: + description: Type of the source where a route is learned + from. + type: string + type: object + type: + description: Type indicates if the route is being used for + forwarding or not. + type: string + type: object + type: array + type: object + type: object + type: object + served: true + storage: true +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] +--- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -379,6 +883,7 @@ spec: listKind: ClusterInformationList plural: clusterinformations singular: clusterinformation + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -430,8 +935,8 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -443,6 +948,7 @@ spec: listKind: FelixConfigurationList plural: felixconfigurations singular: felixconfiguration + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -477,21 +983,48 @@ spec: type: boolean awsSrcDstCheck: description: 'Set source-destination-check on AWS EC2 instances. Accepted - value must be one of "DoNothing", "Enabled" or "Disabled". [Default: + value must be one of "DoNothing", "Enable" or "Disable". [Default: DoNothing]' enum: - DoNothing - Enable - Disable type: string + bpfCTLBLogFilter: + description: 'BPFCTLBLogFilter specifies, what is logged by connect + time load balancer when BPFLogLevel is debug. Currently has to be + specified as ''all'' when BPFLogFilters is set to see CTLB logs. 
+ [Default: unset - means logs are emitted when BPFLogLevel id debug + and BPFLogFilters not set.]' + type: string + bpfConnectTimeLoadBalancing: + description: 'BPFConnectTimeLoadBalancing when in BPF mode, controls + whether Felix installs the connect-time load balancer. The connect-time + load balancer is required for the host to be able to reach Kubernetes + services and it improves the performance of pod-to-service connections.When + set to TCP, connect time load balancing is available only for services + with TCP ports. [Default: TCP]' + enum: + - TCP + - Enabled + - Disabled + type: string bpfConnectTimeLoadBalancingEnabled: description: 'BPFConnectTimeLoadBalancingEnabled when in BPF mode, controls whether Felix installs the connection-time load balancer. The connect-time load balancer is required for the host to be able to reach Kubernetes services and it improves the performance of pod-to-service - connections. The only reason to disable it is for debugging purposes. [Default: + connections. The only reason to disable it is for debugging purposes. + This will be deprecated. Use BPFConnectTimeLoadBalancing [Default: true]' type: boolean + bpfDSROptoutCIDRs: + description: BPFDSROptoutCIDRs is a list of CIDRs which are excluded + from DSR. That is, clients in those CIDRs will accesses nodeports + as if BPFExternalServiceMode was set to Tunnel. + items: + type: string + type: array bpfDataIfacePattern: description: BPFDataIfacePattern is a regular expression that controls which interfaces Felix should attach BPF programs to in order to @@ -501,6 +1034,12 @@ spec: the cluster. It should not match the workload interfaces (usually named cali...). type: string + bpfDisableGROForIfaces: + description: BPFDisableGROForIfaces is a regular expression that controls + which interfaces Felix should disable the Generic Receive Offload + [GRO] option. It should not match the workload interfaces (usually + named cali...). 
+ type: string bpfDisableUnprivileged: description: 'BPFDisableUnprivileged, if enabled, Felix sets the kernel.unprivileged_bpf_disabled sysctl to disable unprivileged use of BPF. This ensures that unprivileged @@ -511,6 +1050,27 @@ spec: description: 'BPFEnabled, if enabled Felix will use the BPF dataplane. [Default: false]' type: boolean + bpfEnforceRPF: + description: 'BPFEnforceRPF enforce strict RPF on all host interfaces + with BPF programs regardless of what is the per-interfaces or global + setting. Possible values are Disabled, Strict or Loose. [Default: + Loose]' + pattern: ^(?i)(Disabled|Strict|Loose)?$ + type: string + bpfExcludeCIDRsFromNAT: + description: BPFExcludeCIDRsFromNAT is a list of CIDRs that are to + be excluded from NAT resolution so that host can handle them. A + typical usecase is node local DNS cache. + items: + type: string + type: array + bpfExtToServiceConnmark: + description: 'BPFExtToServiceConnmark in BPF mode, control a 32bit + mark that is set on connections from an external client to a local + service. This mark allows us to control how packets of that connection + are routed within the host and how is routing interpreted by RPF + check. [Default: 0]' + type: integer bpfExternalServiceMode: description: 'BPFExternalServiceMode in BPF mode, controls how connections from outside the cluster to services (node ports and cluster IPs) @@ -520,6 +1080,30 @@ spec: is sent directly from the remote node. In "DSR" mode, the remote node appears to use the IP of the ingress node; this requires a permissive L2 network. [Default: Tunnel]' + pattern: ^(?i)(Tunnel|DSR)?$ + type: string + bpfForceTrackPacketsFromIfaces: + description: 'BPFForceTrackPacketsFromIfaces in BPF mode, forces traffic + from these interfaces to skip Calico''s iptables NOTRACK rule, allowing + traffic from those interfaces to be tracked by Linux conntrack. Should + only be used for interfaces that are not used for the Calico fabric. 
For + example, a docker bridge device for non-Calico-networked containers. + [Default: docker+]' + items: + type: string + type: array + bpfHostConntrackBypass: + description: 'BPFHostConntrackBypass Controls whether to bypass Linux + conntrack in BPF mode for workloads and services. [Default: true + - bypass Linux conntrack]' + type: boolean + bpfHostNetworkedNATWithoutCTLB: + description: 'BPFHostNetworkedNATWithoutCTLB when in BPF mode, controls + whether Felix does a NAT without CTLB. This along with BPFConnectTimeLoadBalancing + determines the CTLB behavior. [Default: Enabled]' + enum: + - Enabled + - Disabled type: string bpfKubeProxyEndpointSlicesEnabled: description: BPFKubeProxyEndpointSlicesEnabled in BPF mode, controls @@ -536,13 +1120,88 @@ spec: minimum time between updates to the dataplane for Felix''s embedded kube-proxy. Lower values give reduced set-up latency. Higher values reduce Felix CPU usage by batching up more work. [Default: 1s]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ + type: string + bpfL3IfacePattern: + description: BPFL3IfacePattern is a regular expression that allows + to list tunnel devices like wireguard or vxlan (i.e., L3 devices) + in addition to BPFDataIfacePattern. That is, tunnel interfaces not + created by Calico, that Calico workload traffic flows over as well + as any interfaces that handle incoming traffic to nodeports and + services from outside the cluster. type: string + bpfLogFilters: + additionalProperties: + type: string + description: "BPFLogFilters is a map of key=values where the value + is a pcap filter expression and the key is an interface name with + 'all' denoting all interfaces, 'weps' all workload endpoints and + 'heps' all host endpoints. \n When specified as an env var, it accepts + a comma-separated list of key=values. [Default: unset - means all + debug logs are emitted]" + type: object bpfLogLevel: description: 'BPFLogLevel controls the log level of the BPF programs when in BPF dataplane mode. 
One of "Off", "Info", or "Debug". The logs are emitted to the BPF trace pipe, accessible with the command `tc exec bpf debug`. [Default: Off].' + pattern: ^(?i)(Off|Info|Debug)?$ type: string + bpfMapSizeConntrack: + description: 'BPFMapSizeConntrack sets the size for the conntrack + map. This map must be large enough to hold an entry for each active + connection. Warning: changing the size of the conntrack map can + cause disruption.' + type: integer + bpfMapSizeIPSets: + description: BPFMapSizeIPSets sets the size for ipsets map. The IP + sets map must be large enough to hold an entry for each endpoint + matched by every selector in the source/destination matches in network + policy. Selectors such as "all()" can result in large numbers of + entries (one entry per endpoint in that case). + type: integer + bpfMapSizeIfState: + description: BPFMapSizeIfState sets the size for ifstate map. The + ifstate map must be large enough to hold an entry for each device + (host + workloads) on a host. + type: integer + bpfMapSizeNATAffinity: + type: integer + bpfMapSizeNATBackend: + description: BPFMapSizeNATBackend sets the size for nat back end map. + This is the total number of endpoints. This is mostly more than + the size of the number of services. + type: integer + bpfMapSizeNATFrontend: + description: BPFMapSizeNATFrontend sets the size for nat front end + map. FrontendMap should be large enough to hold an entry for each + nodeport, external IP and each port in each service. + type: integer + bpfMapSizeRoute: + description: BPFMapSizeRoute sets the size for the routes map. The + routes map should be large enough to hold one entry per workload + and a handful of entries per host (enough to cover its own IPs and + tunnel IPs). + type: integer + bpfPSNATPorts: + anyOf: + - type: integer + - type: string + description: 'BPFPSNATPorts sets the range from which we randomly + pick a port if there is a source port collision. 
This should be + within the ephemeral range as defined by RFC 6056 (1024–65535) and + preferably outside the ephemeral ranges used by common operating + systems. Linux uses 32768–60999, while others mostly use the IANA + defined range 49152–65535. It is not necessarily a problem if this + range overlaps with the operating systems. Both ends of the range + are inclusive. [Default: 20000:29999]' + pattern: ^.* + x-kubernetes-int-or-string: true + bpfPolicyDebugEnabled: + description: BPFPolicyDebugEnabled when true, Felix records detailed + information about the BPF policy programs, which can be examined + with the calico-bpf command-line tool. + type: boolean chainInsertMode: description: 'ChainInsertMode controls whether Felix hooks the kernel''s top-level iptables chains by inserting a rule at the top of the @@ -551,16 +1210,29 @@ spec: to append mode, be sure that the other rules in the chains signal acceptance by falling through to the Calico rules, otherwise the Calico policy will be bypassed. [Default: insert]' + pattern: ^(?i)(insert|append)?$ type: string dataplaneDriver: + description: DataplaneDriver filename of the external dataplane driver + to use. Only used if UseInternalDataplaneDriver is set to false. + type: string + dataplaneWatchdogTimeout: + description: "DataplaneWatchdogTimeout is the readiness/liveness timeout + used for Felix's (internal) dataplane driver. Increase this value + if you experience spurious non-ready or non-live events when Felix + is under heavy load. Decrease the value to get felix to report non-live + or non-ready more quickly. [Default: 90s] \n Deprecated: replaced + by the generic HealthTimeoutOverrides." 
type: string debugDisableLogDropping: type: boolean debugMemoryProfilePath: type: string debugSimulateCalcGraphHangAfter: + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string debugSimulateDataplaneHangAfter: + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string defaultEndpointToHostAction: description: 'DefaultEndpointToHostAction controls what happens to @@ -575,19 +1247,26 @@ spec: endpoint egress policy. Use ACCEPT to unconditionally accept packets from workloads after processing workload endpoint egress policy. [Default: Drop]' + pattern: ^(?i)(Drop|Accept|Return)?$ type: string deviceRouteProtocol: description: This defines the route protocol added to programmed device routes, by default this will be RTPROT_BOOT when left blank. type: integer deviceRouteSourceAddress: - description: This is the source address to use on programmed device - routes. By default the source address is left blank, leaving the - kernel to choose the source address used. + description: This is the IPv4 source address to use on programmed + device routes. By default the source address is left blank, leaving + the kernel to choose the source address used. + type: string + deviceRouteSourceAddressIPv6: + description: This is the IPv6 source address to use on programmed + device routes. By default the source address is left blank, leaving + the kernel to choose the source address used. type: string disableConntrackInvalidCheck: type: boolean endpointReportingDelay: + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string endpointReportingEnabled: type: boolean @@ -599,19 +1278,21 @@ spec: type: string type: array failsafeInboundHostPorts: - description: 'FailsafeInboundHostPorts is a comma-delimited list of - UDP/TCP ports that Felix will allow incoming traffic to host endpoints + description: 'FailsafeInboundHostPorts is a list of UDP/TCP ports + and CIDRs that Felix will allow incoming traffic to host endpoints on irrespective of the security policy. 
This is useful to avoid - accidentally cutting off a host with incorrect configuration. Each - port should be specified as tcp: or udp:. - For back-compatibility, if the protocol is not specified, it defaults - to "tcp". To disable all inbound host ports, use the value none. - The default value allows ssh access and DHCP. [Default: tcp:22, + accidentally cutting off a host with incorrect configuration. For + back-compatibility, if the protocol is not specified, it defaults + to "tcp". If a CIDR is not specified, it will allow traffic from + all addresses. To disable all inbound host ports, use the value + none. The default value allows ssh access and DHCP. [Default: tcp:22, udp:68, tcp:179, tcp:2379, tcp:2380, tcp:6443, tcp:6666, tcp:6667]' items: - description: ProtoPort is combination of protocol and port, both - must be specified. + description: ProtoPort is combination of protocol, port, and CIDR. + Protocol and port must be specified. properties: + net: + type: string port: type: integer protocol: @@ -622,21 +1303,23 @@ spec: type: object type: array failsafeOutboundHostPorts: - description: 'FailsafeOutboundHostPorts is a comma-delimited list - of UDP/TCP ports that Felix will allow outgoing traffic from host - endpoints to irrespective of the security policy. This is useful - to avoid accidentally cutting off a host with incorrect configuration. - Each port should be specified as tcp: or udp:. - For back-compatibility, if the protocol is not specified, it defaults - to "tcp". To disable all outbound host ports, use the value none. - The default value opens etcd''s standard ports to ensure that Felix - does not get cut off from etcd as well as allowing DHCP and DNS. - [Default: tcp:179, tcp:2379, tcp:2380, tcp:6443, tcp:6666, tcp:6667, - udp:53, udp:67]' + description: 'FailsafeOutboundHostPorts is a list of UDP/TCP ports + and CIDRs that Felix will allow outgoing traffic from host endpoints + to irrespective of the security policy. 
This is useful to avoid + accidentally cutting off a host with incorrect configuration. For + back-compatibility, if the protocol is not specified, it defaults + to "tcp". If a CIDR is not specified, it will allow traffic from + all addresses. To disable all outbound host ports, use the value + none. The default value opens etcd''s standard ports to ensure that + Felix does not get cut off from etcd as well as allowing DHCP and + DNS. [Default: tcp:179, tcp:2379, tcp:2380, tcp:6443, tcp:6666, + tcp:6667, udp:53, udp:67]' items: - description: ProtoPort is combination of protocol and port, both - must be specified. + description: ProtoPort is combination of protocol, port, and CIDR. + Protocol and port must be specified. properties: + net: + type: string port: type: integer protocol: @@ -647,11 +1330,26 @@ spec: type: object type: array featureDetectOverride: - description: FeatureDetectOverride is used to override the feature - detection. Values are specified in a comma separated list with no - spaces, example; "SNATFullyRandom=true,MASQFullyRandom=false,RestoreSupportsLock=". - "true" or "false" will force the feature, empty or omitted values - are auto-detected. + description: FeatureDetectOverride is used to override feature detection + based on auto-detected platform capabilities. Values are specified + in a comma separated list with no spaces, example; "SNATFullyRandom=true,MASQFullyRandom=false,RestoreSupportsLock=". "true" + or "false" will force the feature, empty or omitted values are auto-detected. + pattern: ^([a-zA-Z0-9-_]+=(true|false|),)*([a-zA-Z0-9-_]+=(true|false|))?$ + type: string + featureGates: + description: FeatureGates is used to enable or disable tech-preview + Calico features. Values are specified in a comma separated list + with no spaces, example; "BPFConnectTimeLoadBalancingWorkaround=enabled,XyZ=false". + This is used to enable features that are not fully production ready. 
+ pattern: ^([a-zA-Z0-9-_]+=([^=]+),)*([a-zA-Z0-9-_]+=([^=]+))?$ + type: string + floatingIPs: + description: FloatingIPs configures whether or not Felix will program + non-OpenStack floating IP addresses. (OpenStack-derived floating + IPs are always programmed, regardless of this setting.) + enum: + - Enabled + - Disabled type: string genericXDPEnabled: description: 'GenericXDPEnabled enables Generic XDP so network cards @@ -665,6 +1363,23 @@ spec: type: string healthPort: type: integer + healthTimeoutOverrides: + description: HealthTimeoutOverrides allows the internal watchdog timeouts + of individual subcomponents to be overridden. This is useful for + working around "false positive" liveness timeouts that can occur + in particularly stressful workloads or if CPU is constrained. For + a list of active subcomponents, see Felix's logs. + items: + properties: + name: + type: string + timeout: + type: string + required: + - name + - timeout + type: object + type: array interfaceExclude: description: 'InterfaceExclude is a comma-separated list of interfaces that Felix should exclude when monitoring for host endpoints. The @@ -688,8 +1403,12 @@ spec: description: InterfaceRefreshInterval is the period at which Felix rescans local interfaces to verify their state. The rescan can be disabled by setting the interval to 0. + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string ipipEnabled: + description: 'IPIPEnabled overrides whether Felix should configure + an IPIP interface on the host. Optional as Felix determines this + based on the existing IP pools. [Default: nil (unset)]' type: boolean ipipMTU: description: 'IPIPMTU is the MTU to set on the tunnel device. See @@ -700,12 +1419,22 @@ spec: all iptables state to ensure that no other process has accidentally broken Calico''s rules. Set to 0 to disable iptables refresh. 
[Default: 90s]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string iptablesBackend: description: IptablesBackend specifies which backend of iptables will - be used. The default is legacy. + be used. The default is Auto. + pattern: ^(?i)(Auto|FelixConfiguration|FelixConfigurationList|Legacy|NFT)?$ type: string iptablesFilterAllowAction: + pattern: ^(?i)(Accept|Return)?$ + type: string + iptablesFilterDenyAction: + description: IptablesFilterDenyAction controls what happens to traffic + that is denied by network policy. By default Calico blocks traffic + with an iptables "DROP" action. If you want to use "REJECT" action + instead you can configure it in here. + pattern: ^(?i)(Drop|Reject)?$ type: string iptablesLockFilePath: description: 'IptablesLockFilePath is the location of the iptables @@ -718,6 +1447,7 @@ spec: wait between attempts to acquire the iptables lock if it is not available. Lower values make Felix more responsive when the lock is contended, but use more CPU. [Default: 50ms]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string iptablesLockTimeout: description: 'IptablesLockTimeout is the time that Felix will wait @@ -726,8 +1456,10 @@ spec: also take the lock. When running Felix inside a container, this requires the /run directory of the host to be mounted into the calico/node or calico/felix container. [Default: 0s disabled]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string iptablesMangleAllowAction: + pattern: ^(?i)(Accept|Return)?$ type: string iptablesMarkMask: description: 'IptablesMarkMask is the mask that Felix selects its @@ -744,6 +1476,7 @@ spec: back in order to check the write was not clobbered by another process. This should only occur if another application on the system doesn''t respect the iptables lock. 
[Default: 1s]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string iptablesRefreshInterval: description: 'IptablesRefreshInterval is the period at which Felix @@ -754,8 +1487,11 @@ spec: was fixed in kernel version 4.11. If you are using v4.11 or greater you may want to set this to, a higher value to reduce Felix CPU usage. [Default: 10s]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string ipv6Support: + description: IPv6Support controls whether Felix enables support for + IPv6 (if supported by the in-use dataplane). type: boolean kubeNodePortRanges: description: 'KubeNodePortRanges holds list of port ranges used for @@ -769,6 +1505,12 @@ spec: pattern: ^.* x-kubernetes-int-or-string: true type: array + logDebugFilenameRegex: + description: LogDebugFilenameRegex controls which source code files + have their Debug log output included in the logs. Only logs from + files with names that match the given regular expression are included. The + filter only applies to Debug level logs. + type: string logFilePath: description: 'LogFilePath is the full path to the Felix log. Set to none to disable file logging. [Default: /var/log/calico/felix.log]' @@ -780,15 +1522,18 @@ spec: logSeverityFile: description: 'LogSeverityFile is the log severity above which logs are sent to the log file. [Default: Info]' + pattern: ^(?i)(Debug|Info|Warning|Error|Fatal)?$ type: string logSeverityScreen: description: 'LogSeverityScreen is the log severity above which logs are sent to the stdout. [Default: Info]' + pattern: ^(?i)(Debug|Info|Warning|Error|Fatal)?$ type: string logSeveritySys: description: 'LogSeveritySys is the log severity above which logs are sent to the syslog. Set to None for no logging to syslog. 
[Default: Info]' + pattern: ^(?i)(Debug|Info|Warning|Error|Fatal)?$ type: string maxIpsetSize: type: integer @@ -827,6 +1572,7 @@ spec: pattern: ^.* x-kubernetes-int-or-string: true netlinkTimeout: + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string openstackRegion: description: 'OpenstackRegion is the name of the region that a particular @@ -865,6 +1611,12 @@ spec: to false. This reduces the number of metrics reported, reducing Prometheus load. [Default: true]' type: boolean + prometheusWireGuardMetricsEnabled: + description: 'PrometheusWireGuardMetricsEnabled disables wireguard + metrics collection, which the Prometheus client does by default, + when set to false. This reduces the number of metrics reported, + reducing Prometheus load. [Default: true]' + type: boolean removeExternalRoutes: description: Whether or not to remove device routes that have not been programmed by Felix. Disabling this will allow external applications @@ -875,26 +1627,34 @@ spec: description: 'ReportingInterval is the interval at which Felix reports its status into the datastore or 0 to disable. Must be non-zero in OpenStack deployments. [Default: 30s]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string reportingTTL: description: 'ReportingTTL is the time-to-live setting for process-wide status reports. [Default: 90s]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string routeRefreshInterval: description: 'RouteRefreshInterval is the period at which Felix re-checks the routes in the dataplane to ensure that no other process has accidentally broken Calico''s rules. Set to 0 to disable route refresh. [Default: 90s]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string routeSource: description: 'RouteSource configures where Felix gets its routing information. - WorkloadIPs: use workload endpoints to construct routes. - CalicoIPAM: the default - use IPAM data to construct routes.' 
+ pattern: ^(?i)(WorkloadIPs|CalicoIPAM)?$ type: string + routeSyncDisabled: + description: RouteSyncDisabled will disable all operations performed + on the route table. Set to true to run in network-policy mode only. + type: boolean routeTableRange: - description: Calico programs additional Linux route tables for various - purposes. RouteTableRange specifies the indices of the route tables - that Calico should use. + description: Deprecated in favor of RouteTableRanges. Calico programs + additional Linux route tables for various purposes. RouteTableRange + specifies the indices of the route tables that Calico should use. properties: max: type: integer @@ -904,12 +1664,28 @@ spec: - max - min type: object + routeTableRanges: + description: Calico programs additional Linux route tables for various + purposes. RouteTableRanges specifies a set of table index ranges + that Calico should use. Deprecates`RouteTableRange`, overrides `RouteTableRange`. + items: + properties: + max: + type: integer + min: + type: integer + required: + - max + - min + type: object + type: array serviceLoopPrevention: description: 'When service IP advertisement is enabled, prevent routing loops to service IPs that are not in use, by dropping or rejecting packets that do not get DNAT''d by kube-proxy. Unless set to "Disabled", in which case such routing loops continue to be allowed. [Default: Drop]' + pattern: ^(?i)(Drop|Reject|Disabled)?$ type: string sidecarAccelerationEnabled: description: 'SidecarAccelerationEnabled enables experimental sidecar @@ -925,43 +1701,97 @@ spec: usageReportingInitialDelay: description: 'UsageReportingInitialDelay controls the minimum delay before Felix makes a report. [Default: 300s]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string usageReportingInterval: description: 'UsageReportingInterval controls the interval at which Felix makes reports. 
[Default: 86400s]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string useInternalDataplaneDriver: + description: UseInternalDataplaneDriver, if true, Felix will use its + internal dataplane programming logic. If false, it will launch + an external dataplane driver and communicate with it over protobuf. type: boolean vxlanEnabled: + description: 'VXLANEnabled overrides whether Felix should create the + VXLAN tunnel device for IPv4 VXLAN networking. Optional as Felix + determines this based on the existing IP pools. [Default: nil (unset)]' type: boolean vxlanMTU: - description: 'VXLANMTU is the MTU to set on the tunnel device. See - Configuring MTU [Default: 1440]' + description: 'VXLANMTU is the MTU to set on the IPv4 VXLAN tunnel + device. See Configuring MTU [Default: 1410]' + type: integer + vxlanMTUV6: + description: 'VXLANMTUV6 is the MTU to set on the IPv6 VXLAN tunnel + device. See Configuring MTU [Default: 1390]' type: integer vxlanPort: type: integer vxlanVNI: type: integer + windowsManageFirewallRules: + description: 'WindowsManageFirewallRules configures whether or not + Felix will program Windows Firewall rules. (to allow inbound access + to its own metrics ports) [Default: Disabled]' + enum: + - Enabled + - Disabled + type: string wireguardEnabled: - description: 'WireguardEnabled controls whether Wireguard is enabled. + description: 'WireguardEnabled controls whether Wireguard is enabled + for IPv4 (encapsulating IPv4 traffic over an IPv4 underlay network). + [Default: false]' + type: boolean + wireguardEnabledV6: + description: 'WireguardEnabledV6 controls whether Wireguard is enabled + for IPv6 (encapsulating IPv6 traffic over an IPv6 underlay network). [Default: false]' type: boolean + wireguardHostEncryptionEnabled: + description: 'WireguardHostEncryptionEnabled controls whether Wireguard + host-to-host encryption is enabled. 
[Default: false]' + type: boolean wireguardInterfaceName: description: 'WireguardInterfaceName specifies the name to use for - the Wireguard interface. [Default: wg.calico]' + the IPv4 Wireguard interface. [Default: wireguard.cali]' + type: string + wireguardInterfaceNameV6: + description: 'WireguardInterfaceNameV6 specifies the name to use for + the IPv6 Wireguard interface. [Default: wg-v6.cali]' + type: string + wireguardKeepAlive: + description: 'WireguardKeepAlive controls Wireguard PersistentKeepalive + option. Set 0 to disable. [Default: 0]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string wireguardListeningPort: description: 'WireguardListeningPort controls the listening port used - by Wireguard. [Default: 51820]' + by IPv4 Wireguard. [Default: 51820]' + type: integer + wireguardListeningPortV6: + description: 'WireguardListeningPortV6 controls the listening port + used by IPv6 Wireguard. [Default: 51821]' type: integer wireguardMTU: - description: 'WireguardMTU controls the MTU on the Wireguard interface. - See Configuring MTU [Default: 1420]' + description: 'WireguardMTU controls the MTU on the IPv4 Wireguard + interface. See Configuring MTU [Default: 1440]' + type: integer + wireguardMTUV6: + description: 'WireguardMTUV6 controls the MTU on the IPv6 Wireguard + interface. See Configuring MTU [Default: 1420]' type: integer wireguardRoutingRulePriority: description: 'WireguardRoutingRulePriority controls the priority value to use for the Wireguard routing rule. [Default: 99]' type: integer + workloadSourceSpoofing: + description: WorkloadSourceSpoofing controls whether pods can use + the allowedSourcePrefixes annotation to send traffic with a source + IP address that is not theirs. This is disabled by default. When + set to "Any", pods can request any prefix. + pattern: ^(?i)(Disabled|Any)?$ + type: string xdpEnabled: description: 'XDPEnabled enables XDP acceleration for suitable untracked incoming deny rules. 
[Default: true]' @@ -971,6 +1801,7 @@ spec: all XDP state to ensure that no other process has accidentally broken Calico''s BPF maps or attached programs. Set to 0 to disable XDP refresh. [Default: 90s]' + pattern: ^([0-9]+(\\.[0-9]+)?(ms|s|m|h))*$ type: string type: object type: object @@ -982,8 +1813,8 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -995,6 +1826,7 @@ spec: listKind: GlobalNetworkPolicyList plural: globalnetworkpolicies singular: globalnetworkpolicy + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -1050,16 +1882,17 @@ spec: contains a selector expression. Only traffic that originates from (or terminates at) endpoints within the selected namespaces will be matched. When both NamespaceSelector - and Selector are defined on the same rule, then only workload - endpoints that are matched by both selectors will be selected - by the rule. \n For NetworkPolicy, an empty NamespaceSelector - implies that the Selector is limited to selecting only - workload endpoints in the same namespace as the NetworkPolicy. - \n For NetworkPolicy, `global()` NamespaceSelector implies - that the Selector is limited to selecting only GlobalNetworkSet - or HostEndpoint. \n For GlobalNetworkPolicy, an empty - NamespaceSelector implies the Selector applies to workload - endpoints across all namespaces." + and another selector are defined on the same rule, then + only workload endpoints that are matched by both selectors + will be selected by the rule. \n For NetworkPolicy, an + empty NamespaceSelector implies that the Selector is limited + to selecting only workload endpoints in the same namespace + as the NetworkPolicy. \n For NetworkPolicy, `global()` + NamespaceSelector implies that the Selector is limited + to selecting only GlobalNetworkSet or HostEndpoint. 
\n + For GlobalNetworkPolicy, an empty NamespaceSelector implies + the Selector applies to workload endpoints across all + namespaces." type: string nets: description: Nets is an optional field that restricts the @@ -1147,6 +1980,26 @@ spec: AND'ed. type: string type: object + services: + description: "Services is an optional field that contains + options for matching Kubernetes Services. If specified, + only traffic that originates from or terminates at endpoints + within the selected service(s) will be matched, and only + to/from each endpoint's port. \n Services cannot be specified + on the same rule as Selector, NotSelector, NamespaceSelector, + Nets, NotNets or ServiceAccounts. \n Ports and NotPorts + can only be specified with Services on ingress rules." + properties: + name: + description: Name specifies the name of a Kubernetes + Service to match. + type: string + namespace: + description: Namespace specifies the namespace of the + given Service. If left empty, the rule will match + within this policy's namespace. + type: string + type: object type: object http: description: HTTP contains match criteria that apply to HTTP @@ -1255,16 +2108,17 @@ spec: contains a selector expression. Only traffic that originates from (or terminates at) endpoints within the selected namespaces will be matched. When both NamespaceSelector - and Selector are defined on the same rule, then only workload - endpoints that are matched by both selectors will be selected - by the rule. \n For NetworkPolicy, an empty NamespaceSelector - implies that the Selector is limited to selecting only - workload endpoints in the same namespace as the NetworkPolicy. - \n For NetworkPolicy, `global()` NamespaceSelector implies - that the Selector is limited to selecting only GlobalNetworkSet - or HostEndpoint. \n For GlobalNetworkPolicy, an empty - NamespaceSelector implies the Selector applies to workload - endpoints across all namespaces." 
+ and another selector are defined on the same rule, then + only workload endpoints that are matched by both selectors + will be selected by the rule. \n For NetworkPolicy, an + empty NamespaceSelector implies that the Selector is limited + to selecting only workload endpoints in the same namespace + as the NetworkPolicy. \n For NetworkPolicy, `global()` + NamespaceSelector implies that the Selector is limited + to selecting only GlobalNetworkSet or HostEndpoint. \n + For GlobalNetworkPolicy, an empty NamespaceSelector implies + the Selector applies to workload endpoints across all + namespaces." type: string nets: description: Nets is an optional field that restricts the @@ -1352,6 +2206,26 @@ spec: AND'ed. type: string type: object + services: + description: "Services is an optional field that contains + options for matching Kubernetes Services. If specified, + only traffic that originates from or terminates at endpoints + within the selected service(s) will be matched, and only + to/from each endpoint's port. \n Services cannot be specified + on the same rule as Selector, NotSelector, NamespaceSelector, + Nets, NotNets or ServiceAccounts. \n Ports and NotPorts + can only be specified with Services on ingress rules." + properties: + name: + description: Name specifies the name of a Kubernetes + Service to match. + type: string + namespace: + description: Namespace specifies the namespace of the + given Service. If left empty, the rule will match + within this policy's namespace. + type: string + type: object type: object required: - action @@ -1381,16 +2255,17 @@ spec: contains a selector expression. Only traffic that originates from (or terminates at) endpoints within the selected namespaces will be matched. When both NamespaceSelector - and Selector are defined on the same rule, then only workload - endpoints that are matched by both selectors will be selected - by the rule. 
\n For NetworkPolicy, an empty NamespaceSelector - implies that the Selector is limited to selecting only - workload endpoints in the same namespace as the NetworkPolicy. - \n For NetworkPolicy, `global()` NamespaceSelector implies - that the Selector is limited to selecting only GlobalNetworkSet - or HostEndpoint. \n For GlobalNetworkPolicy, an empty - NamespaceSelector implies the Selector applies to workload - endpoints across all namespaces." + and another selector are defined on the same rule, then + only workload endpoints that are matched by both selectors + will be selected by the rule. \n For NetworkPolicy, an + empty NamespaceSelector implies that the Selector is limited + to selecting only workload endpoints in the same namespace + as the NetworkPolicy. \n For NetworkPolicy, `global()` + NamespaceSelector implies that the Selector is limited + to selecting only GlobalNetworkSet or HostEndpoint. \n + For GlobalNetworkPolicy, an empty NamespaceSelector implies + the Selector applies to workload endpoints across all + namespaces." type: string nets: description: Nets is an optional field that restricts the @@ -1478,6 +2353,26 @@ spec: AND'ed. type: string type: object + services: + description: "Services is an optional field that contains + options for matching Kubernetes Services. If specified, + only traffic that originates from or terminates at endpoints + within the selected service(s) will be matched, and only + to/from each endpoint's port. \n Services cannot be specified + on the same rule as Selector, NotSelector, NamespaceSelector, + Nets, NotNets or ServiceAccounts. \n Ports and NotPorts + can only be specified with Services on ingress rules." + properties: + name: + description: Name specifies the name of a Kubernetes + Service to match. + type: string + namespace: + description: Namespace specifies the namespace of the + given Service. If left empty, the rule will match + within this policy's namespace. 
+ type: string + type: object type: object http: description: HTTP contains match criteria that apply to HTTP @@ -1586,16 +2481,17 @@ spec: contains a selector expression. Only traffic that originates from (or terminates at) endpoints within the selected namespaces will be matched. When both NamespaceSelector - and Selector are defined on the same rule, then only workload - endpoints that are matched by both selectors will be selected - by the rule. \n For NetworkPolicy, an empty NamespaceSelector - implies that the Selector is limited to selecting only - workload endpoints in the same namespace as the NetworkPolicy. - \n For NetworkPolicy, `global()` NamespaceSelector implies - that the Selector is limited to selecting only GlobalNetworkSet - or HostEndpoint. \n For GlobalNetworkPolicy, an empty - NamespaceSelector implies the Selector applies to workload - endpoints across all namespaces." + and another selector are defined on the same rule, then + only workload endpoints that are matched by both selectors + will be selected by the rule. \n For NetworkPolicy, an + empty NamespaceSelector implies that the Selector is limited + to selecting only workload endpoints in the same namespace + as the NetworkPolicy. \n For NetworkPolicy, `global()` + NamespaceSelector implies that the Selector is limited + to selecting only GlobalNetworkSet or HostEndpoint. \n + For GlobalNetworkPolicy, an empty NamespaceSelector implies + the Selector applies to workload endpoints across all + namespaces." type: string nets: description: Nets is an optional field that restricts the @@ -1683,6 +2579,26 @@ spec: AND'ed. type: string type: object + services: + description: "Services is an optional field that contains + options for matching Kubernetes Services. If specified, + only traffic that originates from or terminates at endpoints + within the selected service(s) will be matched, and only + to/from each endpoint's port. 
\n Services cannot be specified + on the same rule as Selector, NotSelector, NamespaceSelector, + Nets, NotNets or ServiceAccounts. \n Ports and NotPorts + can only be specified with Services on ingress rules." + properties: + name: + description: Name specifies the name of a Kubernetes + Service to match. + type: string + namespace: + description: Namespace specifies the namespace of the + given Service. If left empty, the rule will match + within this policy's namespace. + type: string + type: object type: object required: - action @@ -1700,6 +2616,19 @@ spec: with identical order will be applied in alphanumerical order based on the Policy "Name". type: number + performanceHints: + description: "PerformanceHints contains a list of hints to Calico's + policy engine to help process the policy more efficiently. Hints + never change the enforcement behaviour of the policy. \n Currently, + the only available hint is \"AssumeNeededOnEveryNode\". When that + hint is set on a policy, Felix will act as if the policy matches + a local endpoint even if it does not. This is useful for \"preloading\" + any large static policies that are known to be used on every node. + If the policy is _not_ used on a particular node then the work done + to preload the policy (and to maintain it) is wasted." + items: + type: string + type: array preDNAT: description: PreDNAT indicates to apply the rules in this policy before any DNAT. 
@@ -1753,8 +2682,8 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -1766,6 +2695,7 @@ spec: listKind: GlobalNetworkSetList plural: globalnetworksets singular: globalnetworkset + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -1806,8 +2736,8 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -1819,6 +2749,7 @@ spec: listKind: HostEndpointList plural: hostendpoints singular: hostendpoint + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -1914,8 +2845,8 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -1927,6 +2858,7 @@ spec: listKind: IPAMBlockList plural: ipamblocks singular: ipamblock + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -1950,8 +2882,16 @@ spec: resource. properties: affinity: + description: Affinity of the block, if this block has one. If set, + it will be of the form "host:". If not set, this block + is not affine to a host. type: string allocations: + description: Array of allocations in-use within this block. nil entries + mean the allocation is free. For non-nil entries at index i, the + index is the ordinal of the allocation within this block and the + value is the index of the associated attributes in the Attributes + array. items: type: integer # TODO: This nullable is manually added in. We should update controller-gen @@ -1959,6 +2899,10 @@ spec: nullable: true type: array attributes: + description: Attributes is an array of arbitrary metadata associated + with allocations in the block. 
To find attributes for a given allocation, + use the value of the allocation's entry in the Allocations array + as the index of the element in this array. items: properties: handle_id: @@ -1970,12 +2914,38 @@ spec: type: object type: array cidr: + description: The block's CIDR. type: string deleted: + description: Deleted is an internal boolean used to workaround a limitation + in the Kubernetes API whereby deletion will not return a conflict + error if the block has been updated. It should not be set manually. type: boolean + sequenceNumber: + default: 0 + description: We store a sequence number that is updated each time + the block is written. Each allocation will also store the sequence + number of the block at the time of its creation. When releasing + an IP, passing the sequence number associated with the allocation + allows us to protect against a race condition and ensure the IP + hasn't been released and re-allocated since the release request. + format: int64 + type: integer + sequenceNumberForAllocation: + additionalProperties: + format: int64 + type: integer + description: Map of allocated ordinal within the block to sequence + number of the block at the time of allocation. Kubernetes does not + allow numerical keys for maps, so the key is cast to a string. + type: object strictAffinity: + description: StrictAffinity on the IPAMBlock is deprecated and no + longer used by the code. Use IPAMConfig StrictAffinity instead. type: boolean unallocated: + description: Unallocated is an ordered list of allocations which are + free in the block. 
items: type: integer type: array @@ -1995,8 +2965,8 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -2008,6 +2978,7 @@ spec: listKind: IPAMConfigList plural: ipamconfigs singular: ipamconfig + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -2035,6 +3006,8 @@ spec: maxBlocksPerHost: description: MaxBlocksPerHost, if non-zero, is the max number of blocks that can be affine to each host. + maximum: 2147483647 + minimum: 0 type: integer strictAffinity: type: boolean @@ -2051,8 +3024,8 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -2064,6 +3037,7 @@ spec: listKind: IPAMHandleList plural: ipamhandles singular: ipamhandle + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -2107,8 +3081,8 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -2120,6 +3094,7 @@ spec: listKind: IPPoolList plural: ippools singular: ippool + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -2141,13 +3116,23 @@ spec: spec: description: IPPoolSpec contains the specification for an IPPool resource. properties: + allowedUses: + description: AllowedUse controls what the IP pool will be used for. If + not specified or empty, defaults to ["Tunnel", "Workload"] for back-compatibility + items: + type: string + type: array blockSize: description: The block size to use for IP address assignments from - this pool. Defaults to 26 for IPv4 and 112 for IPv6. + this pool. Defaults to 26 for IPv4 and 122 for IPv6. type: integer cidr: description: The pool CIDR. 
type: string + disableBGPExport: + description: 'Disable exporting routes from this IP Pool''s CIDR over + BGP. [Default: false]' + type: boolean disabled: description: When disabled is true, Calico IPAM will not assign addresses from this pool. @@ -2181,7 +3166,7 @@ spec: for internal use only.' type: boolean natOutgoing: - description: When nat-outgoing is true, packets sent from Calico networked + description: When natOutgoing is true, packets sent from Calico networked containers in this pool to destinations outside of this pool will be masqueraded. type: boolean @@ -2206,8 +3191,63 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: (devel) + creationTimestamp: null + name: ipreservations.crd.projectcalico.org +spec: + group: crd.projectcalico.org + names: + kind: IPReservation + listKind: IPReservationList + plural: ipreservations + singular: ipreservation + preserveUnknownFields: false + scope: Cluster + versions: + - name: v1 + schema: + openAPIV3Schema: + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: IPReservationSpec contains the specification for an IPReservation + resource. 
+ properties: + reservedCIDRs: + description: ReservedCIDRs is a list of CIDRs and/or IP addresses + that Calico IPAM will exclude from new allocations. + items: + type: string + type: array + type: object + type: object + served: true + storage: true +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] +--- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -2219,6 +3259,7 @@ spec: listKind: KubeControllersConfigurationList plural: kubecontrollersconfigurations singular: kubecontrollersconfiguration + preserveUnknownFields: false scope: Cluster versions: - name: v1 @@ -2267,6 +3308,11 @@ spec: host endpoints for every node. [Default: Disabled]' type: string type: object + leakGracePeriod: + description: 'LeakGracePeriod is the period used by the controller + to determine if an IP address has been leaked. Set to 0 + to disable IP garbage collection. [Default: 15m]' + type: string reconcilerPeriod: description: 'ReconcilerPeriod is the period to perform reconciliation with the Calico datastore. [Default: 5m]' @@ -2304,6 +3350,11 @@ spec: type: string type: object type: object + debugProfilePort: + description: DebugProfilePort configures the port to serve memory + and cpu profiles on. If not specified, profiling is disabled. + format: int32 + type: integer etcdV3CompactionPeriod: description: 'EtcdV3CompactionPeriod is the period between etcdv3 compaction requests. Set to 0 to disable. [Default: 10m]' @@ -2367,6 +3418,12 @@ spec: of host endpoints for every node. [Default: Disabled]' type: string type: object + leakGracePeriod: + description: 'LeakGracePeriod is the period used by the + controller to determine if an IP address has been leaked. + Set to 0 to disable IP garbage collection. [Default: + 15m]' + type: string reconcilerPeriod: description: 'ReconcilerPeriod is the period to perform reconciliation with the Calico datastore. 
[Default: @@ -2408,6 +3465,11 @@ spec: type: string type: object type: object + debugProfilePort: + description: DebugProfilePort configures the port to serve memory + and cpu profiles on. If not specified, profiling is disabled. + format: int32 + type: integer etcdV3CompactionPeriod: description: 'EtcdV3CompactionPeriod is the period between etcdv3 compaction requests. Set to 0 to disable. [Default: 10m]' @@ -2438,8 +3500,8 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -2451,6 +3513,7 @@ spec: listKind: NetworkPolicyList plural: networkpolicies singular: networkpolicy + preserveUnknownFields: false scope: Namespaced versions: - name: v1 @@ -2495,16 +3558,17 @@ spec: contains a selector expression. Only traffic that originates from (or terminates at) endpoints within the selected namespaces will be matched. When both NamespaceSelector - and Selector are defined on the same rule, then only workload - endpoints that are matched by both selectors will be selected - by the rule. \n For NetworkPolicy, an empty NamespaceSelector - implies that the Selector is limited to selecting only - workload endpoints in the same namespace as the NetworkPolicy. - \n For NetworkPolicy, `global()` NamespaceSelector implies - that the Selector is limited to selecting only GlobalNetworkSet - or HostEndpoint. \n For GlobalNetworkPolicy, an empty - NamespaceSelector implies the Selector applies to workload - endpoints across all namespaces." + and another selector are defined on the same rule, then + only workload endpoints that are matched by both selectors + will be selected by the rule. \n For NetworkPolicy, an + empty NamespaceSelector implies that the Selector is limited + to selecting only workload endpoints in the same namespace + as the NetworkPolicy. 
\n For NetworkPolicy, `global()` + NamespaceSelector implies that the Selector is limited + to selecting only GlobalNetworkSet or HostEndpoint. \n + For GlobalNetworkPolicy, an empty NamespaceSelector implies + the Selector applies to workload endpoints across all + namespaces." type: string nets: description: Nets is an optional field that restricts the @@ -2592,6 +3656,26 @@ spec: AND'ed. type: string type: object + services: + description: "Services is an optional field that contains + options for matching Kubernetes Services. If specified, + only traffic that originates from or terminates at endpoints + within the selected service(s) will be matched, and only + to/from each endpoint's port. \n Services cannot be specified + on the same rule as Selector, NotSelector, NamespaceSelector, + Nets, NotNets or ServiceAccounts. \n Ports and NotPorts + can only be specified with Services on ingress rules." + properties: + name: + description: Name specifies the name of a Kubernetes + Service to match. + type: string + namespace: + description: Namespace specifies the namespace of the + given Service. If left empty, the rule will match + within this policy's namespace. + type: string + type: object type: object http: description: HTTP contains match criteria that apply to HTTP @@ -2700,16 +3784,17 @@ spec: contains a selector expression. Only traffic that originates from (or terminates at) endpoints within the selected namespaces will be matched. When both NamespaceSelector - and Selector are defined on the same rule, then only workload - endpoints that are matched by both selectors will be selected - by the rule. \n For NetworkPolicy, an empty NamespaceSelector - implies that the Selector is limited to selecting only - workload endpoints in the same namespace as the NetworkPolicy. - \n For NetworkPolicy, `global()` NamespaceSelector implies - that the Selector is limited to selecting only GlobalNetworkSet - or HostEndpoint. 
\n For GlobalNetworkPolicy, an empty - NamespaceSelector implies the Selector applies to workload - endpoints across all namespaces." + and another selector are defined on the same rule, then + only workload endpoints that are matched by both selectors + will be selected by the rule. \n For NetworkPolicy, an + empty NamespaceSelector implies that the Selector is limited + to selecting only workload endpoints in the same namespace + as the NetworkPolicy. \n For NetworkPolicy, `global()` + NamespaceSelector implies that the Selector is limited + to selecting only GlobalNetworkSet or HostEndpoint. \n + For GlobalNetworkPolicy, an empty NamespaceSelector implies + the Selector applies to workload endpoints across all + namespaces." type: string nets: description: Nets is an optional field that restricts the @@ -2797,6 +3882,26 @@ spec: AND'ed. type: string type: object + services: + description: "Services is an optional field that contains + options for matching Kubernetes Services. If specified, + only traffic that originates from or terminates at endpoints + within the selected service(s) will be matched, and only + to/from each endpoint's port. \n Services cannot be specified + on the same rule as Selector, NotSelector, NamespaceSelector, + Nets, NotNets or ServiceAccounts. \n Ports and NotPorts + can only be specified with Services on ingress rules." + properties: + name: + description: Name specifies the name of a Kubernetes + Service to match. + type: string + namespace: + description: Namespace specifies the namespace of the + given Service. If left empty, the rule will match + within this policy's namespace. + type: string + type: object type: object required: - action @@ -2826,16 +3931,17 @@ spec: contains a selector expression. Only traffic that originates from (or terminates at) endpoints within the selected namespaces will be matched. 
When both NamespaceSelector - and Selector are defined on the same rule, then only workload - endpoints that are matched by both selectors will be selected - by the rule. \n For NetworkPolicy, an empty NamespaceSelector - implies that the Selector is limited to selecting only - workload endpoints in the same namespace as the NetworkPolicy. - \n For NetworkPolicy, `global()` NamespaceSelector implies - that the Selector is limited to selecting only GlobalNetworkSet - or HostEndpoint. \n For GlobalNetworkPolicy, an empty - NamespaceSelector implies the Selector applies to workload - endpoints across all namespaces." + and another selector are defined on the same rule, then + only workload endpoints that are matched by both selectors + will be selected by the rule. \n For NetworkPolicy, an + empty NamespaceSelector implies that the Selector is limited + to selecting only workload endpoints in the same namespace + as the NetworkPolicy. \n For NetworkPolicy, `global()` + NamespaceSelector implies that the Selector is limited + to selecting only GlobalNetworkSet or HostEndpoint. \n + For GlobalNetworkPolicy, an empty NamespaceSelector implies + the Selector applies to workload endpoints across all + namespaces." type: string nets: description: Nets is an optional field that restricts the @@ -2923,6 +4029,26 @@ spec: AND'ed. type: string type: object + services: + description: "Services is an optional field that contains + options for matching Kubernetes Services. If specified, + only traffic that originates from or terminates at endpoints + within the selected service(s) will be matched, and only + to/from each endpoint's port. \n Services cannot be specified + on the same rule as Selector, NotSelector, NamespaceSelector, + Nets, NotNets or ServiceAccounts. \n Ports and NotPorts + can only be specified with Services on ingress rules." + properties: + name: + description: Name specifies the name of a Kubernetes + Service to match. 
+ type: string + namespace: + description: Namespace specifies the namespace of the + given Service. If left empty, the rule will match + within this policy's namespace. + type: string + type: object type: object http: description: HTTP contains match criteria that apply to HTTP @@ -3031,16 +4157,17 @@ spec: contains a selector expression. Only traffic that originates from (or terminates at) endpoints within the selected namespaces will be matched. When both NamespaceSelector - and Selector are defined on the same rule, then only workload - endpoints that are matched by both selectors will be selected - by the rule. \n For NetworkPolicy, an empty NamespaceSelector - implies that the Selector is limited to selecting only - workload endpoints in the same namespace as the NetworkPolicy. - \n For NetworkPolicy, `global()` NamespaceSelector implies - that the Selector is limited to selecting only GlobalNetworkSet - or HostEndpoint. \n For GlobalNetworkPolicy, an empty - NamespaceSelector implies the Selector applies to workload - endpoints across all namespaces." + and another selector are defined on the same rule, then + only workload endpoints that are matched by both selectors + will be selected by the rule. \n For NetworkPolicy, an + empty NamespaceSelector implies that the Selector is limited + to selecting only workload endpoints in the same namespace + as the NetworkPolicy. \n For NetworkPolicy, `global()` + NamespaceSelector implies that the Selector is limited + to selecting only GlobalNetworkSet or HostEndpoint. \n + For GlobalNetworkPolicy, an empty NamespaceSelector implies + the Selector applies to workload endpoints across all + namespaces." type: string nets: description: Nets is an optional field that restricts the @@ -3128,6 +4255,26 @@ spec: AND'ed. type: string type: object + services: + description: "Services is an optional field that contains + options for matching Kubernetes Services. 
If specified, + only traffic that originates from or terminates at endpoints + within the selected service(s) will be matched, and only + to/from each endpoint's port. \n Services cannot be specified + on the same rule as Selector, NotSelector, NamespaceSelector, + Nets, NotNets or ServiceAccounts. \n Ports and NotPorts + can only be specified with Services on ingress rules." + properties: + name: + description: Name specifies the name of a Kubernetes + Service to match. + type: string + namespace: + description: Namespace specifies the namespace of the + given Service. If left empty, the rule will match + within this policy's namespace. + type: string + type: object type: object required: - action @@ -3141,6 +4288,19 @@ spec: with identical order will be applied in alphanumerical order based on the Policy "Name". type: number + performanceHints: + description: "PerformanceHints contains a list of hints to Calico's + policy engine to help process the policy more efficiently. Hints + never change the enforcement behaviour of the policy. \n Currently, + the only available hint is \"AssumeNeededOnEveryNode\". When that + hint is set on a policy, Felix will act as if the policy matches + a local endpoint even if it does not. This is useful for \"preloading\" + any large static policies that are known to be used on every node. + If the policy is _not_ used on a particular node then the work done + to preload the policy (and to maintain it) is wasted." + items: + type: string + type: array selector: description: "The selector is an expression used to pick pick out the endpoints that the policy should be applied to. 
\n Selector @@ -3190,8 +4350,8 @@ status: plural: "" conditions: [] storedVersions: [] - --- +# Source: calico/templates/kdd-crds.yaml apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: @@ -3203,6 +4363,7 @@ spec: listKind: NetworkSetList plural: networksets singular: networkset + preserveUnknownFields: false scope: Namespaced versions: - name: v1 @@ -3241,11 +4402,8 @@ status: plural: "" conditions: [] storedVersions: [] - ---- --- # Source: calico/templates/calico-kube-controllers-rbac.yaml - # Include a clusterrole for the kube-controllers component, # and bind it to the calico-kube-controllers serviceaccount. kind: ClusterRole @@ -3261,16 +4419,18 @@ rules: - watch - list - get - # Pods are queried to check for existence. + # Pods are watched to check for existence as part of IPAM controller. - apiGroups: [""] resources: - pods verbs: - get - # IPAM resources are manipulated when nodes are deleted. + - list + - watch + # IPAM resources are manipulated in response to node and block updates, as well as periodic triggers. - apiGroups: ["crd.projectcalico.org"] resources: - - ippools + - ipreservations verbs: - list - apiGroups: ["crd.projectcalico.org"] @@ -3285,6 +4445,13 @@ rules: - update - delete - watch + # Pools are watched to maintain a mapping of blocks to IP pools. + - apiGroups: ["crd.projectcalico.org"] + resources: + - ippools + verbs: + - list + - watch # kube-controllers manages hostendpoints. 
- apiGroups: ["crd.projectcalico.org"] resources: @@ -3301,8 +4468,10 @@ rules: - clusterinformations verbs: - get + - list - create - update + - watch # KubeControllersConfiguration is where it gets its config - apiGroups: ["crd.projectcalico.org"] resources: @@ -3316,21 +4485,6 @@ rules: - update # watch for changes - watch ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: calico-kube-controllers -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: calico-kube-controllers -subjects: -- kind: ServiceAccount - name: calico-kube-controllers - namespace: kube-system ---- - --- # Source: calico/templates/calico-node-rbac.yaml # Include a clusterrole for the calico-node DaemonSet, @@ -3340,6 +4494,14 @@ apiVersion: rbac.authorization.k8s.io/v1 metadata: name: calico-node rules: + # Used for creating service account tokens to be used by the CNI plugin + - apiGroups: [""] + resources: + - serviceaccounts/token + resourceNames: + - calico-cni-plugin + verbs: + - create # The CNI plugin needs to get pods, nodes, and namespaces. - apiGroups: [""] resources: @@ -3348,6 +4510,14 @@ rules: - namespaces verbs: - get + # EndpointSlices are used for Service-based network policy rule + # enforcement. + - apiGroups: ["discovery.k8s.io"] + resources: + - endpointslices + verbs: + - watch + - list - apiGroups: [""] resources: - endpoints @@ -3400,9 +4570,11 @@ rules: - globalfelixconfigs - felixconfigurations - bgppeers + - bgpfilters - globalbgpconfigs - bgpconfigurations - ippools + - ipreservations - ipamblocks - globalnetworkpolicies - globalnetworksets @@ -3411,6 +4583,7 @@ rules: - clusterinformations - hostendpoints - blockaffinities + - caliconodestatuses verbs: - get - list @@ -3424,6 +4597,12 @@ rules: verbs: - create - update + # Calico must update some CRDs. 
+ - apiGroups: [ "crd.projectcalico.org" ] + resources: + - caliconodestatuses + verbs: + - update # Calico stores some configuration information on the node. - apiGroups: [""] resources: @@ -3453,11 +4632,14 @@ rules: - create - update - delete + # The CNI plugin and calico/node need to be able to create a default + # IPAMConfiguration - apiGroups: ["crd.projectcalico.org"] resources: - ipamconfigs verbs: - get + - create # Block affinities must also be watchable by confd for route aggregation. - apiGroups: ["crd.projectcalico.org"] resources: @@ -3471,8 +4653,57 @@ rules: - daemonsets verbs: - get - --- +# Source: calico/templates/calico-node-rbac.yaml +# CNI cluster role +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: calico-cni-plugin +rules: + - apiGroups: [""] + resources: + - pods + - nodes + - namespaces + verbs: + - get + - apiGroups: [""] + resources: + - pods/status + verbs: + - patch + - apiGroups: ["crd.projectcalico.org"] + resources: + - blockaffinities + - ipamblocks + - ipamhandles + - clusterinformations + - ippools + - ipreservations + - ipamconfigs + verbs: + - get + - list + - create + - update + - delete +--- +# Source: calico/templates/calico-kube-controllers-rbac.yaml +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: calico-kube-controllers +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: calico-kube-controllers +subjects: +- kind: ServiceAccount + name: calico-kube-controllers + namespace: kube-system +--- +# Source: calico/templates/calico-node-rbac.yaml apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: @@ -3485,7 +4716,20 @@ subjects: - kind: ServiceAccount name: calico-node namespace: kube-system - +--- +# Source: calico/templates/calico-node-rbac.yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: calico-cni-plugin +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: 
ClusterRole + name: calico-cni-plugin +subjects: +- kind: ServiceAccount + name: calico-cni-plugin + namespace: kube-system --- # Source: calico/templates/calico-node.yaml # This manifest installs the calico-node container, as well @@ -3533,7 +4777,8 @@ spec: # It can be deleted if this is a fresh installation, or if you have already # upgraded to use calico-ipam. - name: upgrade-ipam - image: docker.io/calico/cni:v3.18.0 + image: docker.io/calico/cni:v3.27.2 + imagePullPolicy: IfNotPresent command: ["/opt/cni/bin/calico-ipam", "-upgrade"] envFrom: - configMapRef: @@ -3560,7 +4805,8 @@ spec: # This container installs the CNI binaries # and CNI network config file on each node. - name: install-cni - image: docker.io/calico/cni:v3.18.0 + image: docker.io/calico/cni:v3.27.2 + imagePullPolicy: IfNotPresent command: ["/opt/cni/bin/install"] envFrom: - configMapRef: @@ -3598,13 +4844,29 @@ spec: name: cni-net-dir securityContext: privileged: true - # Adds a Flex Volume Driver that creates a per-pod Unix Domain Socket to allow Dikastes - # to communicate with Felix over the Policy Sync API. - - name: flexvol-driver - image: docker.io/calico/pod2daemon-flexvol:v3.18.0 + # This init container mounts the necessary filesystems needed by the BPF data plane + # i.e. bpf at /sys/fs/bpf and cgroup2 at /run/calico/cgroup. Calico-node initialisation is executed + # in best effort fashion, i.e. no failure for errors, to not disrupt pod creation in iptable mode. + - name: "mount-bpffs" + image: docker.io/calico/node:v3.27.2 + imagePullPolicy: IfNotPresent + command: ["calico-node", "-init", "-best-effort"] volumeMounts: - - name: flexvol-driver-host - mountPath: /host/driver + - mountPath: /sys/fs + name: sys-fs + # Bidirectional is required to ensure that the new mount we make at /sys/fs/bpf propagates to the host + # so that it outlives the init container. 
+ mountPropagation: Bidirectional + - mountPath: /var/run/calico + name: var-run-calico + # Bidirectional is required to ensure that the new mount we make at /run/calico/cgroup propagates to the host + # so that it outlives the init container. + mountPropagation: Bidirectional + # Mount /proc/ from host which usually is an init program at /nodeproc. It's needed by mountns binary, + # executed by calico-node, to mount root cgroup2 fs at /run/calico/cgroup to attach CTLB programs correctly. + - mountPath: /nodeproc + name: nodeproc + readOnly: true securityContext: privileged: true containers: @@ -3612,7 +4874,8 @@ spec: # container programs network policy and routes on each # host. - name: calico-node - image: docker.io/calico/node:v3.18.0 + image: docker.io/calico/node:v3.27.2 + imagePullPolicy: IfNotPresent envFrom: - configMapRef: # Allow KUBERNETES_SERVICE_HOST and KUBERNETES_SERVICE_PORT to be overridden for eBPF mode. @@ -3648,6 +4911,9 @@ spec: # Enable or Disable VXLAN on the default IP pool. - name: CALICO_IPV4POOL_VXLAN value: "Never" + # Enable or Disable VXLAN on the default IPv6 IP pool. + - name: CALICO_IPV6POOL_VXLAN + value: "Never" # Set MTU for tunnel device used if ipip is enabled - name: FELIX_IPINIPMTU valueFrom: @@ -3680,9 +4946,6 @@ spec: # Disable IPv6 on Kubernetes. - name: FELIX_IPV6SUPPORT value: "false" - # Set Felix logging to "info" - - name: FELIX_LOGSEVERITYSCREEN - value: "info" - name: FELIX_HEALTHENABLED value: "true" securityContext: @@ -3690,6 +4953,12 @@ spec: resources: requests: cpu: 250m + lifecycle: + preStop: + exec: + command: + - /bin/calico-node + - -shutdown livenessProbe: exec: command: @@ -3699,6 +4968,7 @@ spec: periodSeconds: 10 initialDelaySeconds: 10 failureThreshold: 6 + timeoutSeconds: 10 readinessProbe: exec: command: @@ -3706,7 +4976,12 @@ spec: - -felix-ready - -bird-ready periodSeconds: 10 + timeoutSeconds: 10 volumeMounts: + # For maintaining CNI plugin API credentials. 
+ - mountPath: /host/etc/cni/net.d + name: cni-net-dir + readOnly: false - mountPath: /lib/modules name: lib-modules readOnly: true @@ -3723,11 +4998,8 @@ spec: mountPath: /var/run/nodeagent # For eBPF mode, we need to be able to mount the BPF filesystem at /sys/fs/bpf so we mount in the # parent directory. - - name: sysfs - mountPath: /sys/fs/ - # Bidirectional means that, if we mount the BPF filesystem at /sys/fs/bpf it will propagate to the host. - # If the host is known to mount that filesystem already then Bidirectional can be omitted. - mountPropagation: Bidirectional + - name: bpffs + mountPath: /sys/fs/bpf - name: cni-log-dir mountPath: /var/log/calico/cni readOnly: true @@ -3739,21 +5011,32 @@ spec: - name: var-run-calico hostPath: path: /var/run/calico + type: DirectoryOrCreate - name: var-lib-calico hostPath: path: /var/lib/calico + type: DirectoryOrCreate - name: xtables-lock hostPath: path: /run/xtables.lock type: FileOrCreate - - name: sysfs + - name: sys-fs hostPath: path: /sys/fs/ type: DirectoryOrCreate + - name: bpffs + hostPath: + path: /sys/fs/bpf + type: Directory + # mount /proc at /nodeproc to be used by mount-bpffs initContainer to mount root cgroup2 fs. + - name: nodeproc + hostPath: + path: /proc # Used to install CNI. 
- name: cni-bin-dir hostPath: path: /opt/cni/bin + type: DirectoryOrCreate - name: cni-net-dir hostPath: path: /etc/cni/net.d @@ -3772,19 +5055,6 @@ spec: hostPath: type: DirectoryOrCreate path: /var/run/nodeagent - # Used to install Flex Volume Driver - - name: flexvol-driver-host - hostPath: - type: DirectoryOrCreate - path: /usr/libexec/kubernetes/kubelet-plugins/volume/exec/nodeagent~uds ---- - -apiVersion: v1 -kind: ServiceAccount -metadata: - name: calico-node - namespace: kube-system - --- # Source: calico/templates/calico-kube-controllers.yaml # See https://github.com/projectcalico/kube-controllers @@ -3818,55 +5088,33 @@ spec: operator: Exists - key: node-role.kubernetes.io/master effect: NoSchedule + - key: node-role.kubernetes.io/control-plane + effect: NoSchedule serviceAccountName: calico-kube-controllers priorityClassName: system-cluster-critical containers: - name: calico-kube-controllers - image: docker.io/calico/kube-controllers:v3.18.0 + image: docker.io/calico/kube-controllers:v3.27.2 + imagePullPolicy: IfNotPresent env: # Choose which controllers to run. 
- name: ENABLED_CONTROLLERS value: node - name: DATASTORE_TYPE value: kubernetes + livenessProbe: + exec: + command: + - /usr/bin/check-status + - -l + periodSeconds: 10 + initialDelaySeconds: 10 + failureThreshold: 6 + timeoutSeconds: 10 readinessProbe: exec: command: - /usr/bin/check-status - -r - ---- - -apiVersion: v1 -kind: ServiceAccount -metadata: - name: calico-kube-controllers - namespace: kube-system - ---- - -# This manifest creates a Pod Disruption Budget for Controller to allow K8s Cluster Autoscaler to evict - -apiVersion: policy/v1beta1 -kind: PodDisruptionBudget -metadata: - name: calico-kube-controllers - namespace: kube-system - labels: - k8s-app: calico-kube-controllers -spec: - maxUnavailable: 1 - selector: - matchLabels: - k8s-app: calico-kube-controllers - ---- -# Source: calico/templates/calico-etcd-secrets.yaml - ---- -# Source: calico/templates/calico-typha.yaml - ---- -# Source: calico/templates/configure-canal.yaml - - + periodSeconds: 10 + \ No newline at end of file diff --git a/cluster-provision/k8s/1.28/manifests/cni_ipv6.diff b/cluster-provision/k8s/1.28/manifests/cni_ipv6.diff index 45cae03d34..bff751f9ff 100644 --- a/cluster-provision/k8s/1.28/manifests/cni_ipv6.diff +++ b/cluster-provision/k8s/1.28/manifests/cni_ipv6.diff @@ -1,6 +1,6 @@ ---- a/cluster-provision/k8s/1.24/manifests/cni.do-not-change.yaml -+++ b/cluster-provision/k8s/1.24/manifests/cni.do-not-change.yaml -@@ -32,7 +32,12 @@ +--- a/cluster-provision/k8s/1.28/manifests/cni.do-not-change.yaml ++++ b/cluster-provision/k8s/1.28/manifests/cni.do-not-change.yaml +@@ -69,8 +69,13 @@ data: "nodename": "__KUBERNETES_NODE_NAME__", "mtu": __CNI_MTU__, "ipam": { @@ -8,50 +8,55 @@ + "type": "calico-ipam", + "assign_ipv4": "false", + "assign_ipv6": "true" -+ }, + }, + "container_settings": { + "allow_ip_forwarding": true - }, ++ }, "policy": { "type": "k8s" -@@ -3533,7 +3538,7 @@ + }, +@@ -4777,7 +4782,7 @@ spec: # It can be deleted if this is a fresh installation, or if you have 
already # upgraded to use calico-ipam. - name: upgrade-ipam -- image: docker.io/calico/cni:v3.18.0 -+ image: quay.io/calico/cni:v3.18.0 +- image: docker.io/calico/cni:v3.27.2 ++ image: quay.io/calico/cni:v3.27.2 + imagePullPolicy: IfNotPresent command: ["/opt/cni/bin/calico-ipam", "-upgrade"] envFrom: - - configMapRef: -@@ -3560,7 +3565,7 @@ +@@ -4805,7 +4810,7 @@ spec: # This container installs the CNI binaries # and CNI network config file on each node. - name: install-cni -- image: docker.io/calico/cni:v3.18.0 -+ image: quay.io/calico/cni:v3.18.0 +- image: docker.io/calico/cni:v3.27.2 ++ image: quay.io/calico/cni:v3.27.2 + imagePullPolicy: IfNotPresent command: ["/opt/cni/bin/install"] envFrom: - - configMapRef: -@@ -3601,7 +3606,7 @@ - # Adds a Flex Volume Driver that creates a per-pod Unix Domain Socket to allow Dikastes - # to communicate with Felix over the Policy Sync API. - - name: flexvol-driver -- image: docker.io/calico/pod2daemon-flexvol:v3.18.0 -+ image: quay.io/calico/pod2daemon-flexvol:v3.18.0 +@@ -4848,7 +4853,7 @@ spec: + # i.e. bpf at /sys/fs/bpf and cgroup2 at /run/calico/cgroup. Calico-node initialisation is executed + # in best effort fashion, i.e. no failure for errors, to not disrupt pod creation in iptable mode. + - name: "mount-bpffs" +- image: docker.io/calico/node:v3.27.2 ++ image: quay.io/calico/node:v3.27.2 + imagePullPolicy: IfNotPresent + command: ["calico-node", "-init", "-best-effort"] volumeMounts: - - name: flexvol-driver-host - mountPath: /host/driver -@@ -3612,7 +3617,7 @@ +@@ -4874,7 +4879,7 @@ spec: # container programs network policy and routes on each # host. - name: calico-node -- image: docker.io/calico/node:v3.18.0 -+ image: quay.io/calico/node:v3.18.0 +- image: docker.io/calico/node:v3.27.2 ++ image: quay.io/calico/node:v3.27.2 + imagePullPolicy: IfNotPresent envFrom: - configMapRef: - # Allow KUBERNETES_SERVICE_HOST and KUBERNETES_SERVICE_PORT to be overridden for eBPF mode. 
-@@ -3641,10 +3646,10 @@ +@@ -4902,12 +4907,14 @@ spec: + # Cluster type to identify the deployment type + - name: CLUSTER_TYPE value: "k8s,bgp" ++ - name: IP_AUTODETECTION_METHOD ++ value: "interface=eth.*" # Auto-detect the BGP IP address. - name: IP - value: "autodetect" @@ -63,24 +68,21 @@ # Enable or Disable VXLAN on the default IP pool. - name: CALICO_IPV4POOL_VXLAN value: "Never" -@@ -3671,6 +3676,8 @@ - # no effect. This should fall within `--cluster-cidr`. +@@ -4938,6 +4945,8 @@ spec: # - name: CALICO_IPV4POOL_CIDR # value: "192.168.0.0/16" + # Disable file logging so `kubectl logs` works. + - name: IP6 + value: "autodetect" - # Disable file logging so `kubectl logs` works. - name: CALICO_DISABLE_FILE_LOGGING value: "true" -@@ -3679,12 +3686,16 @@ + # Set Felix endpoint to host default action to ACCEPT. +@@ -4945,9 +4954,13 @@ spec: value: "ACCEPT" # Disable IPv6 on Kubernetes. - name: FELIX_IPV6SUPPORT - value: "false" + value: "true" - # Set Felix logging to "info" - - name: FELIX_LOGSEVERITYSCREEN - value: "info" - name: FELIX_HEALTHENABLED value: "true" + - name: CALICO_IPV6POOL_NAT_OUTGOING @@ -90,12 +92,8 @@ securityContext: privileged: true resources: -@@ -3818,11 +3829,16 @@ - operator: Exists - - key: node-role.kubernetes.io/master +@@ -5092,9 +5105,12 @@ spec: effect: NoSchedule -+ - key: node-role.kubernetes.io/control-plane -+ effect: NoSchedule serviceAccountName: calico-kube-controllers priorityClassName: system-cluster-critical + securityContext: @@ -103,17 +101,8 @@ + type: spc_t containers: - name: calico-kube-controllers -- image: docker.io/calico/kube-controllers:v3.18.0 -+ image: quay.io/calico/kube-controllers:v3.18.0 +- image: docker.io/calico/kube-controllers:v3.27.2 ++ image: quay.io/calico/kube-controllers:v3.27.2 + imagePullPolicy: IfNotPresent env: - # Choose which controllers to run. 
- - name: ENABLED_CONTROLLERS -@@ -3847,7 +3863,7 @@ - - # This manifest creates a Pod Disruption Budget for Controller to allow K8s Cluster Autoscaler to evict - --apiVersion: policy/v1beta1 -+apiVersion: policy/v1 - kind: PodDisruptionBudget - metadata: - name: calico-kube-controllers + # Choose which controllers to run. \ No newline at end of file diff --git a/cluster-provision/k8s/1.28/manifests/local-volume.yaml b/cluster-provision/k8s/1.28/manifests/local-volume.yaml index 5237ea99d4..93012326df 100644 --- a/cluster-provision/k8s/1.28/manifests/local-volume.yaml +++ b/cluster-provision/k8s/1.28/manifests/local-volume.yaml @@ -99,7 +99,7 @@ spec: spec: serviceAccountName: local-storage-admin containers: - - image: "quay.io/external_storage/local-volume-provisioner:v2.3.2" + - image: "quay.io/external_storage/local-volume-provisioner:v2.5.0" name: provisioner securityContext: privileged: true @@ -113,7 +113,7 @@ spec: fieldRef: fieldPath: metadata.namespace - name: JOB_CONTAINER_IMAGE - value: "quay.io/external_storage/local-volume-provisioner:v2.3.2" + value: "quay.io/external_storage/local-volume-provisioner:v2.5.0" volumeMounts: - mountPath: /etc/provisioner/config name: provisioner-config diff --git a/cluster-provision/k8s/1.28/node01.sh b/cluster-provision/k8s/1.28/node01.sh index 49fecd403e..96fac67119 100755 --- a/cluster-provision/k8s/1.28/node01.sh +++ b/cluster-provision/k8s/1.28/node01.sh @@ -2,14 +2,20 @@ set -ex +ARCH=$(uname -m) +SSH_USER="vagrant" +if [ "$ARCH" == "s390x" ]; then + SSH_USER="cloud-user" +fi + kubeadm_conf="/etc/kubernetes/kubeadm.conf" cni_manifest="/provision/cni.yaml" -if [ -f /home/vagrant/single_stack ]; then +if [ -f /home/$SSH_USER/single_stack ]; then kubeadm_conf="/etc/kubernetes/kubeadm_ipv6.conf" cni_manifest="/provision/cni_ipv6.yaml" fi -if [ -f /home/vagrant/enable_audit ]; then +if [ -f /home/$SSH_USER/enable_audit ]; then apiVer=$(head -1 /etc/kubernetes/audit/adv-audit.yaml) echo $apiVer > 
/etc/kubernetes/audit/adv-audit.yaml diff --git a/cluster-provision/k8s/1.28/nodes.sh b/cluster-provision/k8s/1.28/nodes.sh index c68f99a6e5..3cb4f87360 100755 --- a/cluster-provision/k8s/1.28/nodes.sh +++ b/cluster-provision/k8s/1.28/nodes.sh @@ -4,13 +4,27 @@ set -ex source /var/lib/kubevirtci/shared_vars.sh +ARCH=$(uname -m) +SSH_USER="vagrant" +if [ "$ARCH" == "s390x" ]; then + SSH_USER="cloud-user" +fi + nodeip= control_ip=192.168.66.101 -if [ -f /home/vagrant/single_stack ]; then +if [ -f /home/$SSH_USER/single_stack ]; then nodeip="--node-ip=::" control_ip=[fd00::101] fi +KUBELET_EXTRA_ARGS_ARCH="--fail-swap-on=false ${nodeip} --feature-gates=CPUManager=true,NodeSwap=true --cpu-manager-policy=static --kube-reserved=cpu=250m --system-reserved=cpu=250m" + +if [ "$ARCH" == "s390x" ]; then + # cpu manager feature is not yet supported on s390x. + KUBELET_EXTRA_ARGS_ARCH="--fail-swap-on=false ${nodeip} --feature-gates=NodeSwap=true" + +fi + timeout=30 interval=5 while ! hostnamectl |grep Transient ; do @@ -51,11 +65,11 @@ done if [ -f /etc/sysconfig/kubelet ]; then # TODO use config file! 
this is deprecated cat <<EOT >>/etc/sysconfig/kubelet -KUBELET_EXTRA_ARGS=${KUBELET_CGROUP_ARGS} --fail-swap-on=false ${nodeip} --feature-gates=CPUManager=true,NodeSwap=true --cpu-manager-policy=static --kube-reserved=cpu=250m --system-reserved=cpu=250m +KUBELET_EXTRA_ARGS=${KUBELET_CGROUP_ARGS} $KUBELET_EXTRA_ARGS_ARCH EOT else cat <<EOT >>/etc/systemd/system/kubelet.service.d/09-kubeadm.conf -Environment="KUBELET_CPUMANAGER_ARGS=--fail-swap-on=false --feature-gates=CPUManager=true,NodeSwap=true ${nodeip} --cpu-manager-policy=static --kube-reserved=cpu=250m --system-reserved=cpu=250m" +Environment="KUBELET_CPUMANAGER_ARGS=$KUBELET_EXTRA_ARGS_ARCH" EOT sed -i 's/$KUBELET_EXTRA_ARGS/$KUBELET_EXTRA_ARGS $KUBELET_CPUMANAGER_ARGS/' /etc/systemd/system/kubelet.service.d/10-kubeadm.conf fi diff --git a/cluster-provision/k8s/1.28/provision.sh b/cluster-provision/k8s/1.28/provision.sh index 27de87ae4a..dbd8c1b9fb 100755 --- a/cluster-provision/k8s/1.28/provision.sh +++ b/cluster-provision/k8s/1.28/provision.sh @@ -2,6 +2,8 @@ set -ex +ARCH=$(uname -m) + KUBEVIRTCI_SHARED_DIR=/var/lib/kubevirtci mkdir -p $KUBEVIRTCI_SHARED_DIR export ISTIO_VERSION=1.15.0 @@ -14,13 +16,13 @@ export ISTIO_BIN_DIR="/opt/istio-${ISTIO_VERSION}/bin" EOF source $KUBEVIRTCI_SHARED_DIR/shared_vars.sh -# Install modules of the initrd kernel +# Install modules of the initrd kernel. These modules extend the kernel's functionality, providing support for various hardware devices, file systems, network protocols, KVM/virtualization, and other features.
dnf install -y "kernel-modules-$(uname -r)" # Resize root partition dnf install -y cloud-utils-growpart -if growpart /dev/vda 1; then - resize2fs /dev/vda1 +if growpart /dev/vda 1; then #growpart adjusts the partition size to fill the available space on the disk + resize2fs /dev/vda1 #resizes file system to the available space on the partition fi dnf install -y patch @@ -58,7 +60,13 @@ dnf install -y container-selinux dnf install -y libseccomp-devel -dnf install -y centos-release-nfv-openvswitch -dnf install -y openvswitch2.16 +#openvswitch for s390x is not available from the centos default repos. +if [ "$ARCH" == "s390x" ]; then + dnf install -y https://kojipkgs.fedoraproject.org//packages/openvswitch/2.16.0/2.fc36/s390x/openvswitch-2.16.0-2.fc36.s390x.rpm + systemctl enable openvswitch +else + dnf install -y centos-release-nfv-openvswitch + dnf install -y openvswitch2.16 +fi dnf install -y NetworkManager NetworkManager-ovs NetworkManager-config-server diff --git a/cluster-up/check.sh b/cluster-up/check.sh index 6d7348bcf2..78278b4c1a 100755 --- a/cluster-up/check.sh +++ b/cluster-up/check.sh @@ -26,12 +26,17 @@ fi KVM_ARCH="" KVM_NESTED="unknown" +KVM_HPAGE="unknown" if [ -f "/sys/module/kvm_intel/parameters/nested" ]; then KVM_NESTED=$( cat /sys/module/kvm_intel/parameters/nested ) KVM_ARCH="intel" elif [ -f "/sys/module/kvm_amd/parameters/nested" ]; then KVM_NESTED=$( cat /sys/module/kvm_amd/parameters/nested ) KVM_ARCH="amd" +elif [ -f "/sys/module/kvm/parameters/nested" ]; then + KVM_NESTED=$( cat /sys/module/kvm/parameters/nested ) + KVM_ARCH="s390x" + KVM_HPAGE=$( cat /sys/module/kvm/parameters/hpage ) fi function is_enabled() { @@ -49,3 +54,7 @@ if is_enabled "$KVM_NESTED"; then else echo "[ERR ] $KVM_ARCH nested virtualization not enabled" fi + +if is_enabled "$KVM_HPAGE" && [ "$(uname -m)" = "s390x" ]; then + echo "[ERR ] $KVM_HPAGE KVM hugepage enabled. It need to be disabled while nested virtualization is enabled for s390x" +fi