diff --git a/k8s/Dockerfile b/k8s/Dockerfile
deleted file mode 100644
index ddeb791a..00000000
--- a/k8s/Dockerfile
+++ /dev/null
@@ -1,38 +0,0 @@
-FROM ubuntu:18.04
-
-ENV DEBIAN_FRONTEND=noninteractive
-ARG APP_VERSION=3.2.0
-ARG APP_URL=https://dl.tigergraph.com/enterprise-edition/tigergraph-${APP_VERSION}-offline.tar.gz
-
-RUN apt-get -qq update && apt-get install -y --no-install-recommends \
- sudo curl iproute2 net-tools iptables iptables-persistent \
- sshpass cron ntp locales vim tar jq uuid-runtime openssh-client openssh-server dnsutils iputils-ping > /dev/null && \
- apt-get autoremove && apt-get clean && \
- # Set up default account
- useradd -ms /bin/bash tigergraph && \
- mkdir /var/run/sshd && \
- echo 'tigergraph:tigergraph' | chpasswd && \
- sed -i 's/\#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
- sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
- echo "tigergraph ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers && \
- /usr/sbin/sshd && \
- # Download installation packages
- curl -s -k -L ${APP_URL} -o /home/tigergraph/tigergraph-${APP_VERSION}-offline.tar.gz && \
- cd /home/tigergraph/ && \
- tar xfz tigergraph-${APP_VERSION}-offline.tar.gz && \
- rm -f tigergraph-${APP_VERSION}-offline.tar.gz && \
- # Install TigerGraph
- cd /home/tigergraph/tigergraph-* && \
- ./install.sh -n && \
- # Stop TigerGraph
- su - tigergraph -c "/home/tigergraph/tigergraph/app/${APP_VERSION}/cmd/gadmin stop all -y" && \
- # Clean Up unused packages
- rm -rf /home/tigergraph/tigergraph-* && \
- # Setup Enviroments setting
- echo "export USER=tigergraph" >> /home/tigergraph/.bash_tigergraph && \
- chown -R tigergraph:tigergraph /home/tigergraph
-
-WORKDIR /home/tigergraph
-USER tigergraph
-EXPOSE 22 9000 14240
-ENTRYPOINT sudo /usr/sbin/sshd && bash -c "tail -f /dev/null"
diff --git a/k8s/Dockerfile-installer b/k8s/Dockerfile-installer
deleted file mode 100644
index d81379bd..00000000
--- a/k8s/Dockerfile-installer
+++ /dev/null
@@ -1,8 +0,0 @@
-FROM alpine:3.14
-
-RUN apk update && \
- apk add bash sshpass openssh jq sudo curl && \
- curl -LO "https://dl.k8s.io/release/v1.22.4/bin/linux/amd64/kubectl" && \
- sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl && \
- rm kubectl && \
- apk del sudo curl
diff --git a/k8s/README.md b/k8s/README.md
index 37aebdc7..4f9308c6 100644
--- a/k8s/README.md
+++ b/k8s/README.md
@@ -1,106 +1,62 @@
-# This project has been deprecated. Please use [Tigergraph Kubernetes Operator](https://docs.tigergraph.com/tigergraph-server/current/kubernetes/k8s-operator/) to create managed Tigergraph cluster instances on Kubernetes(GKE/EKS/OpenShift).
-
-# Run Tigergraph in EKS/GKE/AKS
-
-## Getting Started
-
-### Prerequisites
- Please ensure the following dependencies are already fulfilled before starting
- - A running ```EKS/GKE/AKS``` cluster
- - The ```kubectl``` command-line tool **(v1.18.0+)**
- - AWS/GCP/Azure account to manage kubernetes resource
- - AWS EBS CSI driver
-### Verify EBS_CSI_ADDON Installation Status
-Important: If you have a 1.22 or earlier cluster that you currently run pods on that use Amazon EBS volumes, and you don't currently have this driver installed on your cluster, then be sure to install this driver to your cluster before updating the cluster to 1.23.
-
-Following the instructions on AWS documentation to add EBS CSI add-on before proceed.
-https://docs.aws.amazon.com/eks/latest/userguide/managing-ebs-csi.html
-
-Use ```kubectl get pods -n kube-system``` to check if EBS CSI driver is running. An example output is
-```
-NAME READY STATUS RESTARTS AGE
-...
-coredns-5948f55769-kcnvx 1/1 Running 0 3d6h
-coredns-5948f55769-z7mbr 1/1 Running 0 3d6h
-ebs-csi-controller-75598cd6f4-48dp8 6/6 Running 0 3d4h
-ebs-csi-controller-75598cd6f4-sqbhw 6/6 Running 4 (2d11h ago) 3d4h
-ebs-csi-node-9cmbj 3/3 Running 0 3d4h
-ebs-csi-node-g65ns 3/3 Running 0 3d4h
-ebs-csi-node-qzflk 3/3 Running 0 3d4h
-ebs-csi-node-x2t22 3/3 Running 0 3d4h
-...
-```
-### Deployment Steps
- ```bash
- #create cluster namespace
- kubectl create ns tigergraph
- # deploy in EKS
- kubectl apply -k ./eks
-
- # deploy in GKE
- kubectl apply -k ./gke
-
- # deploy in AKS
- kubectl apply -k ./aks
-
- # use tg script with eks in tigergraph namespace
- ./tg eks create -n tigergraph --size 3
- ```
-### Verify the Tigergraph Status
- ```bash
- kubectl get all -l app=tigergraph -n tigergraph
- ```
- Response similar as below :
- ```
- NAME READY STATUS RESTARTS AGE
- pod/tigergraph-0 1/1 Running 0 6d20h
-
- NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE
- service/tigergraph-service LoadBalancer 10.100.214.243 a0ae52e0e62e54bf9b5c07d97deec5e2-982033604.us-east-1.elb.amazonaws.com 9000:30541/TCP,14240:31525/TCP 6d20h
- ```
- Login to the instances
- ```bash
- # use kubectl
- kubectl exec -it tigergraph-0 -n tigergraph -- /bin/bash
- # use ssh
- ip_m1=$(kubectl get pod -o wide |grep tigergraph-0| awk '{print $6}')
- ssh tigergraph@ip_m1
- # verify the cluster status
- source ~/.bashrc
- gadmin status -v
- # verify gsql
- gsql ls
- ```
- Try GraphStudio UI, change the url accordingly as upper output ```EXTERNAL-IP```
- ```
- http://a0ae52e0e62e54bf9b5c07d97deec5e2-982033604.us-east-1.elb.amazonaws.com:14240
- ```
-
- Try Tigergraph Rest API, change the url accordingly as upper output ```EXTERNAL-IP```
- ```bash
- curl http://a0ae52e0e62e54bf9b5c07d97deec5e2-982033604.us-east-1.elb.amazonaws.com:9000/echo
- ```
-### Kustomize the TG setting
- You can use adjust the kustomize yaml file to change the TG setting. For regular minor changes, strongly recommend to customize them with ```tg``` script as below.
- ```bash
- USAGE:
- $0 K8S_PROVIDER [kustomize|create|delete|list|help] [OPTIONS]
- -n|--namespace : set namespace to deploy TG cluster
- -s|--size : set TG cluster size, default 1
- -v|--version : set TG cluster version,default as 3.2.0
- -l|--license : set TG cluster license, default as free tie
- --ha : set TG cluster ha setting, default 1
- --pv : set Persistent volume size, default as 50
- --prefix : set Pod name with prefix
-
- # Examples when working in eks:
- ## Generate the manifest for deployment
- ./tg eks kustomize -n tigergraph --size 3 --ha 3
- ## Create TG cluster:
- ./tg eks create -n tigergraph -s 2
- ## Delete TG cluster:
- ./tg eks delete
- ## List TG cluster:
- ./tg eks list -n tigergraph
- ```
-
+# TigerGraph Operator
+
+TigerGraph Operator stands as an automated operations system meticulously designed to streamline the management of TigerGraph clusters within Kubernetes environments.
+Its comprehensive suite of functionalities encompasses every aspect of the TigerGraph lifecycle, spanning deployment, upgrades, scaling, backups, restoration, and fail-over processes.
+Whether you're operating in a public cloud setting or within a self-hosted environment, TigerGraph Operator ensures that your TigerGraph instances function seamlessly within Kubernetes clusters.
+
+> [!IMPORTANT]
+> Kubernetes Operator support is currently a Preview Feature. Preview Features give users an early look at future production-level features. Preview Features should not be used for production deployments.
+
+Understanding the intricate synergy between TigerGraph, TigerGraph Operator, and Kubernetes versions is pivotal. This relationship is as follows:
+
+| TigerGraph Operator version | TigerGraph version | Kubernetes version |
+|----------|----------|----------|
+| 0.0.9 | TigerGraph >= 3.6.0 |1.23, 1.24, 1.25, 1.26, **1.27**|
+| 0.0.7 | TigerGraph >= 3.6.0 && TigerGraph <= 3.9.2|1.22, 1.23, 1.24, 1.25, 1.26|
+| 0.0.6 | TigerGraph >= 3.6.0 && TigerGraph <= 3.9.1|1.22, 1.23, 1.24, 1.25, 1.26|
+| 0.0.5 | TigerGraph >= 3.6.0 && TigerGraph <= 3.9.1|1.22, 1.23, 1.24, 1.25, 1.26|
+| 0.0.4 | TigerGraph >= 3.6.0 && TigerGraph <= 3.9.0|1.22, 1.23, 1.24, 1.25, 1.26|
+| 0.0.3 | TigerGraph >= 3.6.0 && TigerGraph <= 3.8.0|1.22, 1.23, 1.24, 1.25, 1.26|
+| 0.0.2 | TigerGraph >= 3.6.0 && TigerGraph <= 3.7.0|1.22, 1.23, 1.24, 1.25, 1.26|
+
+## Manage TigerGraph clusters using TigerGraph Operator
+
+TigerGraph Operator offers several deployment options for TigerGraph clusters on Kubernetes, catering to both test and production environments:
+
+- For test environments
+
+ - [Getting started using Kind](docs/02-get-started/get_started.md)
+
+- For production environments
+
+ - On public cloud:
+ - [Deploy TigerGraph on AWS EKS](docs/03-deploy/tigergraph-on-eks.md)
+ - [Deploy TigerGraph on Google Cloud GKE](docs/03-deploy/tigergraph-on-gke.md)
+ - [Deploy TigerGraph on Red Hat OpenShift](docs/03-deploy/tigergraph-on-openshift.md)
+ - [Deploy TigerGraph on K8s without internet access](docs/03-deploy/deploy-without-internet.md)
+
+Once your deployment is complete, refer to the following documents for guidance on using, operating, and maintaining your TigerGraph clusters on Kubernetes:
+
+- [Configuring TigerGraph Clusters on K8s using TigerGraph CR](docs/07-reference/configure-tigergraph-cluster-cr-with-yaml-manifests.md)
+- [Utilizing Static & Dynamic Persistent Volume Storage](docs/07-reference/static-and-dynamic-persistent-volume-storage.md)
+- [Configuring NodeSelectors, Affinities, and Toleration](docs/03-deploy/configure-affinity-by-kubectl-tg.md)
+- [Working with InitContainers, Sidecar Containers, and Custom Volumes](docs/03-deploy/use-custom-containers-by-kubectl-tg.md)
+- [Resizing Persistent Volumes for TigerGraph](docs/07-reference/expand-persistent-volume.md)
+- [Backing Up and Restoring TigerGraph Clusters](docs/04-manage/backup-and-restore/README.md)
+
+Additionally, refer to the following documents for advanced operations and requirements:
+
+- [Expand persistent volume](docs/07-reference/expand-persistent-volume.md)
+- [Using static and dynamic persistent volume](docs/07-reference/static-and-dynamic-persistent-volume-storage.md)
+- [Integrate Envoy Sidecar](docs/07-reference/integrate-envoy-sidecar.md)
+- [Labels used by TigerGraph Operator](docs/07-reference/labels-used-by-tg.md)
+
+In case issues arise and your cluster requires diagnosis, you have two valuable resources:
+
+Refer to [TigerGraph FAQs on Kubernetes](docs/06-FAQs/README.md) for potential solutions.
+
+Explore [Troubleshoot TigerGraph on Kubernetes](docs/05-troubleshoot/README.md) to address any challenges.
+
+Lastly, when a new version of TigerGraph Operator becomes available, consult [Upgrade TigerGraph Operator](docs/04-manage/operator-upgrade.md) for a seamless transition to the latest version.
+
+For detailed information about the features, improvements, and bug fixes introduced in a specific Operator version, refer to the [release notes](docs/08-release-notes/README.md).
diff --git a/k8s/aks/kustomization.yaml b/k8s/aks/kustomization.yaml
deleted file mode 100644
index 7073e805..00000000
--- a/k8s/aks/kustomization.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-
-generatorOptions:
- disableNameSuffixHash: true
- labels:
- app: tigergraph
-
-# source base yaml
-bases:
-- ../base
-
-# revise blow to update global namespace
-namespace: default
-
-# uncomment and revise blow to update images
-# images:
-# - name: tigergraph/tigergraph-k8s
-# newName: tigergraph001.azurecr.io/tigergraph
-# newTag: 3.2.0
-
-configMapGenerator:
-- name: env-config
- literals:
- - service.headless.name=tigergraph
- - pod.prefix=tigergraph
- - namespace=default
- - cluster_size=1
- - license=
- - ha=1
- - version=3.5.0
- - cluster_size.staging=0
- - version.staging=0
-
-patchesStrategicMerge:
-- patch-statfulset.yaml
-
diff --git a/k8s/aks/patch-statfulset.yaml b/k8s/aks/patch-statfulset.yaml
deleted file mode 100644
index 093e11a9..00000000
--- a/k8s/aks/patch-statfulset.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
- name: tigergraph
-spec:
- volumeClaimTemplates:
- - metadata:
- name: tg-data
- labels:
- app: tigergraph
- spec:
- accessModes: [ "ReadWriteOnce" ]
- storageClassName: "managed-premium"
- resources:
- requests:
- storage: 50Gi
diff --git a/k8s/base/configmap.yaml b/k8s/base/configmap.yaml
deleted file mode 100644
index 908257c7..00000000
--- a/k8s/base/configmap.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-kind: ConfigMap
-apiVersion: v1
-metadata:
- name: tg-config
- namespace: default
- labels:
- app: tigergraph
-data:
- init_tg_cfg: |
- System.HostList=[{"ID":"m1","Hostname":"127.0.0.1","Region":""}]
- System.SSH.User.Username=tigergraph
- System.SSH.User.Password=tigergraph
- System.SSH.User.Privatekey=/home/tigergraph/.ssh/tigergraph_rsa
- System.DataRoot=/home/tigergraph/tigergraph/data
- System.LogRoot=/home/tigergraph/tigergraph/log
- System.TempRoot=/home/tigergraph/tigergraph/tmp
diff --git a/k8s/base/installer.yaml b/k8s/base/installer.yaml
deleted file mode 100644
index 818685b5..00000000
--- a/k8s/base/installer.yaml
+++ /dev/null
@@ -1,101 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: installer
- namespace: default
- labels:
- app: tigergraph
-spec:
- template:
- metadata:
- labels:
- app: tigergraph
- spec:
- serviceAccountName: tigergraph-installer
- initContainers:
- - name: init-tigergraph
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- command:
- - "/bin/sh"
- - "-c"
- - >
- set -e;
- kubectl rollout status --watch --timeout=2h statefulset ${POD_PREFIX};
- sleep 5;
- containers:
- - name: cluster-installer
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: CLUSTER_SIZE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: cluster_size
- - name: LICENSE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: license
- - name: HA
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: ha
- - name: SERVICE_NAME
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: service.headless.name
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- - name: NAMESPACE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: namespace
- command:
- - "/bin/sh"
- - "-c"
- - |
- set -e;
- export SSHPASS='tigergraph';
- sshpass -e ssh -o StrictHostKeyChecking=no tigergraph@${POD_PREFIX}-0.${SERVICE_NAME}.${NAMESPACE} "
- if [[ ! -f /home/tigergraph/tigergraph/data/installation_flag ]] && [[ \$(ls -A /home/tigergraph/tigergraph/data/|grep -v lost|tail -1) ]]; then
- echo 'found lagacy data, skip installation'
- else
- touch /home/tigergraph/tigergraph/data/installation_flag;
- export PATH=/home/tigergraph/tigergraph/app/cmd:$PATH;
- cp /tmp/init_tg_cfg /tmp/tg_cfg;
- sed -i 's/\=/\: /g' /tmp/tg_cfg;
- echo >> /tmp/tg_cfg;
- jq -j '.System | \"System.AppRoot: \",.AppRoot' ~/.tg.cfg >> /tmp/tg_cfg;
- echo >> /tmp/tg_cfg;
- if [[ -z \"$LICENSE\" ]]; then
- jq -j '.System | \"System.License: \",.License' ~/.tg.cfg >> /tmp/tg_cfg;
- else
- echo \"System.License: ${LICENSE}\" >> /tmp/tg_cfg;
- fi;
- gadmin config init -i /tmp/tg_cfg --file /tmp/tg.cfg --ha ${HA};
- cp --remove-destination /tmp/tg.cfg ~/.tg.cfg;
- gadmin init cluster -y --skip-stop;
- rm /home/tigergraph/tigergraph/data/installation_flag;
- m1hostname=\$(gadmin config get System.HostList | jq -r '.[0].Hostname');
- if [[ ${CLUSTER_SIZE} -eq 1 ]] && [[ \${m1hostname} = \"127.0.0.1\" ]]; then
- newhostlist=\$(gadmin config get System.HostList | jq \".[0].Hostname = \\\"${POD_PREFIX}-0.tigergraph\\\"\");
- gadmin config set System.HostList \"\${newhostlist}\";
- gadmin config apply -y;
- gadmin restart all -y
- fi
- fi
- ";
- restartPolicy: OnFailure
- backoffLimit: 6
diff --git a/k8s/base/kustomization.yaml b/k8s/base/kustomization.yaml
deleted file mode 100644
index 4c6071cc..00000000
--- a/k8s/base/kustomization.yaml
+++ /dev/null
@@ -1,12 +0,0 @@
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-
-resources:
- - service-headless.yaml
- - service-loadbalancer.yaml
- - role.yaml
- - role-binding.yaml
- - service-account.yaml
- - statefulset.yaml
- - configmap.yaml
- - installer.yaml
diff --git a/k8s/base/role-binding.yaml b/k8s/base/role-binding.yaml
deleted file mode 100644
index 74a4f5f3..00000000
--- a/k8s/base/role-binding.yaml
+++ /dev/null
@@ -1,13 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1
-kind: RoleBinding
-metadata:
- name: modify-pods
- namespace: default
-subjects:
- - kind: ServiceAccount
- name: tigergraph-installer
- namespace: default
-roleRef:
- kind: Role
- name: modify-pods
- apiGroup: rbac.authorization.k8s.io
diff --git a/k8s/base/role.yaml b/k8s/base/role.yaml
deleted file mode 100644
index 2374092d..00000000
--- a/k8s/base/role.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-apiVersion: rbac.authorization.k8s.io/v1
-kind: Role
-metadata:
- name: modify-pods
- namespace: default
-rules:
- - apiGroups: [""]
- resources: ["pods"]
- verbs: ["get", "list", "patch"]
- - apiGroups: ["apps"]
- resources: ["statefulsets"]
- verbs: ["get", "list"]
- - apiGroups: [""]
- resources: ["pods/exec"]
- verbs: ["create"]
- - apiGroups: ["batch"]
- resources: ["jobs"]
- verbs: ["get", "list", "watch"]
diff --git a/k8s/base/service-account.yaml b/k8s/base/service-account.yaml
deleted file mode 100644
index 3b9ee45b..00000000
--- a/k8s/base/service-account.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-apiVersion: v1
-kind: ServiceAccount
-metadata:
- name: tigergraph-installer
- namespace: default
diff --git a/k8s/base/service-headless.yaml b/k8s/base/service-headless.yaml
deleted file mode 100644
index 79a4f55f..00000000
--- a/k8s/base/service-headless.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
- labels:
- app: tigergraph
- name: tigergraph
- namespace: default
-spec:
- clusterIP: None
- selector:
- app: tigergraph
- ports:
- - port: 9000
- name: rest
- targetPort: 9000
- - port: 14240
- name: graphstudio
- targetPort: 14240
\ No newline at end of file
diff --git a/k8s/base/service-loadbalancer.yaml b/k8s/base/service-loadbalancer.yaml
deleted file mode 100644
index bfad57b5..00000000
--- a/k8s/base/service-loadbalancer.yaml
+++ /dev/null
@@ -1,20 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
- labels:
- app: tigergraph
- name: tg-external-service
- namespace: default
-spec:
- type: LoadBalancer
- selector:
- app: tigergraph
- ports:
- - port: 9000
- name: rest
- targetPort: 9000
- - port: 14240
- name: graphstudio
- targetPort: 14240
- externalTrafficPolicy: Local
- sessionAffinity: ClientIP
diff --git a/k8s/base/statefulset.yaml b/k8s/base/statefulset.yaml
deleted file mode 100644
index 93189507..00000000
--- a/k8s/base/statefulset.yaml
+++ /dev/null
@@ -1,116 +0,0 @@
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
- labels:
- app: tigergraph
- name: tigergraph
- namespace: default
-spec:
- replicas: 1
- podManagementPolicy: Parallel
- selector:
- matchLabels:
- app: tigergraph
- serviceName: tigergraph
- template:
- metadata:
- labels:
- app: tigergraph
- spec:
- containers:
- - env:
- - name: SERVICE_NAME
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: service.headless.name
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- - name: NAMESPACE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: namespace
- - name: CLUSTER_SIZE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: cluster_size
- - name: POD_NAME
- valueFrom:
- fieldRef:
- fieldPath: metadata.name
- image: tigergraph/tigergraph-k8s:VERSION
- imagePullPolicy: IfNotPresent
- name: tigergraph
- ports:
- - containerPort: 9000
- name: rest
- - containerPort: 14240
- name: graphstudio
- - containerPort: 22
- name: ssh
- resources:
- requests:
- cpu: 8000m
- memory: 16Gi
- lifecycle:
- postStart:
- exec:
- command:
- - "/bin/bash"
- - "-c"
- - |
- (
- if [ "$(ls -A /home/tigergraph/tigergraph/data/|grep -v lost|tail -1)" ]; then
- for i in $(seq 1 ${CLUSTER_SIZE});
- do
- until nslookup ${POD_PREFIX}-$((i-1)).${SERVICE_NAME}.${NAMESPACE}.svc.cluster.local;
- do
- echo "wait dns to be updated";
- sleep 1;
- done;
- done;
- sleep 15;
- export PATH=/home/tigergraph/tigergraph/app/cmd:$PATH
- ln -sf /home/tigergraph/tigergraph/data/configs/tg.cfg /home/tigergraph/.tg.cfg
- grun all "hostname"
- echo "starting service at $(date)"
- gadmin start all --with-config /home/tigergraph/.tg.cfg;
- else
- sudo chown -R tigergraph:tigergraph /home/tigergraph/tigergraph/data;
- tg_cfg=$(find /home/tigergraph/tigergraph/app/ -name .tg.cfg|head -n 1)
- ln -sf $tg_cfg .tg.cfg
- fi
- ) > /tmp/init.log 2>&1 &
- disown -a
- exit 0
- volumeMounts:
- - mountPath: /home/tigergraph/tigergraph/data
- name: tg-data
- - mountPath: /tmp/init_tg_cfg
- name: config-volume
- subPath: init_tg_cfg
- volumes:
- - name: config-volume
- configMap:
- name: tg-config
- items:
- - key: init_tg_cfg
- path: init_tg_cfg
- imagePullSecrets:
- - name: regcred
- volumeClaimTemplates:
- - metadata:
- name: tg-data
- labels:
- app: tigergraph
- spec:
- accessModes: [ "ReadWriteOnce" ]
- storageClassName: ""
- resources:
- requests:
- storage: 50Gi
diff --git a/k8s/docs/01-introduction/README.md b/k8s/docs/01-introduction/README.md
new file mode 100644
index 00000000..a8c00e06
--- /dev/null
+++ b/k8s/docs/01-introduction/README.md
@@ -0,0 +1,50 @@
+# TigerGraph Operator Overview
+
+TigerGraph Operator stands as an automated operations system meticulously designed to streamline the management of TigerGraph clusters within Kubernetes environments. Its comprehensive suite of functionalities encompasses every aspect of the TigerGraph lifecycle, spanning deployment, upgrades, scaling, backups, restoration, and fail-over processes. Whether you're operating in a public cloud setting or within a self-hosted environment, TigerGraph Operator ensures that your TigerGraph instances function seamlessly within Kubernetes clusters.
+
+Understanding the intricate synergy between TigerGraph, TigerGraph Operator, and Kubernetes versions is pivotal. This relationship is as follows:
+
+| TigerGraph Operator version | TigerGraph version | Kubernetes version |
+|----------|----------|----------|
+| 0.0.9 | TigerGraph >= 3.6.0 |1.23, 1.24, 1.25, 1.26, **1.27**|
+| 0.0.7 | TigerGraph >= 3.6.0 && TigerGraph <= 3.9.2|1.22, 1.23, 1.24, 1.25, 1.26|
+| 0.0.6 | TigerGraph >= 3.6.0 && TigerGraph <= 3.9.1|1.22, 1.23, 1.24, 1.25, 1.26|
+| 0.0.5 | TigerGraph >= 3.6.0 && TigerGraph <= 3.9.1|1.22, 1.23, 1.24, 1.25, 1.26|
+| 0.0.4 | TigerGraph >= 3.6.0 && TigerGraph <= 3.9.0|1.22, 1.23, 1.24, 1.25, 1.26|
+| 0.0.3 | TigerGraph >= 3.6.0 && TigerGraph <= 3.8.0|1.22, 1.23, 1.24, 1.25, 1.26|
+| 0.0.2 | TigerGraph >= 3.6.0 && TigerGraph <= 3.7.0|1.22, 1.23, 1.24, 1.25, 1.26|
+
+## Manage TigerGraph clusters using TigerGraph Operator
+
+TigerGraph Operator offers several deployment options for TigerGraph clusters on Kubernetes, catering to both test and production environments:
+
+- For test environments
+
+ - [Getting started using Kind](../02-get-started/get_started.md)
+
+- For production environments
+
+ - On public cloud:
+ - [Deploy TigerGraph on AWS EKS](../03-deploy/tigergraph-on-eks.md)
+ - [Deploy TigerGraph on Google Cloud GKE](../03-deploy/tigergraph-on-gke.md)
+ - [Deploy TigerGraph on Red Hat OpenShift](../03-deploy/tigergraph-on-openshift.md)
+ - [Deploy TigerGraph on K8s without internet access](../03-deploy/deploy-without-internet.md)
+
+Once your deployment is complete, refer to the following documents for guidance on using, operating, and maintaining your TigerGraph clusters on Kubernetes:
+
+- [Configuring TigerGraph Clusters on K8s using TigerGraph CR](../07-reference/configure-tigergraph-cluster-cr-with-yaml-manifests.md)
+- [Utilizing Static & Dynamic Persistent Volume Storage](../07-reference/static-and-dynamic-persistent-volume-storage.md)
+- [Configuring NodeSelectors, Affinities, and Toleration](../03-deploy/configure-affinity-by-kubectl-tg.md)
+- [Working with InitContainers, Sidecar Containers, and Custom Volumes](../03-deploy/use-custom-containers-by-kubectl-tg.md)
+- [Resizing Persistent Volumes for TigerGraph](../07-reference/expand-persistent-volume.md)
+- [Backing Up and Restoring TigerGraph Clusters](../04-manage/backup-and-restore/README.md)
+
+In case issues arise and your cluster requires diagnosis, you have two valuable resources:
+
+Refer to [TigerGraph FAQs on Kubernetes](../06-FAQs/README.md) for potential solutions.
+
+Explore [Troubleshoot TigerGraph on Kubernetes](../05-troubleshoot/README.md) to address any challenges.
+
+Lastly, when a new version of TigerGraph Operator becomes available, consult [Upgrade TigerGraph Operator](../04-manage/operator-upgrade.md) for a seamless transition to the latest version.
+
+For detailed information about the features, improvements, and bug fixes introduced in a specific Operator version, refer to the [release notes](../08-release-notes/README.md).
diff --git a/k8s/docs/02-get-started/get_started.md b/k8s/docs/02-get-started/get_started.md
new file mode 100644
index 00000000..c533cfd4
--- /dev/null
+++ b/k8s/docs/02-get-started/get_started.md
@@ -0,0 +1,473 @@
+
+# Getting Started TigerGraph on Kubernetes
+
+- [Step 1: Create a Test Kubernetes Cluster](#step-1-create-a-test-kubernetes-cluster)
+ - [Create a Kubernetes Cluster Using kind](#create-a-kubernetes-cluster-using-kind)
+ - [Install MetalLB to Enable Load Balancing Services](#install-metallb-to-enable-load-balancing-services)
+- [Step 2: Deploy TigerGraph Operator](#step-2-deploy-tigergraph-operator)
+ - [Install cert-manager for Kubernetes](#install-cert-manager-for-kubernetes)
+ - [Install kubectl-tg plugin](#install-kubectl-tg-plugin)
+ - [Install CRDs independently (Optional)](#install-crds-independently-optional)
+ - [Install TigerGraph Operator](#install-tigergraph-operator)
+- [Step 3: Deploy a TigerGraph Cluster](#step-3-deploy-a-tigergraph-cluster)
+ - [Providing a Private SSH Key Pair for Enhanced Security](#providing-a-private-ssh-key-pair-for-enhanced-security)
+ - [Specify the StorageClass Name](#specify-the-storageclass-name)
+ - [Create a TigerGraph Cluster with Specific Options](#create-a-tigergraph-cluster-with-specific-options)
+- [Step 4: Connect to a TigerGraph Cluster](#step-4-connect-to-a-tigergraph-cluster)
+ - [Connect to a TigerGraph Cluster Pod](#connect-to-a-tigergraph-cluster-pod)
+ - [Access TigerGraph Suite](#access-tigergraph-suite)
+ - [Access RESTPP API Service](#access-restpp-api-service)
+- [Step 5: Operate a TigerGraph Cluster](#step-5-operate-a-tigergraph-cluster)
+ - [Update the Resources (CPU and Memory) of the TigerGraph Cluster](#update-the-resources-cpu-and-memory-of-the-tigergraph-cluster)
+ - [Scale a TigerGraph Cluster](#scale-a-tigergraph-cluster)
+ - [Upgrade a TigerGraph Cluster](#upgrade-a-tigergraph-cluster)
+- [Step 6: Destroy the TigerGraph Cluster and the Kubernetes Operator](#step-6-destroy-the-tigergraph-cluster-and-the-kubernetes-operator)
+ - [Destroy the TigerGraph Cluster](#destroy-the-tigergraph-cluster)
+ - [Uninstall TigerGraph Operator](#uninstall-tigergraph-operator)
+ - [Uninstall CRD](#uninstall-crd)
+- [Step 7: Destroy the Kubernetes Cluster](#step-7-destroy-the-kubernetes-cluster)
+- [See also](#see-also)
+
+This document provides a step-by-step guide on creating a simple Kubernetes cluster and using it to deploy a basic test TigerGraph cluster using TigerGraph Operator.
+
+To deploy TigerGraph Operator and a TigerGraph cluster, follow these structured steps:
+
+1. Create a Test Kubernetes Cluster: Start by creating a test Kubernetes cluster to serve as the foundation for your TigerGraph deployment.
+2. Deploy TigerGraph Operator: Next, deploy the TigerGraph Operator, which is essential for managing TigerGraph clusters within your Kubernetes environment.
+3. Deploy a TigerGraph Cluster: Once the Operator is in place, deploy your TigerGraph cluster, setting the stage for your data and graph processing needs.
+4. Connect to a TigerGraph Cluster: Learn how to establish a connection to your newly deployed TigerGraph cluster.
+5. Operate a TigerGraph Cluster: Explore the various operations you can perform on your TigerGraph cluster, from data management to analytics.
+6. Destroy the TigerGraph Cluster and the Kubernetes Operator: When you no longer require your TigerGraph resources, follow proper procedures to safely remove the TigerGraph cluster and the associated Kubernetes Operator.
+7. Destroy the Kubernetes Cluster: Finally, if needed, you can dismantle the entire Kubernetes cluster, ensuring efficient resource utilization.
+
+For a visual demonstration of these steps, you can watch the following video:
+
+[Demo slides](https://docs.google.com/presentation/d/1aUpgHnJaz9qhlFqg6sPmLMrPMk2CR0ij4qqktbcZZQQ/edit?usp=sharing)
+
+[Demo video](https://drive.google.com/file/d/1-h70zlrGEYAQRadG_Pfq4HfmXkvEPt8s/view?usp=sharing)
+
+This comprehensive guide and accompanying resources will help you kickstart your journey with TigerGraph Operator, enabling you to harness the power of TigerGraph within your Kubernetes environment.
+
+## Step 1: Create a Test Kubernetes Cluster
+
+This section provides detailed instructions on creating a straightforward Kubernetes cluster using kind. Establishing this Kubernetes cluster serves as a foundational step, enabling you to conduct testing of TigerGraph clusters managed by TigerGraph Operator.
+
+### Create a Kubernetes Cluster Using kind
+
+This section provides a step-by-step guide on deploying a Kubernetes cluster with [kind](https://kind.sigs.k8s.io/).
+
+kind is a widely recognized tool for setting up local Kubernetes clusters, leveraging Docker containers as cluster nodes. For available tags, see [Docker Hub](https://hub.docker.com/r/kindest/node/tags). The default configuration employs the latest version of kind.
+
+Before initiating the deployment process, please ensure that you meet the following prerequisites:
+
+- [Docker](https://docs.docker.com/install/): version >= 20.10
+- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/): version >= 1.23
+- [kind](https://kind.sigs.k8s.io/docs/user/quick-start/): version >= 0.12.0
+- For Linux users, confirm that the sysctl parameter [net.ipv4.ip_forward](https://linuxconfig.org/how-to-turn-on-off-ip-forwarding-in-linux) is set to 1.
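+
+The last prerequisite can be checked, and enabled if necessary, with the following commands (a minimal sketch; the second command needs root privileges):
+
+```bash
+sysctl net.ipv4.ip_forward            # expected: net.ipv4.ip_forward = 1
+sudo sysctl -w net.ipv4.ip_forward=1  # enable forwarding if the value is 0
+```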
+
+Here's an illustrative example utilizing **kind** version 0.20.0:
+
+```shell
+kind create cluster
+```
+
+Expected output:
+
+```text
+Creating cluster "kind" ...
+ ✓ Ensuring node image (kindest/node:v1.27.3) 🖼
+ ✓ Preparing nodes 📦
+ ✓ Writing configuration 📜
+ ✓ Starting control-plane 🕹️
+ ✓ Installing CNI 🔌
+ ✓ Installing StorageClass 💾
+Set kubectl context to "kind-kind"
+You can now use your cluster with:
+
+kubectl cluster-info --context kind-kind
+```
+
+To verify if the cluster has been successfully created, run:
+
+```shell
+kubectl cluster-info
+```
+
+Expected output:
+
+```text
+Kubernetes control plane is running at https://127.0.0.1:33671
+CoreDNS is running at https://127.0.0.1:33671/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy
+
+To further debug and diagnose cluster problems, use 'kubectl cluster-info dump'.
+```
+
+### Install MetalLB to Enable Load Balancing Services
+
+To enable load balancing services, follow these steps to install MetalLB:
+
+```bash
+kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/v0.13.7/config/manifests/metallb-native.yaml
+kubectl wait --namespace metallb-system \
+ --for=condition=ready pod \
+ --selector=app=metallb \
+ --timeout=120s
+
+GATEWAY_IP=$(docker network inspect kind | jq -r '.[].IPAM.Config[0].Gateway')
+IFS=. read -a ArrIP<<<"${GATEWAY_IP}"
+
+# The IPAddressPool/L2Advertisement manifest below is an assumed reconstruction; adjust the
+# address range so it falls inside the Docker "kind" network derived from GATEWAY_IP above.
+cat <<EOF | kubectl apply -f -
+apiVersion: metallb.io/v1beta1
+kind: IPAddressPool
+metadata:
+  name: example
+  namespace: metallb-system
+spec:
+  addresses:
+  - "${ArrIP[0]}.${ArrIP[1]}.255.200-${ArrIP[0]}.${ArrIP[1]}.255.250"
+---
+apiVersion: metallb.io/v1beta1
+kind: L2Advertisement
+metadata:
+  name: empty
+  namespace: metallb-system
+EOF
+```
+
+## Step 2: Deploy TigerGraph Operator
+
+### Install cert-manager for Kubernetes
+
+The TigerGraph Operator uses admission webhooks, which require TLS certificates provisioned by cert-manager. Install cert-manager by following the [cert-manager installation guide](https://cert-manager.io/docs/installation/) and wait for its deployments to become available before installing the Operator.
+
+### Install kubectl-tg plugin
+
+The kubectl-tg plugin lets you deploy and manage the Operator and TigerGraph clusters from the command line. Ensure the following dependencies are fulfilled before installing it:
+
+- [helm](https://helm.sh/docs/intro/install/): version >= 3.7.0
+- [jq](https://jqlang.github.io/jq/download/): version >= 1.6
+- [yq](https://github.com/mikefarah/yq): version >= 4.18.1
+
+Here's an example of installing the latest kubectl-tg; replace `latest` in the URL with a specific version, such as 0.0.9, if needed:
+
+```bash
+wget https://dl.tigergraph.com/k8s/latest/kubectl-tg -O kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+```
+
+To verify the kubectl-tg version, use the following command:
+
+```bash
+kubectl tg version
+```
+
+Show the help information:
+
+```bash
+kubectl tg help
+```
+
+### Install CRDs independently (Optional)
+
+This step is optional and can be skipped if you have privileged permissions in your Kubernetes environment. The required components will be automatically installed during the Operator installation process.
+
+CustomResourceDefinitions (CRDs) are non-namespaced entities accessible across all namespaces. Installing CRDs requires privileged permissions from the Kubernetes cluster. If you prefer to install CRDs independently from the Operator installation, use the following commands:
+
+```bash
+kubectl apply -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml
+```
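+
+To confirm that the CRDs were registered, you can list them; the API group `graphdb.tigergraph.com` matches the `apiVersion` used by the TigerGraph CR examples in these docs:
+
+```bash
+kubectl get crds | grep graphdb.tigergraph.com
+```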
+
+### Install TigerGraph Operator
+
+To streamline the installation of the Operator and the deployment of a TigerGraph cluster, start by defining some environment variables:
+
+```bash
+export YOUR_NAMESPACE="tigergraph"
+export YOUR_CLUSTER_NAME="test-tg-cluster"
+export YOUR_SSH_KEY_SECRET_NAME="ssh-key-secret"
+```
+
+Next, install the TigerGraph Operator using the following command:
+
+```bash
+kubectl tg init -n ${YOUR_NAMESPACE}
+```
+
+To ensure the successful deployment of the operator, use this command:
+
+```bash
+kubectl wait deployment tigergraph-operator-controller-manager --for condition=Available=True --timeout=120s -n ${YOUR_NAMESPACE}
+```
+
+For comprehensive guidance, refer to the output from `kubectl tg init --help`:
+
+``` bash
+kubectl tg init --help
+Install the operator
+
+Examples:
+ # install the operator in the current namespace
+ kubectl tg init
+ # install the operator in the specified namespace
+ kubectl tg init --namespace tg-tenant1
+ # install the operator in the specified namespace, with specified helm repo and image pull secret
+ kubectl tg init --namespace tg-tenant1 --helm-repo https://yourhelmrepo.com --image-pull-secret yoursecret
+ # install the operator in the specified namespace, with specified operator version, watch name namespace, cpu and memory
+ kubectl tg init --version OPERATOR_VERSION --operator-size 3 --operator-watch-namespace tigergraph --operator-cpu 1000m --operator-memory 1024Mi --namespace tg-tenant1
+
+Options:
+ -n, --namespace : set namespace to deploy TG cluster, if not set, use the default namespace in context
+ --helm-repo : set the specified helm repo to install operator, default as https://dl.tigergraph.com/charts
+ --docker-registry : set docker registry to download tigergraph image, default as docker.io
+ --docker-image-repo : set docker image repo for image name, default as tigergraph.
+ -p, --image-pull-secret : set imagePullSecret of docker registry, default as tigergraph-operator-image-pull-secret
+ --image-pull-policy: set pull policy of image, available policy: IfNotPresent, Always, and Never, default is IfNotPresent
+ --operator-version: set TG K8S operator version
+ --operator-size : set the replicas of operator's deployment for high availability, default is 3
+ --operator-cpu : set request cpu of operator, default as 1000m
+ --operator-cpu-limit : limit cpu size of operator
+ --operator-memory : set request memory of operator, default as 1024Mi
+ --operator-memory-limit : limit memory size of operator
+ --operator-watch-namespace : set watch namespaces of operator, blank string as default indicate all namespace, multiple namespaces are separated by commas, as ns1\,ns2
+ --cluster-scope : set true to deploy operator with ClusterRole, set false to deploy with Role,
+ so that you can deploy mutiple operators in one cluster, default as true
+```
+
+## Step 3: Deploy a TigerGraph Cluster
+
+This section provides instructions on deploying a TigerGraph cluster using the kubectl-tg plugin.
+
+### Providing a Private SSH Key Pair for Enhanced Security
+
+Starting from Operator version 0.0.4, users are required to provide their private SSH key pair for enhanced security before creating a cluster. Follow these steps:
+
+- Create a Private SSH Key Pair File
+
+ Generate a private SSH key pair file with the following command:
+
+ ```bash
+ echo -e 'y\n' | ssh-keygen -b 4096 -t rsa -f $HOME/.ssh/tigergraph_rsa -q -N ''
+ ```
+
+- Create a Secret Object
+
+ Create a secret object based on the private SSH key file generated in Step 1. Ensure that the key name of the secret for the private SSH key is private-ssh-key, and the key name for the public SSH key is public-ssh-key. **Do not modify these key names**.
+
+ > [!IMPORTANT]
+ > The namespace of the Secret object must be the same as that of the TigerGraph cluster.
+
+ ```bash
+ kubectl create secret generic ${YOUR_SSH_KEY_SECRET_NAME} --from-file=private-ssh-key=$HOME/.ssh/tigergraph_rsa --from-file=public-ssh-key=$HOME/.ssh/tigergraph_rsa.pub --namespace ${YOUR_NAMESPACE}
+ ```
+
+ > [!IMPORTANT]
+  > For Operator versions 0.0.4 and above, when creating a cluster using the `kubectl tg create` command, you must set the `--private-key-secret` option to `${YOUR_SSH_KEY_SECRET_NAME}`.
+
+These steps enhance the security of your cluster by utilizing your private SSH key pair.
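+
+As an optional check (assuming `jq` is installed), confirm that the secret exposes exactly the two expected key names:
+
+```bash
+# Should list the keys "private-ssh-key" and "public-ssh-key"
+kubectl get secret ${YOUR_SSH_KEY_SECRET_NAME} -n ${YOUR_NAMESPACE} -o json | jq '.data | keys'
+```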
+
+### Specify the StorageClass Name
+
+Before creating the TigerGraph cluster with the Operator, it's necessary to specify the StorageClass, which defines various "classes" of storage available.
+
+You can determine the name of the StorageClass using the following command:
+
+```bash
+kubectl get storageclass
+NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
+standard (default) rancher.io/local-path Delete WaitForFirstConsumer false 144m
+```
+
+Note the StorageClass name (in this example, `standard`) and use it as the value of the `--storage-class` option when creating the cluster. This ensures that the appropriate StorageClass is assigned during TigerGraph cluster creation, optimizing storage provisioning and management.
+
+### Create a TigerGraph Cluster with Specific Options
+
+You can obtain the TigerGraph Docker image versions from [tigergraph-k8s](https://hub.docker.com/r/tigergraph/tigergraph-k8s/tags)
+
+Use the following command to create a new TigerGraph cluster with a free license:
+
+- Get and export the free license:
+
+ ```bash
+ export LICENSE=$(curl -L "ftp://ftp.graphtiger.com/lic/license3.txt" -o "/tmp/license3.txt" 2>/dev/null && cat /tmp/license3.txt)
+ ```
+
+ ```bash
+ kubectl tg create --cluster-name ${YOUR_CLUSTER_NAME} --private-key-secret ${YOUR_SSH_KEY_SECRET_NAME} --size 3 --ha 2 --version 3.9.1 --license ${LICENSE} \
+ --storage-class standard --storage-size 10G --cpu 2000m --memory 6Gi --namespace ${YOUR_NAMESPACE}
+ ```
+
+ To ensure the TigerGraph cluster has been successfully deployed, use the following commands:
+
+ ```bash
+ kubectl wait pods -l tigergraph.com/cluster-pod=${YOUR_CLUSTER_NAME} --for condition=Ready --timeout=15m --namespace ${YOUR_NAMESPACE}
+
+ kubectl wait --for=condition=complete --timeout=10m job/${YOUR_CLUSTER_NAME}-init-job --namespace ${YOUR_NAMESPACE}
+ ```
+
+## Step 4: Connect to a TigerGraph Cluster
+
+This section explains how to log into a TigerGraph cluster pod and access the `RESTPP` and `GUI` services.
+
+### Connect to a TigerGraph Cluster Pod
+
+To log into a single container within the TigerGraph cluster and execute commands like `gadmin status`, use the following command:
+
+```bash
+kubectl tg connect --cluster-name ${YOUR_CLUSTER_NAME} --namespace ${YOUR_NAMESPACE}
+```
+
+### Access TigerGraph Suite
+
+- Query the external service address:
+
+ ```bash
+ export GUI_SERVICE_ADDRESS=$(kubectl get svc/${YOUR_CLUSTER_NAME}-gui-external-service --namespace ${YOUR_NAMESPACE} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
+
+ echo $GUI_SERVICE_ADDRESS
+ 172.18.255.201
+ ```
+
+- Verify the API service:
+
+ ```bash
+ curl http://${GUI_SERVICE_ADDRESS}:14240/api/ping
+
+ {"error":false,"message":"pong","results":null}
+ ```
+
+To access the TigerGraph Suite, open it in your browser using the following URL: http://${GUI_SERVICE_ADDRESS}:14240, replacing `GUI_SERVICE_ADDRESS` with the actual service address.
+
+### Access RESTPP API Service
+
+- Query the external service address:
+
+ ```bash
+ export RESTPP_SERVICE_ADDRESS=$(kubectl get svc/${YOUR_CLUSTER_NAME}-rest-external-service --namespace ${YOUR_NAMESPACE} -o=jsonpath='{.status.loadBalancer.ingress[0].ip}')
+
+ echo $RESTPP_SERVICE_ADDRESS
+ 172.18.255.200
+ ```
+
+- Verify the RESTPP API service:
+
+ ```bash
+ curl http://${RESTPP_SERVICE_ADDRESS}:9000/echo
+
+ {"error":false, "message":"Hello GSQL"}
+ ```
+
+## Step 5: Operate a TigerGraph Cluster
+
+### Update the Resources (CPU and Memory) of the TigerGraph Cluster
+
+Use the following command to update the CPU and memory resources of the TigerGraph cluster:
+
+```bash
+kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --cpu 3 --memory 8Gi --cpu-limit 3 --memory-limit 8Gi --namespace ${YOUR_NAMESPACE}
+```
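+
+Changing resources triggers a rolling update of the cluster's underlying StatefulSet. One way to watch it complete is to reuse the rollout check shown in the upgrade step below (cluster and namespace variables as defined above):
+
+```bash
+kubectl rollout status --watch --timeout=900s statefulset/${YOUR_CLUSTER_NAME} --namespace ${YOUR_NAMESPACE}
+```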
+
+### Scale a TigerGraph Cluster
+
+> [!WARNING]
+> TigerGraph's exceptional performance comes with certain considerations regarding high availability during scaling operations. Currently, TigerGraph does not provide dedicated high-availability scale support, and some downtime is involved.
+
+Before scaling out the cluster, ensure you scale out the corresponding node pool to provide enough resources for the new instances.
+
+Use the following command to scale the TigerGraph cluster:
+
+```bash
+kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --size 4 --ha 2 --namespace ${YOUR_NAMESPACE}
+```
+
+The above command scales the cluster to a size of 4 with an HA factor of 2.
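+
+Expansion is carried out by an Operator-managed Kubernetes Job. A sketch for watching its pods, based on the job-pod label convention `tigergraph.com/cluster-job={CLUSTER_NAME}-{JOB_TYPE}-job` used by the Operator (JOB_TYPE `expand` here):
+
+```bash
+kubectl get pods -l tigergraph.com/cluster-job=${YOUR_CLUSTER_NAME}-expand-job -n ${YOUR_NAMESPACE} -w
+```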
+
+### Upgrade a TigerGraph Cluster
+
+> [!WARNING]
+> TigerGraph's exceptional performance comes with certain considerations regarding high availability during upgrading operations. Currently, TigerGraph does not provide dedicated high-availability upgrade support, and some downtime is involved.
+
+Upgrading a TigerGraph cluster is supported from a lower version to a higher version.
+
+Assuming the current version of the cluster is 3.9.1, you can upgrade it to version 3.9.2 with the following command:
+
+```bash
+kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --version 3.9.2 --namespace ${YOUR_NAMESPACE}
+```
+
+To ensure the successful upgrade of the TigerGraph cluster, use these commands:
+
+```bash
+kubectl rollout status --watch --timeout=900s statefulset/${YOUR_CLUSTER_NAME} --namespace ${YOUR_NAMESPACE}
+
+kubectl wait --for=condition=complete --timeout=15m job/${YOUR_CLUSTER_NAME}-upgrade-job --namespace ${YOUR_NAMESPACE}
+```
+
+## Step 6: Destroy the TigerGraph Cluster and the Kubernetes Operator
+
+### Destroy the TigerGraph Cluster
+
+To guard against accidental data loss, deleting a cluster does not remove the Persistent Volume Claims (PVCs) and Persistent Volumes (PVs) associated with it. If you intend to delete these components as well, you must delete the PVCs manually.
+
+- Delete the TigerGraph cluster and keep the Persistent Volumes (PVs):
+
+ ```bash
+ kubectl tg delete --cluster-name ${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+ ```
+
+- Delete PVCs of the specific cluster:
+
+ ```bash
+ # to figure out the pvcs you want to delete by specific labels of pvc.
+ kubectl get pvc -l tigergraph.com/cluster-name=${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+
+ # delete the pvcs related to the specified cluster
+ kubectl delete pvc -l tigergraph.com/cluster-name=${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+ ```
+
+### Uninstall TigerGraph Operator
+
+Use the provided command below to uninstall the TigerGraph Kubernetes Operator within a specified namespace:
+
+```bash
+kubectl tg uninstall -n ${YOUR_NAMESPACE}
+```
+
+### Uninstall CRD
+
+> [!NOTE]
+> Replace the variable `${OPERATOR_VERSION}` with the Operator version you installed.
+
+```bash
+kubectl delete -f https://dl.tigergraph.com/k8s/${OPERATOR_VERSION}/tg-operator-crd.yaml
+```
+
+## Step 7: Destroy the Kubernetes Cluster
+
+If you created the Kubernetes cluster using kind, use the following command to delete it:
+
+```bash
+kind delete cluster
+```
+
+## See also
+
+If you are interested in deploying a TigerGraph cluster in a production environment, refer to the following documents:
+
+- [Deploy TigerGraph on AWS EKS](../03-deploy/tigergraph-on-eks.md)
+- [Deploy TigerGraph on Google Cloud GKE](../03-deploy/tigergraph-on-gke.md)
+- [Deploy TigerGraph on Red Hat OpenShift](../03-deploy/tigergraph-on-openshift.md)
diff --git a/k8s/docs/03-deploy/affinity-use-cases.md b/k8s/docs/03-deploy/affinity-use-cases.md
new file mode 100644
index 00000000..f74f0527
--- /dev/null
+++ b/k8s/docs/03-deploy/affinity-use-cases.md
@@ -0,0 +1,899 @@
+
+# NodeSelector, Affinity and Toleration Use Cases
+
+- [Basic Knowledge](#basic-knowledge)
+ - [Which labels are TG using](#which-labels-are-tg-using)
+ - [TigerGraph Cluster Pods](#tigergraph-cluster-pods)
+ - [TigerGraph Job Pods](#tigergraph-job-pods)
+ - [TigerGraph Backup/Restore Job Pods](#tigergraph-backuprestore-job-pods)
+- [NodeSelector](#nodeselector)
+ - [Example: schedule pods to nodes with disktype=ssd](#example-schedule-pods-to-nodes-with-disktypessd)
+- [Affinity](#affinity)
+ - [NodeAffinity](#nodeaffinity)
+ - [Preferred Node Affinity](#preferred-node-affinity)
+ - [Example: Difference between Preferred Affinity and Required Affinity](#example-difference-between-preferred-affinity-and-required-affinity)
+ - [Weighted Affinity and Logical Operators](#weighted-affinity-and-logical-operators)
+ - [Combining Rules with Logical Operators](#combining-rules-with-logical-operators)
+ - [Examples: Combining Multiple Rules with Different Weights](#examples-combining-multiple-rules-with-different-weights)
+ - [Inter-pod Affinity and Anti-Affinity](#inter-pod-affinity-and-anti-affinity)
+ - [Example: Avoiding Scheduling TigerGraph Pods on the Same VM Instance](#example-avoiding-scheduling-tigergraph-pods-on-the-same-vm-instance)
+ - [Scheduling Pods to Different Zones](#scheduling-pods-to-different-zones)
+- [Toleration](#toleration)
+ - [Example: Implementing User Groups with Taints and Tolerations](#example-implementing-user-groups-with-taints-and-tolerations)
+- [Notice](#notice)
+
+Basic Knowledge
+===============
+
+In a Kubernetes cluster, every node is equipped with labels that provide information about the node's attributes and capabilities. Some labels are assigned automatically by Kubernetes itself (see [Well-Known Labels, Annotations and Taints](https://kubernetes.io/docs/reference/labels-annotations-taints/)), while others can be added manually by administrators. These labels play a crucial role in workload distribution, resource allocation, and overall cluster management. You can also assign your own labels to nodes manually.
+
+To view all labels associated with nodes, you can use the following command:
+```bash
+kubectl get nodes --show-labels
+```
+Here's an example of node labels in a Google Kubernetes Engine (GKE) cluster:
+
+```bash
+NAME STATUS ROLES AGE VERSION LABELS
+gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g Ready 7m57s v1.24.9-gke.3200 beta.kubernetes.io/arch=amd64,beta.kubernetes.io/instance-type=e2-standard-8,beta.kubernetes.io/os=linux,cloud.google.com/gke-boot-disk=pd-balanced,cloud.google.com/gke-container-runtime=containerd,cloud.google.com/gke-cpu-scaling-level=8,cloud.google.com/gke-logging-variant=DEFAULT,cloud.google.com/gke-max-pods-per-node=110,cloud.google.com/gke-nodepool=default-pool,cloud.google.com/gke-os-distribution=cos,cloud.google.com/machine-family=e2,cloud.google.com/private-node=false,failure-domain.beta.kubernetes.io/region=us-central1,failure-domain.beta.kubernetes.io/zone=us-central1-a,kubernetes.io/arch=amd64,kubernetes.io/hostname=gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g,kubernetes.io/os=linux,node.kubernetes.io/instance-type=e2-standard-8,topology.gke.io/zone=us-central1-a,topology.kubernetes.io/region=us-central1,topology.kubernetes.io/zone=us-central1-a
+...
+```
+
+To manually assign labels to nodes, you can use the kubectl label command. For example, to assign a label named environment with the value production to nodes NODE_1 and NODE_2, you would use the following command:
+
+```bash
+kubectl label nodes NODE_1 NODE_2 LABEL_KEY=LABEL_VALUE
+```
+These labels can then be utilized in affinity rules and other scheduling configurations to ensure that pods are placed on the most suitable nodes based on your specific requirements.
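+
+For example, after labeling, you can list the nodes that carry a given label (the key/value pair below is the placeholder used above):
+
+```bash
+kubectl get nodes -l LABEL_KEY=LABEL_VALUE
+```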
+
+
+Which labels are TG using
+-------------------------
+
+TigerGraph utilizes specific labels for different purposes in Kubernetes:
+
+### TigerGraph Cluster Pods
+
+| Label | Usage |
+|----------------------------------------|---------------------------------------------------------------------|
+| `tigergraph.com/cluster-name=CLUSTER_NAME` | Indicates which cluster the pod belongs to. |
+| `tigergraph.com/cluster-pod=CLUSTER_NAME` | Indicates that the pod belongs to a cluster and not a Job. |
+| `tigergraph.com/gui-service=true` | Labeled on pods running the GUI service. |
+| `tigergraph.com/restpp-service=true` | Labeled on pods running the RESTPP service. |
+
+### TigerGraph Job Pods
+
+| Label | Usage |
+|-------------------------------------------------|------------------------------------------------------------------------------|
+| `tigergraph.com/cluster-name=CLUSTER_NAME` | Indicates which cluster the job is for. |
+| `tigergraph.com/cluster-job={CLUSTER_NAME}-{JOB_TYPE}-job` | Specifies the type of job and the cluster it's associated with (JOB_TYPE: init, upgrade, expand, shrink-pre, shrink-post). |
+
+### TigerGraph Backup/Restore Job Pods
+
+| Label | Usage |
+|--------------------------------------------------|------------------------------------------------------------------------------|
+| `tigergraph.com/backup-cluster=CLUSTER_NAME` | Labeled on pods running backup jobs for the specified cluster. |
+| `tigergraph.com/restore-cluster=CLUSTER_NAME` | Labeled on pods running restore jobs for the specified cluster. |
+
+These labels help identify the purpose and affiliation of various pods within the Kubernetes environment, making it easier to manage and monitor different components of TigerGraph clusters, jobs, backups, and restores.
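+
+For instance, assuming a cluster named `test-cluster` running in the `tigergraph` namespace, the labels above can be used as selectors:
+
+```bash
+# List all pods that belong to the cluster (excluding job pods)
+kubectl get pods -l tigergraph.com/cluster-pod=test-cluster -n tigergraph
+
+# List only the pods currently running the RESTPP service
+kubectl get pods -l tigergraph.com/restpp-service=true -n tigergraph
+```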
+
+NodeSelector
+============
+
+NodeSelector in the TigerGraph Custom Resource (CR) allows you to control the scheduling of pods for the TigerGraph cluster. When you define a NodeSelector, the pods related to the TigerGraph cluster will only be scheduled on nodes that have labels matching the NodeSelector criteria. This feature ensures that the TigerGraph cluster pods are placed on nodes that meet your specified requirements. (To learn more about NodeSelector, see [Assign Pods to Nodes](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes/).)
+
+It's important to note that NodeSelector only applies to pods directly associated with the TigerGraph cluster. Other pods running tasks such as init, upgrade, expand, or shrink jobs will not be influenced by the NodeSelector settings.
+
+
+Example: schedule pods to nodes with disktype=ssd
+-------------------------------------------------
+
+In this example, we will demonstrate how to use the NodeSelector feature to schedule pods to nodes with a specific label, such as disktype=ssd. This example assumes you are using Google Kubernetes Engine (GKE).
+
+Use `kubectl get nodes` to list all nodes:
+
+```bash
+> kubectl get nodes
+
+NAME STATUS ROLES AGE VERSION
+gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g Ready 10m v1.24.9-gke.3200
+gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q Ready 10m v1.24.9-gke.3200
+gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-9t5m Ready 10m v1.24.9-gke.3200
+gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-b99l Ready 10m v1.24.9-gke.3200
+gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-lff2 Ready 10m v1.24.9-gke.3200
+gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-wh8g Ready 10m v1.24.9-gke.3200
+```
+
+Add label `disktype=ssd` to 3 of the 6 nodes:
+
+```bash
+kubectl label nodes gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g \
+ gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q \
+ gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-9t5m \
+ disktype=ssd
+```
+Replace the node names with the actual names of the nodes you want to label as SSD.
+
+First, we create a TG cluster without any scheduling rules, using the following CR:
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-cluster
+spec:
+ replicas: 3
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: IfNotPresent
+ privateKeyName: ssh-key-secret
+ listener:
+ type: LoadBalancer
+ resources:
+ requests:
+ cpu: 2
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ storageClassName: standard
+ resources:
+ requests:
+ storage: 10G
+ initTGConfig:
+ ha: 1
+ license: YOUR_LICENSE
+ version: 3.9.2
+ hashBucketInBit: 5
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: IfNotPresent
+```
+Save the CR to a file and apply it with `kubectl apply -f <your-file>.yaml`.
+
+Use `kubectl describe pod` to see which node each pod is scheduled to:
+
+```bash
+test-cluster-0:
+Node: gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-b99l/10.128.0.68
+
+test-cluster-1:
+Node: gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-wh8g/10.128.0.67
+
+test-cluster-2 :
+Node: gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q/10.128.0.73
+```
+
+Note that the pods are scheduled to three random nodes.
+
+
+Then we create a cluster with NodeSelector:
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-nodeselector
+spec:
+ replicas: 3
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: IfNotPresent
+ privateKeyName: ssh-key-secret
+ listener:
+ type: LoadBalancer
+ resources:
+ requests:
+ cpu: 2
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ storageClassName: standard
+ resources:
+ requests:
+ storage: 10G
+ initTGConfig:
+ ha: 1
+ license: YOUR_LICENSE
+ version: 3.9.2
+ hashBucketInBit: 5
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: IfNotPresent
+ affinityConfiguration:
+ nodeSelector:
+ disktype: ssd
+```
+Save the CR to a file and apply it with `kubectl apply -f <your-file>.yaml`.
+
+In this configuration, there is an additional field `.spec.affinityConfiguration`, which is used to define NodeSelector.
+
+```yaml
+ affinityConfiguration:
+ nodeSelector:
+ disktype: ssd
+```
+
+That means the pods can only be scheduled to nodes with label `disktype=ssd`.
+
+We can use `kubectl describe pod` to see which node they are scheduled to:
+
+```bash
+test-nodeselector-0:
+Node: gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q/10.128.0.73
+
+test-nodeselector-1:
+Node: gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g/10.128.0.90
+
+test-nodeselector-2:
+Node: gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q/10.128.0.73
+```
+
+Both `gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q` and `gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g` possess the specified label.
+
+
+Affinity
+========
+
+Please note that affinity settings exclusively impact the pods within the TigerGraph cluster. Any other pods executing init/upgrade/expand/shrink tasks will remain unaffected by these affinity configurations.
+
+NodeAffinity
+------------
+
+Additionally, TigerGraph pods can be strategically allocated to nodes with specific labels through the use of NodeAffinity. To gain a deeper understanding of Node Affinity, you can refer to the official Kubernetes documentation: [Assign Pods to Nodes using Node Affinity](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/)
+
+Here is an illustrative example of a CR (Custom Resource) configuration implementing NodeAffinity:
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-nodeaffinity
+spec:
+ replicas: 3
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: IfNotPresent
+ privateKeyName: ssh-key-secret
+ listener:
+ type: LoadBalancer
+ resources:
+ requests:
+ cpu: 2
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ storageClassName: standard
+ resources:
+ requests:
+ storage: 10G
+ initTGConfig:
+ ha: 1
+ license: YOUR_LICENSE
+ version: 3.9.2
+ hashBucketInBit: 5
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: IfNotPresent
+ affinityConfiguration:
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: disktype
+ operator: In
+ values:
+ - ssd
+```
+In this example, the `nodeAffinity` section inside `affinityConfiguration` requires that pods be scheduled only on nodes labeled `disktype=ssd`; because the rule is `IgnoredDuringExecution`, pods that are already running are not evicted if a node's labels change later.
+
+Let's take a closer look at the `.spec.affinityConfiguration` section:
+
+```yaml
+affinityConfiguration:
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: disktype
+ operator: In
+ values:
+ - ssd
+```
+Within the `affinityConfiguration`, the setting `requiredDuringSchedulingIgnoredDuringExecution` is employed. This means the scheduler will only place our pods on nodes possessing the specified label; if no such node is available, the pods stay Pending. As the name suggests, the rule is enforced at scheduling time only and is ignored once the pods are running.
+
+
+
+You can use the following command to observe the nodes to which the pods are scheduled:
+
+```bash
+> kubectl get pods --output=wide
+NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
+test-nodeaffinity-0 0/1 Running 0 17s 10.36.5.8 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g
+test-nodeaffinity-1 0/1 Running 0 17s 10.36.3.8 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q
+test-nodeaffinity-2 0/1 Running 0 17s 10.36.5.9 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g
+```
+
+Notice that both `gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g` and `gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q` possess the specified label, indicating the successful enforcement of node affinity.
+
+### Preferred Node Affinity
+
+For a deeper understanding of preferred node affinity, you can explore the document: [Schedule a Pod using preferred node affinity](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/#schedule-a-pod-using-preferred-node-affinity).
+
+It's crucial to differentiate between the `preferredDuringSchedulingIgnoredDuringExecution` and `requiredDuringSchedulingIgnoredDuringExecution` fields. When utilizing `requiredDuringSchedulingIgnoredDuringExecution`, pods will remain **unscheduled** if an insufficient number of nodes adhere to the specified rules. On the other hand, opting for `preferredDuringSchedulingIgnoredDuringExecution` indicates that the Kubernetes scheduler will **attempt** to schedule pods onto nodes aligned with the rules; if no nodes fulfill the criteria, the pods will still be scheduled onto nodes that do not match them.
+
+#### Example: Difference between Preferred Affinity and Required Affinity
+
+To illustrate the contrast between preferred affinity and required affinity, let's consider a scenario where we label only one node and create a TigerGraph cluster with specific resource requirements.
+
+```bash
+kubectl label nodes gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g disktype=ssd
+```
+
+We create a TigerGraph cluster with resource requests that would limit one pod per node due to CPU constraints. We use `requiredDuringSchedulingIgnoredDuringExecution` to ensure nodes are selected based on the disktype label.
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-nodeaffinity
+spec:
+ replicas: 3
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: IfNotPresent
+ privateKeyName: ssh-key-secret
+ listener:
+ type: LoadBalancer
+ resources:
+ requests:
+ cpu: 4
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ storageClassName: standard
+ resources:
+ requests:
+ storage: 10G
+ initTGConfig:
+ ha: 1
+ license: YOUR_LICENSE
+ version: 3.9.2
+ hashBucketInBit: 5
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: IfNotPresent
+ affinityConfiguration:
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: disktype
+ operator: In
+ values:
+ - ssd
+```
+
+Running `kubectl get pods --output=wide` provides the following output:
+
+```bash
+NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
+test-nodeaffinity-0 1/1 Running 0 107s 10.36.5.12 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g
+test-nodeaffinity-1 0/1 Pending 0 107s
+test-nodeaffinity-2 0/1 Pending 0 106s
+```
+In this output, you can observe that only one pod has been scheduled to the node labeled with `disktype=ssd`. The remaining two pods are pending due to resource constraints, as there is only one node with the required label and that node does not have sufficient available CPU resources to accommodate all pods.
+
+You can utilize the following command to gain insights into why `test-nodeaffinity-1` is in a pending state:
+```bash
+kubectl describe pod test-nodeaffinity-1
+```
+This command will provide detailed information about the pod's status, including any events and messages related to its scheduling and resource allocation. In this specific case, the output will indicate the reason for the pod's pending status, such as insufficient CPU resources and failure to match the pod's node affinity or selector.
+
+Here is an example of the type of information you might encounter:
+```bash
+Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Normal NotTriggerScaleUp 2m16s cluster-autoscaler pod didn't trigger scale-up:
+ Warning FailedScheduling 101s (x2 over 2m17s) default-scheduler 0/6 nodes are available: 1 Insufficient cpu, 5 node(s) didn't match Pod's node affinity/selector. preemption: 0/6 nodes are available: 1 No preemption victims found for incoming pod, 5 Preemption is not helpful for scheduling.
+```
+This output indicates that the pod is pending due to insufficient CPU resources (`Insufficient cpu`) and the fact that the node affinity or selector criteria are not being met by any available nodes (`node(s) didn't match Pod's node affinity/selector`).
+
+Now we edit the above CR and use `preferredDuringSchedulingIgnoredDuringExecution` instead:
+
+```yaml
+#......
+#The same as above one
+ affinityConfiguration:
+ affinity:
+ nodeAffinity:
+ preferredDuringSchedulingIgnoredDuringExecution:
+ - weight: 1
+ preference:
+ matchExpressions:
+ - key: disktype
+ operator: In
+ values:
+ - ssd
+```
+Upon checking pod status with `kubectl get pods --output=wide`, you notice the following:
+```bash
+> kubectl get pods --output=wide
+NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
+test-nodeaffinity-0 0/1 ContainerCreating 0 2s gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g
+test-nodeaffinity-1 0/1 ContainerCreating 0 2s gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-b99l
+test-nodeaffinity-2 0/1 ContainerCreating 0 1s gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q
+```
+
+ In the provided output, only one pod has been successfully scheduled to a node with the specified label (`disktype=ssd`). The other pods were scheduled to nodes without the specific label, which demonstrates the behavior of `preferredDuringSchedulingIgnoredDuringExecution`. This affinity setting attempts to schedule pods according to the defined preferences, but it is not a strict requirement. If nodes meeting the preferences are unavailable, the pods will still be scheduled on other nodes.
+
+
+### Weighted Affinity and Logical Operators
+
+The `weight` attribute, ranging from 1 to 100, can be assigned to each instance of the `preferredDuringSchedulingIgnoredDuringExecution` affinity type. This weight represents the preference given to a particular affinity rule. When all other scheduling requirements for a Pod are met, the scheduler calculates a score by summing up the weights of satisfied preferred rules. This score contributes to the overall prioritization of nodes, with higher scores leading to higher scheduling priority for the Pod.
+
+### Combining Rules with Logical Operators
+
+The `operator` field allows you to employ logical operators to determine how Kubernetes interprets the affinity rules. Various operators such as `In`, `NotIn`, `Exists`, `DoesNotExist`, `Gt`, and `Lt` can be used. These operators can be combined to craft nuanced rules that guide the scheduling behavior.
+
+When using both `nodeSelector` and `nodeAffinity`, both sets of rules must be satisfied for the Pod to be scheduled onto a node.
+
+In scenarios involving multiple terms associated with `nodeAffinity` types within `nodeSelectorTerms`, a Pod can be scheduled onto a node if any of the specified terms are satisfied (terms are ORed).
+
+For a single term within `nodeSelectorTerms`, if multiple expressions are present in a single `matchExpressions` field, the Pod can only be scheduled onto a node if all the expressions are satisfied (expressions are ANDed).
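+
+For illustration only (a minimal sketch reusing labels from the earlier examples, not a complete TigerGraph CR), the two entries under `nodeSelectorTerms` below are ORed, while the two expressions inside the first term are ANDed:
+
+```yaml
+affinity:
+  nodeAffinity:
+    requiredDuringSchedulingIgnoredDuringExecution:
+      nodeSelectorTerms:
+      # Term 1: the node must have disktype=ssd AND the physical-machine label (expressions are ANDed)
+      - matchExpressions:
+        - key: disktype
+          operator: In
+          values:
+          - ssd
+        - key: physical-machine
+          operator: Exists
+      # Term 2: OR the node may instead be in zone us-east1-b (terms are ORed)
+      - matchExpressions:
+        - key: topology.kubernetes.io/zone
+          operator: In
+          values:
+          - us-east1-b
+```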
+
+
+#### Examples: Combining Multiple Rules with Different Weights
+
+In this scenario, we have labeled nodes, with two labeled as `disktype=ssd` and two as `physical-machine=true`. We assign a weight of 1 to the `disktype=ssd` rule and a weight of 50 to the `physical-machine=true` rule. The objective is to demonstrate how to combine these rules effectively.
+
+Nodes labeled with `disktype=ssd`:
+
+```bash
+gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g
+gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q
+```
+
+Nodes labeled with `physical-machine=true`:
+
+```bash
+gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-9t5m
+gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-b99l
+```
+
+Utilizing the following affinity configuration:
+
+```yaml
+affinityConfiguration:
+ affinity:
+ nodeAffinity:
+ preferredDuringSchedulingIgnoredDuringExecution:
+ - weight: 1
+ preference:
+ matchExpressions:
+ - key: disktype
+ operator: In
+ values:
+ - ssd
+ - weight: 50
+ preference:
+ matchExpressions:
+ - key: physical-machine
+ operator: Exists
+```
+
+Running `kubectl get pods --output=wide` yields:
+
+```plaintext
+NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
+test-nodeaffinity-0 0/1 Running 0 20s 10.36.2.8 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-b99l
+test-nodeaffinity-1 0/1 Running 0 19s 10.36.4.5 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-9t5m
+test-nodeaffinity-2 0/1 Running 0 19s 10.36.3.12 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q
+```
+
+The pods are preferentially scheduled to nodes with the `physical-machine=true` label, as specified by the rule with a weight of 50. Two out of three pods are successfully scheduled on nodes meeting this rule. Additionally, one pod is scheduled to a node with the label `disktype=ssd`.
+
+Inter-pod Affinity and Anti-Affinity
+-------------------------------------
+
+[Inter-pod affinity and anti-affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#inter-pod-affinity-and-anti-affinity) offer the capability to restrict the nodes on which your Pods are scheduled based on the labels of other Pods that are already running on those nodes. This is in contrast to node affinity, which is based on the labels of the nodes themselves.
+
+Similar to node affinity, inter-pod affinity and anti-affinity come in two types:
+- `requiredDuringSchedulingIgnoredDuringExecution`
+- `preferredDuringSchedulingIgnoredDuringExecution`
+
+Inter-pod affinity and anti-affinity rules follow this pattern: "This Pod should (or, in the case of anti-affinity, should not) run on an X node if that X node is already running one or more Pods that meet rule Y." In this context, X represents a topology domain such as a node, rack, cloud provider zone or region, and Y represents the rule that Kubernetes aims to satisfy.
+
+These rules (Y) are expressed as label selectors, which can be associated with an optional list of namespaces. Since Pods are namespaced objects in Kubernetes, their labels inherently carry namespace information. Any label selectors used for Pod labels must explicitly specify the namespaces where Kubernetes should search for those labels.
+
+To define the topology domain (X), a `topologyKey` is used. The `topologyKey` serves as the key for the node label that the system uses to identify the relevant domain. Careful consideration should be given to the choice of `topologyKey`. For instance, in Google Kubernetes Engine (GKE), selecting `kubernetes.io/hostname` as the topology key enables scheduling Pods to different virtual machine instances. Alternatively, using `topology.kubernetes.io/region` as the topology key allows Pods to be scheduled across different regions.
+
+If you have specific requirements, such as the need for Pods to be distributed across certain domains, thoughtful selection of the appropriate `topologyKey` ensures that the scheduling behavior aligns with your needs.
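+
+As a minimal sketch (not a full TigerGraph CR; the namespace name is only an example), a pod-affinity term combines a label selector, an optional list of namespaces to search, and a `topologyKey`:
+
+```yaml
+affinity:
+  podAffinity:
+    requiredDuringSchedulingIgnoredDuringExecution:
+    - labelSelector:
+        matchExpressions:
+        - key: tigergraph.com/cluster-pod
+          operator: In
+          values:
+          - test-cluster
+      namespaces:                 # optional: namespaces in which to look for matching Pods
+      - tigergraph
+      topologyKey: kubernetes.io/hostname
+```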
+
+### Example: Avoiding Scheduling TigerGraph Pods on the Same VM Instance
+
+In this example, we'll explore how to prevent the scheduling of TigerGraph pods on the same virtual machine (VM) instance. Each TigerGraph pod is uniquely labeled with `tigergraph.com/cluster-pod=${CLUSTER_NAME}`, which designates the cluster it belongs to. We will utilize this label to create the scheduling rule.
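+
+Once a cluster is running, you can confirm this label on its pods with a command like the following (using the cluster name from this example):
+
+```bash
+# List only the pods that belong to the test-cluster TigerGraph cluster and show their labels
+kubectl get pods -l tigergraph.com/cluster-pod=test-cluster --show-labels
+```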
+
+Consider the following Kubernetes resource definition:
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-cluster
+spec:
+ replicas: 3
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: IfNotPresent
+ privateKeyName: ssh-key-secret
+ listener:
+ type: LoadBalancer
+ resources:
+ requests:
+ cpu: 2
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ storageClassName: standard
+ resources:
+ requests:
+ storage: 10G
+ initTGConfig:
+ ha: 1
+ license: YOUR_LICENSE
+ version: 3.9.2
+ hashBucketInBit: 5
+ initJob:
+ image: docker.io/tginternal/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: IfNotPresent
+ affinityConfiguration:
+ affinity:
+ podAntiAffinity:
+ preferredDuringSchedulingIgnoredDuringExecution:
+ - weight: 100
+ podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: tigergraph.com/cluster-pod
+ operator: In
+ values:
+ - test-cluster
+ topologyKey: kubernetes.io/hostname
+```
+
+This configuration enforces the rule that TigerGraph pods should not be scheduled on VM instances that are already hosting other TigerGraph pods belonging to the same cluster (`test-cluster`). However, in cases where there are insufficient nodes available, more than one TigerGraph pod may still be scheduled on the same VM instance.
+
+By leveraging the `podAntiAffinity` feature with a preferred scheduling strategy, you ensure that TigerGraph pods are spread across different VM instances within the cluster to enhance fault tolerance and resource distribution.
+
+
+Create a TigerGraph cluster with the above CR and check which nodes the pods are scheduled to:
+
+```bash
+> kubectl get pods --output=wide
+
+NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
+test-cluster-0 0/1 ContainerCreating 0 8s gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-wh8g
+test-cluster-1 0/1 ContainerCreating 0 8s gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g
+test-cluster-2 0/1 Running 0 8s 10.36.2.9 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-b99l
+```
+
+The output shows that the TigerGraph pods were scheduled to different nodes, demonstrating the successful application of the `podAntiAffinity` rule.
+
+We can also require them to be scheduled on nodes that do not have pods of another TG cluster:
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-cluster
+spec:
+ replicas: 3
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: IfNotPresent
+ privateKeyName: ssh-key-secret
+ listener:
+ type: LoadBalancer
+ resources:
+ requests:
+ cpu: 1
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ storageClassName: standard
+ resources:
+ requests:
+ storage: 10G
+ initTGConfig:
+ ha: 1
+ license: YOUR_LICENSE
+ version: 3.9.2
+ hashBucketInBit: 5
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: IfNotPresent
+ affinityConfiguration:
+ affinity:
+ podAntiAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ - labelSelector:
+ matchExpressions:
+ - key: tigergraph.com/cluster-pod
+ operator: Exists
+ topologyKey: kubernetes.io/hostname
+```
+
+This requires the scheduler to place pods of test-cluster only on nodes that are not running any pods belonging to another TG cluster.
+
+For example, we already have a TG cluster test-nodeaffinity and we want to create a new TG cluster named test-cluster. We don't want pods of test-cluster to be scheduled to nodes that are running pods of test-nodeaffinity.
+
+```bash
+> kubectl get pods --output=wide
+
+NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
+test-cluster-0 0/1 ContainerCreating 0 9s gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-wh8g
+test-cluster-1 0/1 ContainerCreating 0 9s gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g
+test-cluster-2 0/1 Running 0 9s 10.36.1.19 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-wh8g
+test-nodeaffinity-0 1/1 Running 0 85m 10.36.2.8 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-b99l
+test-nodeaffinity-1 1/1 Running 0 85m 10.36.4.5 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-9t5m
+test-nodeaffinity-2 1/1 Running 0 85m 10.36.3.12 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q
+```
+
+### Scheduling Pods to Different Zones
+
+We create an **OpenShift Cluster** which has one master node and five worker nodes.
+
+```bash
+> kubectl get nodes --show-labels
+
+NAME STATUS ROLES AGE VERSION LABELS
+tg-k8s-openshift-1024-5jz2w-master-0 Ready master 95m v1.23.5+012e945 beta.kubernetes.io/arch=amd64,beta.kubernetes.io/instance-type=n2-standard-4,beta.kubernetes.io/os=linux,failure-domain.beta.kubernetes.io/region=us-east1,failure-domain.beta.kubernetes.io/zone=us-east1-b,kubernetes.io/arch=amd64,kubernetes.io/hostname=tg-k8s-openshift-1024-5jz2w-master-0,kubernetes.io/os=linux,node-role.kubernetes.io/master=,node.kubernetes.io/instance-type=n2-standard-4,node.openshift.io/os_id=rhcos,topology.gke.io/zone=us-east1-b,topology.kubernetes.io/region=us-east1,topology.kubernetes.io/zone=us-east1-b
+tg-k8s-openshift-1024-5jz2w-worker-b-w96n6 Ready worker 84m v1.23.5+012e945 beta.kubernetes.io/arch=amd64,beta.kubernetes.io/instance-type=e2-standard-8,beta.kubernetes.io/os=linux,failure-domain.beta.kubernetes.io/region=us-east1,failure-domain.beta.kubernetes.io/zone=us-east1-b,kubernetes.io/arch=amd64,kubernetes.io/hostname=tg-k8s-openshift-1024-5jz2w-worker-b-w96n6,kubernetes.io/os=linux,node-role.kubernetes.io/worker=,node.kubernetes.io/instance-type=e2-standard-8,node.openshift.io/os_id=rhcos,topology.gke.io/zone=us-east1-b,topology.kubernetes.io/region=us-east1,topology.kubernetes.io/zone=us-east1-b
+tg-k8s-openshift-1024-5jz2w-worker-b-xzrf9 Ready worker 84m v1.23.5+012e945 beta.kubernetes.io/arch=amd64,beta.kubernetes.io/instance-type=e2-standard-8,beta.kubernetes.io/os=linux,failure-domain.beta.kubernetes.io/region=us-east1,failure-domain.beta.kubernetes.io/zone=us-east1-b,kubernetes.io/arch=amd64,kubernetes.io/hostname=tg-k8s-openshift-1024-5jz2w-worker-b-xzrf9,kubernetes.io/os=linux,node-role.kubernetes.io/worker=,node.kubernetes.io/instance-type=e2-standard-8,node.openshift.io/os_id=rhcos,topology.gke.io/zone=us-east1-b,topology.kubernetes.io/region=us-east1,topology.kubernetes.io/zone=us-east1-b
+tg-k8s-openshift-1024-5jz2w-worker-c-456wl Ready worker 84m v1.23.5+012e945 beta.kubernetes.io/arch=amd64,beta.kubernetes.io/instance-type=e2-standard-8,beta.kubernetes.io/os=linux,failure-domain.beta.kubernetes.io/region=us-east1,failure-domain.beta.kubernetes.io/zone=us-east1-c,kubernetes.io/arch=amd64,kubernetes.io/hostname=tg-k8s-openshift-1024-5jz2w-worker-c-456wl,kubernetes.io/os=linux,node-role.kubernetes.io/worker=,node.kubernetes.io/instance-type=e2-standard-8,node.openshift.io/os_id=rhcos,topology.gke.io/zone=us-east1-c,topology.kubernetes.io/region=us-east1,topology.kubernetes.io/zone=us-east1-c
+tg-k8s-openshift-1024-5jz2w-worker-c-t86pt Ready worker 84m v1.23.5+012e945 beta.kubernetes.io/arch=amd64,beta.kubernetes.io/instance-type=e2-standard-8,beta.kubernetes.io/os=linux,failure-domain.beta.kubernetes.io/region=us-east1,failure-domain.beta.kubernetes.io/zone=us-east1-c,kubernetes.io/arch=amd64,kubernetes.io/hostname=tg-k8s-openshift-1024-5jz2w-worker-c-t86pt,kubernetes.io/os=linux,node-role.kubernetes.io/worker=,node.kubernetes.io/instance-type=e2-standard-8,node.openshift.io/os_id=rhcos,topology.gke.io/zone=us-east1-c,topology.kubernetes.io/region=us-east1,topology.kubernetes.io/zone=us-east1-c
+tg-k8s-openshift-1024-5jz2w-worker-d-7xv82 Ready worker 84m v1.23.5+012e945 beta.kubernetes.io/arch=amd64,beta.kubernetes.io/instance-type=e2-standard-8,beta.kubernetes.io/os=linux,failure-domain.beta.kubernetes.io/region=us-east1,failure-domain.beta.kubernetes.io/zone=us-east1-d,kubernetes.io/arch=amd64,kubernetes.io/hostname=tg-k8s-openshift-1024-5jz2w-worker-d-7xv82,kubernetes.io/os=linux,node-role.kubernetes.io/worker=,node.kubernetes.io/instance-type=e2-standard-8,node.openshift.io/os_id=rhcos,topology.gke.io/zone=us-east1-d,topology.kubernetes.io/region=us-east1,topology.kubernetes.io/zone=us-east1-d
+```
+
+
+Observing the node configuration, each node is associated with a label: `topology.kubernetes.io/zone=xxx`.
+
+The master node bears the label `topology.kubernetes.io/zone=us-east1-b`, while two worker nodes are marked with `topology.kubernetes.io/zone=us-east1-b`, another two with `topology.kubernetes.io/zone=us-east1-c`, and one worker node with `topology.kubernetes.io/zone=us-east1-d`.
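+
+One concise way to view the zone of each node is to print only that label as a column (`-L` is the standard `kubectl` flag for label columns):
+
+```bash
+# Show every node together with its topology.kubernetes.io/zone label
+kubectl get nodes -L topology.kubernetes.io/zone
+```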
+
+For the allocation of pods across distinct zones, the following affinity can be employed:
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-cluster
+spec:
+ replicas: 3
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: IfNotPresent
+ privateKeyName: ssh-key-secret
+ listener:
+ type: LoadBalancer
+ resources:
+ requests:
+ cpu: 1
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ storageClassName: standard
+ resources:
+ requests:
+ storage: 10G
+ initTGConfig:
+ ha: 1
+ license: YOUR_LICENSE
+ version: 3.9.2
+ hashBucketInBit: 5
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: IfNotPresent
+ affinityConfiguration:
+ affinity:
+ podAntiAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ - labelSelector:
+ matchExpressions:
+ - key: tigergraph.com/cluster-pod
+ operator: In
+ values:
+ - test-cluster
+ topologyKey: topology.kubernetes.io/zone
+```
+
+Upon creating the cluster, the assigned nodes can be observed:
+
+```bash
+NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
+test-cluster-0 0/1 ContainerCreating 0 2s tg-k8s-openshift-1024-5jz2w-worker-d-7xv82
+test-cluster-1 0/1 ContainerCreating 0 2s tg-k8s-openshift-1024-5jz2w-worker-b-w96n6
+test-cluster-2 0/1 ContainerCreating 0 1s tg-k8s-openshift-1024-5jz2w-worker-c-456wl
+```
+
+To elaborate, `tg-k8s-openshift-1024-5jz2w-worker-d-7xv82` corresponds to `us-east1-d`, `tg-k8s-openshift-1024-5jz2w-worker-b-w96n6` is positioned in `us-east1-b`, and `tg-k8s-openshift-1024-5jz2w-worker-c-456wl` is situated in `us-east1-c`.
+
+Toleration
+===========
+
+[Taint and Toleration](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/)
+
+You can put multiple taints on the same node and multiple tolerations on the same pod. The way Kubernetes processes multiple taints and tolerations is like a filter: start with all of a node's taints, then ignore the ones for which the pod has a matching toleration; the remaining un-ignored taints have the indicated effects on the pod. In particular,
+1. if there is at least one un-ignored taint with effect NoSchedule then Kubernetes will not schedule the pod onto that node
+2. if there is no un-ignored taint with effect NoSchedule but there is at least one un-ignored taint with effect PreferNoSchedule then Kubernetes will try to not schedule the pod onto the node
+3. if there is at least one un-ignored taint with effect NoExecute then the pod will be evicted from the node (if it is already running on the node), and will not be scheduled onto the node (if it is not yet running on the node).
+
+To apply taints to our nodes, we can utilize the `kubectl taint` command. With a "NoSchedule" taint, new pods that do not tolerate the taint will not be assigned to the node. With a "PreferNoSchedule" taint, the scheduler will try to avoid placing intolerant pods on the node, but this is not guaranteed. With a "NoExecute" taint, new pods that do not tolerate the taint will not be scheduled onto the node, and already-running pods without a matching toleration will be evicted from it.
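+
+For reference, a taint of the kind used below can be added and removed like this (the node name is a placeholder):
+
+```bash
+# Add a NoExecute taint to a node
+kubectl taint nodes <node-name> userGroup=enterprise:NoExecute
+
+# Remove the same taint again (note the trailing minus sign)
+kubectl taint nodes <node-name> userGroup=enterprise:NoExecute-
+```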
+
+If we specify tolerations in the TigerGraph Custom Resource (CR), TigerGraph pods can tolerate the corresponding taints and therefore become eligible to run on the tainted nodes.
+
+It's important to note that tolerations exclusively affect pods within the TigerGraph cluster. Other pods engaged in init/upgrade/expand/shrink operations will remain unaffected.
+
+Example: Implementing User Groups with Taints and Tolerations
+-----------------------------------------------------------
+
+A practical application of Taints and Tolerations is the establishment of user groups for the exclusive utilization of designated nodes.
+
+To illustrate this, let's take a step-by-step approach:
+
+First, taint three nodes with `userGroup=enterprise:NoExecute`. This action ensures that these nodes are reserved for the designated user group.
+
+```bash
+kubectl taint nodes gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g \
+ gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q \
+ gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-9t5m \
+ userGroup=enterprise:NoExecute
+```
+
+Then create a cluster without toleration:
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-cluster
+spec:
+ replicas: 3
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: IfNotPresent
+ privateKeyName: ssh-key-secret
+ listener:
+ type: LoadBalancer
+ resources:
+ requests:
+ cpu: 4
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ storageClassName: standard
+ resources:
+ requests:
+ storage: 10G
+ initTGConfig:
+ ha: 1
+ license: YOUR_LICENSE
+ version: 3.9.2
+ hashBucketInBit: 5
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: IfNotPresent
+```
+
+Upon deploying the cluster, it becomes evident that all pods are scheduled to nodes devoid of the applied taints. This aligns with the concept of taints and tolerations, where pods are automatically assigned to nodes that do not possess taints that the pods cannot tolerate.
+
+```bash
+NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
+test-cluster-0 0/1 Running 0 14s 10.36.2.19 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-b99l
+test-cluster-1 0/1 Running 0 14s 10.36.1.22 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-wh8g
+test-cluster-2 0/1 ContainerCreating 0 14s gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-lff2
+```
+
+Then we can establish a new cluster configuration with the specified tolerations.
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-toleration
+spec:
+ replicas: 3
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: IfNotPresent
+ privateKeyName: ssh-key-secret
+ listener:
+ type: LoadBalancer
+ resources:
+ requests:
+ cpu: 4
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ storageClassName: standard
+ resources:
+ requests:
+ storage: 10G
+ initTGConfig:
+ ha: 1
+ license: YOUR_LICENSE
+ version: 3.9.2
+ hashBucketInBit: 5
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: IfNotPresent
+ affinityConfiguration:
+ tolerations:
+ - key: "userGroup"
+ operator: "Equal"
+ value: "enterprise"
+ effect: "NoExecute"
+```
+
+By adding tolerations to the configuration, pods of the "test-toleration" cluster can tolerate the "userGroup=enterprise" taint with the "NoExecute" effect, which makes the tainted nodes eligible for scheduling these pods.
+
+Keep in mind that a toleration only permits scheduling onto tainted nodes; it does not guarantee it. In this example, however, the "test-toleration" pods do end up on the tainted nodes, in line with the intended user-group separation.
+
+We can see that the pods belonging to test-toleration are all scheduled to the tainted nodes:
+
+```bash
+NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
+test-cluster-0 1/1 Running 0 3m19s 10.36.2.19 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-b99l
+test-cluster-1 1/1 Running 0 3m19s 10.36.1.22 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-wh8g
+test-cluster-2 1/1 Running 0 3m19s 10.36.0.15 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-lff2
+test-cluster-init-job-kz9hp 0/1 Completed 0 49s 10.36.2.20 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-b99l
+test-toleration-0 0/1 Running 0 55s 10.36.3.16 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-4z0q
+test-toleration-1 0/1 Running 0 55s 10.36.4.6 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-9t5m
+test-toleration-2 1/1 Running 0 55s 10.36.5.23 gke-tg-k8s-gke-1024-default-pool-1e4fbc0f-2p9g
+```
+
+Notice
+=====
+
+* If the `affinityConfiguration` includes a `NodeSelector`, no existing node satisfies it, and the K8S cluster has `auto-scaling` enabled, the cluster will add more nodes in an attempt to satisfy the affinityConfiguration, even though the newly created nodes may not satisfy it either. This can leave you with no suitable node for scheduling the TigerGraph pods while useless nodes have been created. Therefore, it is important to configure the affinityConfiguration against node specifications that actually exist in the cluster.
+
+* If the `affinityConfiguration` includes `pod affinity`, no existing node satisfies it, and the K8S cluster spans `multiple zones` with `auto-scaling` enabled, automatic scale-up may be blocked. This can surface as a message like "2 node(s) had volume node affinity conflict and 1 node(s) didn't match pod affinity rules". The "volume node affinity conflict" message means that the PV is bound to its initial zone, which can prevent K8S from scaling up in a useful way. In this case as well, there may be no suitable node available for scheduling TigerGraph pods.
+
+ ```bash
+ # Pod description
+ Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Warning FailedScheduling 31s default-scheduler 0/1 nodes are available: 1 pod has unbound immediate PersistentVolumeClaims. preemption: 0/1 nodes are available: 1 Preemption is not helpful for scheduling.
+ Warning FailedScheduling 30s default-scheduler 0/1 nodes are available: 1 Insufficient cpu. preemption: 0/1 nodes are available: 1 No preemption victims found for incoming pod.
+ Normal NotTriggerScaleUp 29s cluster-autoscaler pod didn't trigger scale-up: 1 node(s) didn't match pod affinity rules, 2 node(s) had volume node affinity conflict
+
+ # PV description
+ Node Affinity:
+ Required Terms:
+ Term 0: topology.kubernetes.io/region in [us-central1]
+ topology.kubernetes.io/zone in [us-central1-a]
+ ```
+
+* If pod scheduling fails due to limited resources and the cluster then gains enough resources by adding more nodes, a pod may be moved to another node, which can trigger the following error:
+
+ ```bash
+ Warning FailedAttachVolume 45s attachdetach-controller Multi-Attach error for volume "pvc-dcdb2953-b50f-45a9-a5c3-7f7752c36698" Volume is already exclusively attached to one node and can't be attached to another
+ ```
+
+* Based on the factors mentioned, the following conclusions can be drawn:
+
+  1. When creating a TG cluster, it is crucial to configure the affinityConfiguration based on node resources that actually exist, to ensure successful scaling and operation of the cluster.
+
+  2. It is preferable to ensure that suitable nodes for HA exist when the TG cluster is created, rather than updating the cluster later, because the Node Affinity of the PVs may cause the update to fail.
+
+  3. Two common scenarios that can lead to failure are:
+
+     1. In a K8S cluster with multiple zones, node resources may be insufficient. Since the operator relies on Volume Node Affinity for PVs, the pod associated with a PV must be recreated on a node that satisfies the PV's node affinity (typically the original zone), so pod creation can get stuck in the Pending state.
\ No newline at end of file
diff --git a/k8s/docs/03-deploy/configure-affinity-by-kubectl-tg.md b/k8s/docs/03-deploy/configure-affinity-by-kubectl-tg.md
new file mode 100644
index 00000000..278e6c0e
--- /dev/null
+++ b/k8s/docs/03-deploy/configure-affinity-by-kubectl-tg.md
@@ -0,0 +1,294 @@
+ Use Affinity in kubectl-tg plugin
+
+To learn how to use NodeSelector/Affinity/Tolerations in YAML, please read [NodeSelector, Affinity and Toleration Use Cases](./affinity-use-cases.md).
+
+This document covers all of the cases from the document above.
+
+- [Usage Instructions](#usage-instructions)
+ - [Removing Affinity Configuration](#removing-affinity-configuration)
+- [Examples](#examples)
+ - [Scheduling Pods on Nodes with `disktype=ssd` Label](#scheduling-pods-on-nodes-with-disktypessd-label)
+ - [Preferring Pods to be Scheduled on Nodes with `disktype=ssd` Label](#preferring-pods-to-be-scheduled-on-nodes-with-disktypessd-label)
+ - [Combining Multiple Rules with Different Weights](#combining-multiple-rules-with-different-weights)
+ - [Preventing Multiple TigerGraph Pods on the Same VM Instance](#preventing-multiple-tigergraph-pods-on-the-same-vm-instance)
+ - [Require TG pods not to be scheduled to VM instances that is running TG pods belonging to another cluster](#require-tg-pods-not-to-be-scheduled-to-vm-instances-that-is-running-tg-pods-belonging-to-another-cluster)
+ - [Require TG pods not to be scheduled to the same zone](#require-tg-pods-not-to-be-scheduled-to-the-same-zone)
+ - [Implementing User Groups using Taints and Tolerations](#implementing-user-groups-using-taints-and-tolerations)
+ - [See also](#see-also)
+
+Usage Instructions
+=====
+
+To employ affinity within `kubectl-tg`, the procedure involves crafting your affinity rules in a YAML file. Presented below is an exemplary affinity configuration file:
+
+```yaml
+# NodeSelector field.
+# See https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes/
+nodeSelector:
+ disktype: ssd
+# Affinity, include Node Affinity, Pod Affinity and Pod Anti-Affinity
+# See https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes-using-node-affinity/
+# and https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#inter-pod-affinity-and-anti-affinity
+affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: topology.kubernetes.io/zone
+ operator: In
+ values:
+ - antarctica-east1
+ - antarctica-west1
+ podAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ - labelSelector:
+ matchExpressions:
+ - key: security
+ operator: In
+ values:
+ - S1
+ topologyKey: topology.kubernetes.io/zone
+ podAntiAffinity:
+ preferredDuringSchedulingIgnoredDuringExecution:
+ - weight: 100
+ podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: security
+ operator: In
+ values:
+ - S2
+ topologyKey: topology.kubernetes.io/zone
+# Tolerations. See https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/
+tolerations:
+ - key: "example-key"
+ operator: "Exists"
+ effect: "NoSchedule"
+```
+
+Assume that we name our affinity config file as `tg-affinity.yaml`.
+
+Then we can use following command to create a TG cluster with the rules we write:
+
+```bash
+kubectl tg create --cluster-name test-cluster --size 3 --ha 1 --namespace NAMESPACE \
+ --version 3.9.1 --storage-class standard --storage-size 10G \
+ --private-key-secret ssh-key-secret \
+ --affinity tg-affinity.yaml
+```
+
+For an existing cluster, the affinity rules can be updated with this command:
+
+```bash
+kubectl tg update --cluster-name test-cluster --namespace NAMESPACE \
+ --affinity tg-affinity.yaml
+```
+
+Removing Affinity Configuration
+----------------------------------
+
+To eliminate all existing tolerations, affinity rules, and nodeSelectors from your TigerGraph cluster configuration, the process is straightforward. Follow the steps outlined below:
+
+1. **Create an Empty YAML File:**
+
+ Start by generating an empty YAML file. You can create an empty file named `empty.yaml` using the following command:
+
+ ```bash
+ touch empty.yaml
+ ```
+
+2. **Execute Removal Operation:**
+
+ To perform the removal of all tolerations, affinity rules, and nodeSelectors, invoke the `kubectl tg update` command and provide the `--affinity` option with the previously created empty YAML file:
+
+ ```bash
+ kubectl tg update --cluster-name test-cluster --namespace NAMESPACE \
+ --affinity empty.yaml
+ ```
+
+This procedure effectively clears all existing affinity-related configurations, providing a clean slate for your TigerGraph cluster settings. If you wish to retain certain rules while removing others, simply modify your configuration file accordingly and execute the `kubectl tg update` command.
+
+
+Examples
+========
+
+Scheduling Pods on Nodes with `disktype=ssd` Label
+-----------------------------------------------------
+
+To ensure that pods are scheduled exclusively on nodes labeled with `disktype=ssd`, you can utilize the provided affinity configurations. These configurations utilize both Node Selector and Node Affinity approaches. Please note that when employing **required** rules, if an insufficient number of nodes with the desired label are available for scheduling TigerGraph (TG) pods, the pods will remain in a Pending status.
+
+1. **Using Node Selector:**
+ ```yaml
+ nodeSelector:
+ disktype: ssd
+ ```
+
+2. **Using Node Affinity:**
+ ```yaml
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: disktype
+ operator: In
+ values:
+ - ssd
+ ```
+
+With these configurations, TigerGraph pods will be scheduled specifically on nodes bearing the `disktype=ssd` label. However, it's important to be aware that if there are an inadequate number of nodes fulfilling this criterion, the TG pods may become Pending due to the required scheduling rules.
+
+
+Preferring Pods to be Scheduled on Nodes with `disktype=ssd` Label
+--------------------------------------------------------------------
+
+If your objective is to prioritize scheduling pods on nodes labeled with `disktype=ssd`, you can implement the desired behavior using a preferred rule within the affinity configuration. Here's how you can achieve this:
+
+```yaml
+affinity:
+ nodeAffinity:
+ preferredDuringSchedulingIgnoredDuringExecution:
+ - weight: 1
+ preference:
+ matchExpressions:
+ - key: disktype
+ operator: In
+ values:
+ - ssd
+```
+
+With this affinity configuration, the specified weight of 1 signifies a preference for scheduling pods on nodes with the `disktype=ssd` label. However, in scenarios where an insufficient number of nodes possess this label, the pods will still be scheduled on other available nodes that lack the label.
+
+By utilizing this **preferred** rule, you ensure that scheduling attempts prioritize nodes with the desired label, while also allowing for scheduling flexibility to accommodate situations where a limited number of labeled nodes are available. This approach offers a balanced trade-off between preference and availability, optimizing the scheduling behavior of your pods within your Kubernetes cluster.
+
+Combining Multiple Rules with Different Weights
+-----------------------------------------------
+
+When you need to combine multiple affinity rules with varying weights to guide pod scheduling, you can achieve this by utilizing a configuration similar to the one you provided. Here's an example configuration:
+
+```yaml
+affinity:
+ nodeAffinity:
+ preferredDuringSchedulingIgnoredDuringExecution:
+ - weight: 1
+ preference:
+ matchExpressions:
+ - key: disktype
+ operator: In
+ values:
+ - ssd
+ - weight: 50
+ preference:
+ matchExpressions:
+ - key: physical-machine
+ operator: Exists
+```
+
+With this configuration:
+
+1. Pods will be preferentially scheduled to nodes with the `physical-machine=true` label due to the higher weight of 50.
+2. If nodes with the `physical-machine=true` label are not available, the next preference will be for nodes with the `disktype=ssd` label, indicated by a weight of 1.
+
+This approach provides a flexible and versatile way to guide pod scheduling behavior based on the defined affinity rules and their associated weights. It ensures that pods are distributed across nodes according to the specified preferences while accommodating availability constraints.
+
+Preventing Multiple TigerGraph Pods on the Same VM Instance
+------------------------------------------------------------
+
+To ensure that no more than one TigerGraph pod is scheduled on the same VM instance, you can employ a `podAntiAffinity` configuration. This rule helps distribute TigerGraph pods across different VM instances, thus avoiding overloading a single instance. Here's how you can achieve this:
+
+1. **Identify the VM Instance Label:**
+
+ First, ascertain the label that designates the VM instance where a node is running. In the case of GKE, the label is `kubernetes.io/hostname=xxx`.
+
+2. **Apply Affinity Configuration:**
+
+ Utilize the following affinity configuration in your deployment:
+
+```yaml
+affinity:
+ podAntiAffinity:
+ preferredDuringSchedulingIgnoredDuringExecution:
+ - weight: 100
+ podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: tigergraph.com/cluster-name
+ operator: In
+ values:
+ - test-cluster
+ topologyKey: kubernetes.io/hostname
+```
+
+With this configuration:
+
+- The rule establishes that TigerGraph pods should not be scheduled onto a VM instance where another TigerGraph pod from the same cluster is already running.
+- The topologyKey `kubernetes.io/hostname` ensures that the affinity rule considers the VM instance label.
+
+This approach effectively prevents the overloading of a single VM instance by ensuring that TigerGraph pods are distributed across different VM instances, while still accommodating availability constraints.
+
+Please note that because this is a preferred rule, if there are not enough nodes available, multiple TigerGraph pods may still be scheduled on the same VM instance. The rule is designed to minimize such co-location and optimize distribution across VM instances.
+
+Require TG pods not to be scheduled to VM instances that is running TG pods belonging to another cluster
+--------------------------------------------------------------------------------------------------------
+
+```yaml
+affinity:
+ podAntiAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ - labelSelector:
+ matchExpressions:
+ - key: tigergraph.com/cluster-pod
+ operator: Exists
+ topologyKey: kubernetes.io/hostname
+```
+
+Require TG pods not to be scheduled to the same zone
+----------------------------------------------------
+
+```yaml
+affinity:
+ podAntiAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ - labelSelector:
+ matchExpressions:
+ - key: tigergraph.com/cluster-pod
+ operator: In
+ values:
+ - test-cluster
+ topologyKey: topology.kubernetes.io/zone
+```
+
+Implementing User Groups using Taints and Tolerations
+-------------------------------------------------------
+
+To establish user groups and control pod scheduling based on taints and tolerations, follow these steps:
+
+1. **Taint Nodes:**
+
+ Begin by tainting specific nodes with the label `userGroup=enterprise` and the effect `NoExecute` using the following command:
+
+ ```bash
+   kubectl taint nodes <node-name-1> <node-name-2> <node-name-3> userGroup=enterprise:NoExecute
+ ```
+
+2. **Pod Configuration:**
+
+ To ensure that only pods with the appropriate tolerations are scheduled on the tainted nodes, include the following tolerations configuration in your pod specification:
+
+ ```yaml
+ tolerations:
+ - key: "userGroup"
+ operator: "Equal"
+ value: "enterprise"
+ effect: "NoExecute"
+ ```
+
+ This configuration specifies that the pods should tolerate the taint with the label `userGroup=enterprise` and the effect `NoExecute`, allowing them to be scheduled on the tainted nodes.
+
+By following these steps, you can successfully implement user groups using taints and tolerations. Only pods that adhere to the defined toleration rules will be scheduled on the nodes tainted with the `userGroup=enterprise` label and `NoExecute` effect, allowing you to control and segregate pod scheduling based on user groups.
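+
+If you want to double-check which nodes currently carry the taint, one option is:
+
+```bash
+# Print the taints configured on each node
+kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints
+```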
+
+## See also
+
+If you are interested in learning how to use and configure Pod affinity with YAML resources, please refer to the following documentation:
+
+- [NodeSelector, Affinity and Toleration Use Cases](../03-deploy/affinity-use-cases.md)
diff --git a/k8s/docs/03-deploy/custom-containers.md b/k8s/docs/03-deploy/custom-containers.md
new file mode 100644
index 00000000..a93af20e
--- /dev/null
+++ b/k8s/docs/03-deploy/custom-containers.md
@@ -0,0 +1,249 @@
+InitContainers, SidecarContainers and CustomVolumes
+
+- [Basic knowledge](#basic-knowledge)
+- [Sidecar Containers](#sidecar-containers)
+- [Init Containers](#init-containers)
+- [Custom Volumes](#custom-volumes)
+- [Combining sidecarContainers, initContainers, and customVolumes](#combining-sidecarcontainers-initcontainers-and-customvolumes)
+- [What's Next](#whats-next)
+
+
+Basic knowledge
+===============
+A K8s Pod has the capability to house multiple containers, including both init containers and app containers. Upon pod creation, the init containers execute sequentially in a designated order. Should any of the init containers encounter a failure, the overall pod execution is halted (for more insights, consult [Init Containers](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/)). Following the successful completion of all init containers, the app containers proceed to run concurrently.
+
+By default, in the configuration of the TigerGraph CR, each TigerGraph Pod features a singular app container named "tigergraph". This container runs all TigerGraph services within the Pod. The functionality "InitContainers,SidecarContainers and CustomVolumes" empowers users to seamlessly integrate personalized initContainers and sidecarContainers into TigerGraph Pods. Furthermore, users can create customVolumes, enabling the mounting of these volumes within their initContainers or sidecarContainers.
+
+> [!NOTE]
+> You can utilize this feature by adding configurations in a YAML file or through `kubectl-tg`. This document exclusively focuses on the usage within YAML files. If you're interested in learning how to use it with `kubectl-tg`, please consult the guide on [Utilizing InitContainers, Sidecar Containers, and Custom Volumes with kubectl-tg](./use-custom-containers-by-kubectl-tg.md).
+
+Sidecar Containers
+=================
+
+A sidecar container functions similarly to the app container named "tigergraph". In cases where the sidecar container requires readiness and liveness checks configuration, it is crucial to ensure that these checks do not interfere with the rolling update process of TigerGraph (TG) pods. Simultaneously, adopting the practice of setting resource limits for each sidecar container within the TG pod is recommended to prevent the excessive use of Kubernetes node resources.
+
+
+To integrate sidecarContainers into TigerGraph Pods, write the configurations in `.spec.sidecarContainers`. For detailed guidance on setting up sidecarContainers, consult the [K8S Containers](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#Container):
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-cluster
+spec:
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: IfNotPresent
+ imagePullSecrets:
+ - name: tigergraph-image-pull-secret
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: IfNotPresent
+ imagePullSecrets:
+ - name: tigergraph-image-pull-secret
+ initTGConfig:
+ ha: 1
+ license: YOUR_LICENSE_HERE
+ version: 3.9.2
+ listener:
+ type: LoadBalancer
+ privateKeyName: ssh-key-secret
+ replicas: 1
+ resources:
+ requests:
+ cpu: "8"
+ memory: 16Gi
+ sidecarContainers:
+ - args: # sidecar will execute this
+ - /bin/sh
+ - -c
+ - |
+ while true; do
+ echo "$(date) INFO hello from main-container" >> /var/log/myapp.log ;
+ sleep 1;
+ done
+ image: alpine:3.17.2
+ name: main-container # name of sidecar
+ readinessProbe: # check if the sidecar is ready
+ exec:
+ command:
+ - sh
+ - -c
+ - if [[ -f /var/log/myapp.log ]];then exit 0; else exit 1;fi
+ initialDelaySeconds: 10
+ periodSeconds: 5
+ resources:
+ requests: # request resouces for sidecar
+ cpu: 2
+ memory: 1Gi
+ limits: # limit resources
+ cpu: 4
+ memory: 4Gi
+ env: # inject the environment you need
+ - name: CLUSTER_NAME
+ value: test-cluster
+ volumeMounts:
+ - mountPath: /var/log
+ name: tg-log # this volume is used by TG, you can access log of tg here
+ # securityContext: # configure securityContext here
+ # privileged: true
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ accessModes:
+ - ReadWriteOnce
+ resources:
+ requests:
+ storage: 10G
+ storageClassName: standard
+ volumeMode: Filesystem
+```
+
+Init Containers
+=====
+To incorporate custom initContainers into TigerGraph Pods, place the configuration details within `.spec.initContainers` field. For detailed instructions on setting up initContainers, you can refer to the [K8S Containers API](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#Container). Your personalized initContainers will execute once the TG initContainer finishes its tasks.
+
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-cluster
+spec:
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: IfNotPresent
+ imagePullSecrets:
+ - name: tigergraph-image-pull-secret
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: IfNotPresent
+ imagePullSecrets:
+ - name: tigergraph-image-pull-secret
+ initTGConfig:
+ ha: 1
+ license: YOUR_LICENSE_HERE
+ version: 3.9.2
+ listener:
+ type: LoadBalancer
+ privateKeyName: ssh-key-secret
+ replicas: 1
+ resources:
+ requests:
+ cpu: "8"
+ memory: 16Gi
+ initContainers:
+ - image: alpine:3.17.2
+ name: init-hello
+ args:
+ - /bin/sh
+ - -c
+ - echo hello
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ accessModes:
+ - ReadWriteOnce
+ resources:
+ requests:
+ storage: 10G
+ storageClassName: standard
+ volumeMode: Filesystem
+```
+
+Custom Volumes
+=============
+Incorporating initContainers and sidecarContainers with customVolumes facilitates seamless data exchange. For defining customVolumes, direct your configurations to the `.spec.customVolumes` field. To understand the essential fields of customVolumes, consult the [Kubernetes Volumes documentation](https://kubernetes.io/docs/concepts/storage/volumes/)
+
+By default, the Operator establishes two volumes: `tg-data` for persistent TG cluster data and `tg-log` for TG logs storage. In your sidecar containers, you can mount the volume named `tg-log` to access TG logs, or mount `tg-data` to access TG data.
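+
+For instance, a sidecar that only inspects the TigerGraph data volume could mount it like this (a minimal sketch; the container name, mount path, and command are illustrative, not part of the default CR):
+
+```yaml
+  sidecarContainers:
+  - name: data-inspector              # illustrative sidecar
+    image: alpine:3.17.2
+    args:
+    - /bin/sh
+    - -c
+    - while true; do du -sh /tg-data; sleep 60; done   # periodically report data-volume usage
+    volumeMounts:
+    - name: tg-data                   # volume created by the Operator for persistent TG data
+      mountPath: /tg-data
+```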
+
+Combining sidecarContainers, initContainers, and customVolumes
+=====
+The following example demonstrates the integration of sidecarContainers and initContainers while facilitating data exchange through customVolumes. The init container creates a file in the `credentials` volume, which the sidecar named `main-container` subsequently uses for its readiness check. The `main-container` sidecar also writes to the file `/var/log/myapp.log`, which is accessible to the `sidecar-container` because they share the customVolume named `log`.
+
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-cluster
+spec:
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: IfNotPresent
+ imagePullSecrets:
+ - name: tigergraph-image-pull-secret
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: IfNotPresent
+ imagePullSecrets:
+ - name: tigergraph-image-pull-secret
+ initTGConfig:
+ ha: 1
+ license: YOUR_LICENSE_HERE
+ version: 3.9.2
+ listener:
+ type: LoadBalancer
+ privateKeyName: ssh-key-secret
+ replicas: 1
+ resources:
+ requests:
+ cpu: "8"
+ memory: 16Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ accessModes:
+ - ReadWriteOnce
+ resources:
+ requests:
+ storage: 10G
+ storageClassName: standard
+ volumeMode: Filesystem
+ initContainers:
+ - image: alpine:3.17.2
+ name: init-credential
+ args:
+ - /bin/sh
+ - -c
+ - echo CREDENTIAL > /credentials/auth_file
+ volumeMounts:
+ - name: credentials
+ mountPath: /credentials
+
+ sidecarContainers:
+ - image: alpine:3.17.2
+ name: main-container
+ args:
+ - /bin/sh
+ - -c
+ - while true; do echo "$(date) INFO hello from main-container" >> /var/log/myapp.log ;sleep 1;done
+ volumeMounts:
+ - name: credentials
+ mountPath: /credentials
+ - name: log
+ mountPath: /var/log
+ readinessProbe:
+ exec:
+ command:
+ - sh
+ - -c
+ - if [[ -f /credentials/auth_file ]];then exit 0; else exit 1;fi
+ initialDelaySeconds: 10
+ periodSeconds: 5
+ - name: sidecar-container
+ image: alpine:3.17.2
+ args:
+ - /bin/sh
+ - -c
+ - tail -fn+1 /var/log/myapp.log
+ volumeMounts:
+ - name: log
+ mountPath: /var/log
+ customVolumes:
+ - name: log
+ emptyDir: {}
+ - name: credentials
+ emptyDir: {}
+```
+
+What's Next
+====
+* Learn [how to integrate envoy sidecar containers with TG Pods](../07-reference/integrate-envoy-sidecar.md)
\ No newline at end of file
diff --git a/k8s/docs/03-deploy/deploy-without-internet.md b/k8s/docs/03-deploy/deploy-without-internet.md
new file mode 100644
index 00000000..e0ffc2a1
--- /dev/null
+++ b/k8s/docs/03-deploy/deploy-without-internet.md
@@ -0,0 +1,302 @@
+How to install Operator and deploy TG on K8s without internet access
+
+Prerequisites
+=============
+
+* Docker
+
+* Private Docker registry
+
+* Private helm repo
+
+
+Procedure
+=========
+
+Transferring Docker Images and Helm Chart Package
+--------------------------------------------------
+
+Please ensure that your environment has internet access before proceeding with the download of these docker images and helm chart packages.
+
+For illustrative purposes, we will utilize TG cluster version 3.9.2 and TG K8s Operator version 0.0.7. Kindly make the necessary adjustments based on your specific version.
+
+### TigerGraph Operator
+
+* Docker images
+
+
+1. tigergraph/tigergraph-k8s:3.9.2
+
+2. tigergraph/tigergraph-k8s-operator:0.0.7
+
+3. tigergraph/tigergraph-k8s-init:0.0.7
+
+
+```bash
+docker pull tigergraph/tigergraph-k8s:3.9.2
+docker pull tigergraph/tigergraph-k8s-operator:0.0.7
+docker pull tigergraph/tigergraph-k8s-init:0.0.7
+
+docker save tigergraph/tigergraph-k8s:3.9.2 tigergraph/tigergraph-k8s-operator:0.0.7 tigergraph/tigergraph-k8s-init:0.0.7 > tigergraph-operator-images.tar
+
+# copy the docker images tar files to your target machine before loading
+docker load < tigergraph-operator-images.tar
+# replace it with your private DOCKER_REPO
+export DOCKER_REPO=docker.io/internal
+docker tag tigergraph/tigergraph-k8s:3.9.2 ${DOCKER_REPO}/tigergraph-k8s:3.9.2
+docker tag tigergraph/tigergraph-k8s-operator:0.0.7 ${DOCKER_REPO}/tigergraph-k8s-operator:0.0.7
+docker tag tigergraph/tigergraph-k8s-init:0.0.7 ${DOCKER_REPO}/tigergraph-k8s-init:0.0.7
+
+# push them to your private docker repo
+docker push ${DOCKER_REPO}/tigergraph-k8s:3.9.2
+docker push ${DOCKER_REPO}/tigergraph-k8s-operator:0.0.7
+docker push ${DOCKER_REPO}/tigergraph-k8s-init:0.0.7
+```
+
+* Helm chart package (private helm repo required)
+
+If the goal is to install the operator using kubectl-tg, having a private Helm repository is crucial. If such a repository is unavailable and you aim to install an operator without internet connectivity, refer to the section that outlines the procedure for installing the Helm chart locally.
+
+```bash
+# Download the helm chart package from the TG public repo
+curl https://dl.tigergraph.com/charts/tg-operator-0.0.7.tgz -o tg-operator-0.0.7.tgz
+
+# If you do not have a private helm repo yet, you can run a local chartmuseum instance, for example:
+# mkdir -p /tmp/charts
+# chmod 0777 /tmp/charts
+# docker run -d \
+# -p 8383:8080 \
+# -e DEBUG=1 \
+# -e STORAGE=local \
+# -e STORAGE_LOCAL_ROOTDIR=/charts \
+# -v /tmp/charts:/charts \
+# --name ${helm_repo_name} ghcr.io/helm/chartmuseum:v0.13.1
+# replace the HELM_REPO to your own one, the following steps will take chartmuseum as an example.
+export HELM_REPO=http://127.0.0.1:8383
+export VERSION=0.0.7
+# upload the helm chart package to your private helm repo
+curl --request DELETE ${HELM_REPO}/api/charts/tg-operator/${VERSION}
+curl --data-binary "@charts/tg-operator-${VERSION}.tgz" ${HELM_REPO}/api/charts
+```
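+
+Optionally, you can verify that the chart is now served by your private Helm repo; the repo alias `tg-private` below is just an example name:
+
+```bash
+helm repo add tg-private ${HELM_REPO}
+helm repo update
+helm search repo tg-operator
+```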
+
+* Install the Operator and deploy the TG cluster with the private Docker registry and Helm repo (see the sections below)
+
+
+### Cert-manager
+
+The following examples assume you are using cert-manager version 1.8.0.
+
+* Transferring the cert-manager Docker images to your private Docker registry
+
+
+```bash
+# curl https://github.com/cert-manager/cert-manager/releases/download/v1.8.0/cert-manager.yaml
+# quay.io/jetstack/cert-manager-cainjector:v1.8.0
+# quay.io/jetstack/cert-manager-controller:v1.8.0
+# quay.io/jetstack/cert-manager-webhook:v1.8.0
+docker pull quay.io/jetstack/cert-manager-cainjector:v1.8.0
+docker pull quay.io/jetstack/cert-manager-controller:v1.8.0
+docker pull quay.io/jetstack/cert-manager-webhook:v1.8.0
+
+docker save quay.io/jetstack/cert-manager-cainjector:v1.8.0 quay.io/jetstack/cert-manager-controller:v1.8.0 quay.io/jetstack/cert-manager-webhook:v1.8.0 > cert-manager-images.tar
+
+# copy the docker images tar files to your target machine before loading
+docker load < cert-manager-images.tar
+
+# replace this with your private DOCKER_REPO
+export DOCKER_REPO=docker.io/internal
+docker tag quay.io/jetstack/cert-manager-cainjector:v1.8.0 ${DOCKER_REPO}/cert-manager-cainjector:v1.8.0
+docker tag quay.io/jetstack/cert-manager-controller:v1.8.0 ${DOCKER_REPO}/cert-manager-controller:v1.8.0
+docker tag quay.io/jetstack/cert-manager-webhook:v1.8.0 ${DOCKER_REPO}/cert-manager-webhook:v1.8.0
+
+# push them to your private docker repo
+docker push ${DOCKER_REPO}/cert-manager-cainjector:v1.8.0
+docker push ${DOCKER_REPO}/cert-manager-controller:v1.8.0
+docker push ${DOCKER_REPO}/cert-manager-webhook:v1.8.0
+```
+
+* Modify the manifests of cert-manager according to your docker registry
+
+
+```bash
+curl -L "https://github.com/cert-manager/cert-manager/releases/download/v1.8.0/cert-manager.yaml" -o "cert-manager.yaml"
+
+# Edit cert-manager.yaml and replace the cert-manager image references with your private registry, for example:
+sed -i "s#quay.io/jetstack/cert-manager-cainjector:v1.8.0#${DOCKER_REPO}/cert-manager-cainjector:v1.8.0#g" cert-manager.yaml
+sed -i "s#quay.io/jetstack/cert-manager-controller:v1.8.0#${DOCKER_REPO}/cert-manager-controller:v1.8.0#g" cert-manager.yaml
+sed -i "s#quay.io/jetstack/cert-manager-webhook:v1.8.0#${DOCKER_REPO}/cert-manager-webhook:v1.8.0#g" cert-manager.yaml
+
+# Install cert-manager using the images from your private Docker registry
+kubectl apply -f cert-manager.yaml
+```
+
+**Install Operator with kubectl-tg**
+
+If your Docker registry requires authentication, you can specify a custom secret name using the `--image-pull-secret` option. The default secret name is `tigergraph-image-pull-secret`.
+
+You must also create the image pull secret in the target namespace before deploying your TG cluster.
+
+```bash
+# please make sure the HELM_REPO and DOCKER_REPO is correct
+export HELM_REPO=http://127.0.0.1:8383
+export DOCKER_REPO=docker.io/internal
+kubectl tg init --namespace tigergraph --helm-repo ${HELM_REPO} --image-pull-secret yoursecret --docker-registry ${DOCKER_REPO}
+```
+
+**Install Operator locally with the helm command**
+
+Please follow these steps to install a Helm chart:
+
+1. Download the Helm chart you want to install.
+
+2. Extract the chart to a directory on your local machine.
+
+3. Open a terminal window and navigate to the directory where you extracted the chart.
+
+4. Modify the default configuration of Operator by editing `values.yaml`.
+
+5. Run the following command to install the chart:
+
+
+Customize the Operator configuration in `values.yaml`; in particular, change the `image` field to point to your internal Docker repository.
+
+```yaml
+# Default values for tg-operator.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+# Default values for deployment replicas of operator
+replicas: 3
+# image is the docker image of operator
+image: tigergraph/tigergraph-k8s-operator:0.0.7
+pullPolicy: IfNotPresent
+# imagePullSecret is the docker image pull secret of operator
+imagePullSecret: tigergraph-image-pull-secret
+# watchNameSpaces are the namespaces the operator watches; separate multiple namespaces with commas, leave empty to watch all namespaces
+watchNameSpaces: ""
+# clusterScope is whether the operator has ClusterRole
+clusterScope: true
+# resources are the resource requests configuration of the operator
+resources:
+ requests:
+ cpu: 1000m
+ memory: 1024Mi
+ limits:
+ cpu: 2000m
+ memory: 4096Mi
+```
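+
+For example, assuming the private registry used earlier (`docker.io/internal`), the relevant `values.yaml` fields would look roughly like this:
+
+```yaml
+# Point the operator image at your internal registry
+image: docker.io/internal/tigergraph-k8s-operator:0.0.7
+pullPolicy: IfNotPresent
+# Secret used to pull the image from the private registry
+imagePullSecret: tigergraph-image-pull-secret
+```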
+
+Install Operator using helm
+
+```bash
+curl https://dl.tigergraph.com/charts/tg-operator-0.0.7.tgz -o tg-operator-0.0.7.tgz
+tar xvf tg-operator-0.0.7.tgz
+
+$ cd tg-operator
+$ tree
+.
+├── Chart.yaml
+├── crds
+│   └── tg-operator-crd.yaml
+├── templates
+│   ├── NOTES.txt
+│   ├── _helpers.tpl
+│   └── tg-operator.yaml
+└── values.yaml
+
+# before you install the Operator, you may need to modify the configuration in values.yaml,
+# such as docker images, replicas, watch namespaces, resource limits, and so on
+$ helm install tg-operator ./tg-operator -n tigergraph
+NAME: tg-operator
+LAST DEPLOYED: Tue Mar 14 06:58:41 2023
+NAMESPACE: tigergraph
+STATUS: deployed
+REVISION: 1
+TEST SUITE: None
+NOTES:
+
+$ helm list -n tigergraph
+NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
+tg-operator tigergraph 1 2023-03-14 06:58:41.849883727 +0000 UTC deployed tg-operator-0.0.7
+
+# make sure the deployment of Operator is running now
+kubectl get pods -n tigergraph
+NAME READY STATUS RESTARTS AGE
+tigergraph-operator-controller-manager-7cfc4476c7-692r4 2/2 Running 0 5m8s
+tigergraph-operator-controller-manager-7cfc4476c7-76msk 2/2 Running 0 5m8s
+tigergraph-operator-controller-manager-7cfc4476c7-k8425 2/2 Running 0 5m8s
+```
+
+**Deploy TG cluster**
+
+If your Docker registry requires authentication, you need to create an image pull secret. Adjust the namespace to match your environment.
+
+Here's a secret definition you can use as a reference:
+
+```yaml
+apiVersion: v1
+data:
+ .dockerconfigjson: ******************************************
+kind: Secret
+metadata:
+ name: tigergraph-image-pull-secret
+ namespace: tigergraph
+type: kubernetes.io/dockerconfigjson
+```
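+
+Alternatively, you can generate an equivalent secret imperatively; the server, username, and password values below are placeholders:
+
+```bash
+kubectl create secret docker-registry tigergraph-image-pull-secret \
+  --docker-server=docker.io \
+  --docker-username=<your-username> \
+  --docker-password=<your-password> \
+  --namespace tigergraph
+```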
+
+Create a private SSH key secret (required for Operator 0.0.7 and later):
+
+```bash
+echo -e 'y\n' | ssh-keygen -b 4096 -t rsa -f $HOME/.ssh/tigergraph_rsa -q -N ''
+
+kubectl create secret generic ssh-key-secret --from-file=private-ssh-key=$HOME/.ssh/tigergraph_rsa --from-file=public-ssh-key=$HOME/.ssh/tigergraph_rsa.pub --namespace tigergraph
+```
+
+Deploy the TG cluster with a specific Docker registry:
+
+```bash
+# please make sure the DOCKER_REGISTRY is correct, the following is just an example.
+export DOCKER_REGISTRY=docker.io/internal
+kubectl tg create --cluster-name test001 --namespace tigergraph --private-key-secret ssh-key-secret --docker-registry ${DOCKER_REGISTRY} \
+  -s 6 --ha 2 --version TG_CLUSTER_VERSION \
+  --storage-class YOUR_STORAGE_CLASS_NAME --storage-size 100G
+```
+
+Alternatively, you can modify the TigerGraph CR manifest and apply it to deploy the TG cluster directly:
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-cluster
+spec:
+ image: docker.io/tginternal/tigergraph-k8s:3.9.2
+ imagePullPolicy: Always
+ imagePullSecrets:
+ - name: tigergraph-image-pull-secret
+ initJob:
+ image: docker.io/tginternal/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: Always
+ imagePullSecrets:
+ - name: tigergraph-image-pull-secret
+ initTGConfig:
+ ha: 2
+ license: xxxxxxxxxxxx
+ version: 3.9.2
+ listener:
+ type: LoadBalancer
+ privateKeyName: ssh-key-secret
+ replicas: 3
+ resources:
+ requests:
+ cpu: "2"
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ accessModes:
+ - ReadWriteOnce
+ resources:
+ requests:
+ storage: 100G
+ storageClassName: standard
+ volumeMode: Filesystem
+```
\ No newline at end of file
diff --git a/k8s/docs/03-deploy/tigergraph-on-eks.md b/k8s/docs/03-deploy/tigergraph-on-eks.md
new file mode 100644
index 00000000..3a15b5c7
--- /dev/null
+++ b/k8s/docs/03-deploy/tigergraph-on-eks.md
@@ -0,0 +1,473 @@
+# Deploy TigerGraph on AWS EKS
+
+This user manual provides detailed instructions on deploying a TigerGraph cluster on AWS EKS (Elastic Kubernetes Service).
+
+- [Prerequisites](#prerequisites)
+- [Deploy TigerGraph Operator](#deploy-tigergraph-operator)
+ - [Install cert-manager for EKS](#install-cert-manager-for-eks)
+ - [Install the kubectl-tg Plugin](#install-the-kubectl-tg-plugin)
+ - [Optional: Install CRDs Independently](#optional-install-crds-independently)
+ - [Install TigerGraph Operator](#install-tigergraph-operator)
+- [Deploy a TigerGraph Cluster](#deploy-a-tigergraph-cluster)
+ - [Providing a Private SSH Key Pair for Enhanced Security](#providing-a-private-ssh-key-pair-for-enhanced-security)
+ - [Specify the StorageClass Name](#specify-the-storageclass-name)
+ - [Create a TigerGraph Cluster with Specific Options](#create-a-tigergraph-cluster-with-specific-options)
+- [Connect to a TigerGraph Cluster](#connect-to-a-tigergraph-cluster)
+ - [Connect to a TigerGraph Cluster Pod](#connect-to-a-tigergraph-cluster-pod)
+ - [Access TigerGraph Suite](#access-tigergraph-suite)
+ - [Access RESTPP API Service](#access-restpp-api-service)
+- [Upgrade a TigerGraph Cluster](#upgrade-a-tigergraph-cluster)
+- [Scale a TigerGraph Cluster](#scale-a-tigergraph-cluster)
+- [Update Resources (CPU and Memory) of the TigerGraph Cluster](#update-resources-cpu-and-memory-of-the-tigergraph-cluster)
+- [Destroy the TigerGraph Cluster and the Kubernetes Operator](#destroy-the-tigergraph-cluster-and-the-kubernetes-operator)
+ - [Destroy the TigerGraph Cluster](#destroy-the-tigergraph-cluster)
+ - [Uninstall TigerGraph Operator](#uninstall-tigergraph-operator)
+ - [Uninstall the Custom Resource Definitions (CRDs)](#uninstall-the-custom-resource-definitions-crds)
+- [See also](#see-also)
+
+## Prerequisites
+
+Before proceeding with the deployment, make sure you have the following prerequisites in place:
+
+- [Helm](https://helm.sh/docs/intro/install/) installed, with a version equal to or greater than 3.7.0. The TigerGraph Kubernetes Operator is packaged as a Helm chart and requires Helm for installation.
+
+- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) installed, with a version equal to or greater than 1.23. The `kubectl-tg` plugin depends on `kubectl` for managing Kubernetes clusters.
+
+- [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) installed with the latest version. This will be used to install the EBS CSI driver `aws-ebs-csi-driver` if necessary.
+
+- An existing [EKS cluster](https://docs.aws.amazon.com/eks/latest/userguide/create-cluster.html) with admin role permissions.
+
+## Deploy TigerGraph Operator
+
+To deploy the TigerGraph Operator, follow these steps:
+
+### Install cert-manager for EKS
+
+The TigerGraph Operator uses the Admission Webhooks feature and relies on [cert-manager](https://github.com/jetstack/cert-manager) for provisioning certificates for the webhook server.
+
+Admission webhooks are HTTP callbacks that receive admission requests and act on them. They are registered with Kubernetes and are called to validate or mutate a resource before it is stored.
+
+Follow these commands to install cert-manager:
+
+> [!WARNING]
+> Please check whether cert-manager has already been installed before executing the following commands.
+
+```bash
+kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.8.0/cert-manager.yaml
+# Verify installation of cert-manager
+kubectl wait deployment -n cert-manager cert-manager --for condition=Available=True --timeout=90s
+kubectl wait deployment -n cert-manager cert-manager-cainjector --for condition=Available=True --timeout=90s
+kubectl wait deployment -n cert-manager cert-manager-webhook --for condition=Available=True --timeout=90s
+```
+
+### Install the kubectl-tg Plugin
+
+The `kubectl-tg` plugin allows you to deploy and manage the Operator and TigerGraph clusters imperatively. Before installing the plugin, ensure the following requirements are met:
+
+- [helm](https://helm.sh/docs/helm/helm_install/) with a version equal to or greater than 3.7.0.
+- [jq](https://jqlang.github.io/jq/download/) with a version equal to or greater than 1.6.
+- [yq](https://github.com/mikefarah/yq) with a version equal to or greater than 4.18.1.
+
+Here's an example of installing the latest kubectl-tg; you can replace `latest` with your desired version, such as 0.0.9:
+
+```bash
+wget https://dl.tigergraph.com/k8s/latest/kubectl-tg -O kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+```
+
+To check the `kubectl-tg` version, use:
+
+```bash
+kubectl tg version
+```
+
+For help information, use:
+
+```bash
+kubectl tg help
+```
+
+### Optional: Install CRDs Independently
+
+This step is optional and can be skipped if you have privileged permissions in your Kubernetes environment. The necessary CustomResourceDefinitions (CRDs) are automatically installed during the Operator installation. If you prefer to install CRDs independently, use the following command:
+
+```bash
+kubectl apply -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml
+```
+
+### Install TigerGraph Operator
+
+To simplify the Operator installation and TigerGraph cluster deployment, define environment variables:
+
+```bash
+export YOUR_NAMESPACE="tigergraph"
+export YOUR_CLUSTER_NAME="test-tg-cluster"
+export YOUR_SSH_KEY_SECRET_NAME="ssh-key-secret"
+```
+
+Now, you can install the TigerGraph Operator using the following commands:
+
+A namespace-scoped operator watches and manages resources in a single Namespace, whereas a cluster-scoped operator watches and manages resources cluster-wide.
+
+- For a namespace-scoped Operator:
+
+ ```bash
+ kubectl tg init --cluster-scope false --namespace ${YOUR_NAMESPACE}
+ ```
+
+- For a cluster-scoped Operator (default behavior):
+
+ ```bash
+  # This is the default behavior if you don't specify the --cluster-scope option.
+ kubectl tg init --cluster-scope true --namespace ${YOUR_NAMESPACE}
+ ```
+
+- For custom installation options:
+
+ You can customize the installation by specifying options like the Operator version, deployment size, CPU, memory, and the namespace to watch, among others. Here's an example:
+
+ ```bash
+ kubectl tg init --cluster-scope false --version ${OPERATOR_VERSION} --operator-size 3 --operator-watch-namespace ${YOUR_NAMESPACE} --operator-cpu 1000m --operator-memory 1024Mi --namespace ${YOUR_NAMESPACE}
+ ```
+
+  For a comprehensive list of options, refer to the output of the `kubectl tg init --help` command.
+
+To verify the successful deployment of the Operator, use the following command:
+
+ ```bash
+ kubectl wait deployment tigergraph-operator-controller-manager --for condition=Available=True --timeout=120s -n ${YOUR_NAMESPACE}
+ ```
+
+## Deploy a TigerGraph Cluster
+
+This section explains how to deploy a TigerGraph cluster on EKS using the `kubectl-tg` plugin and a CR (Custom Resource) YAML manifest.
+
+### Providing a Private SSH Key Pair for Enhanced Security
+
+Starting from Operator version 0.0.4, users are required to provide their private SSH key pair for enhanced security before creating a cluster. Follow these steps:
+
+- Step 1: create a private SSH key pair file:
+
+ ```bash
+ echo -e 'y\n' | ssh-keygen -b 4096 -t rsa -f $HOME/.ssh/tigergraph_rsa -q -N ''
+ ```
+
+- Step 2: create a Secret Object
+
+ > [!IMPORTANT]
+ > The namespace of the Secret object must be the same as that of the TigerGraph cluster.
+
+ Create a secret object based on the private SSH key file generated in step 1. Ensure that the key name of the secret for the private SSH key is `private-ssh-key`, and the key name for the public SSH key is `public-ssh-key`. Do not alter these key names:
+
+ ```bash
+ kubectl create secret generic ${YOUR_SSH_KEY_SECRET_NAME} --from-file=private-ssh-key=$HOME/.ssh/tigergraph_rsa --from-file=public-ssh-key=$HOME/.ssh/tigergraph_rsa.pub --namespace ${YOUR_NAMESPACE}
+ ```
+
+  For Operator versions 0.0.4 and above, when creating a cluster using the `kubectl tg create` command, you must set the `--private-key-secret` option to `${YOUR_SSH_KEY_SECRET_NAME}`.
+
+These steps enhance the security of your cluster by utilizing your private SSH key pair.
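+
+As a quick optional check, you can confirm that the secret contains both expected key names:
+
+```bash
+kubectl describe secret ${YOUR_SSH_KEY_SECRET_NAME} -n ${YOUR_NAMESPACE}
+# The Data section should list private-ssh-key and public-ssh-key.
+```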
+
+### Specify the StorageClass Name
+
+Before creating the TigerGraph cluster with the Operator, specify the StorageClass, which defines the various storage options. You can identify the name of the StorageClass with the following command:
+
+> [!NOTE]
+> Here the dynamic persistent volume storage is provided by EKS by default, if you want to use static persistent volume or use them from scratch, please refer to [How to use static & dynamic persistent volume storage](../07-reference/static-and-dynamic-persistent-volume-storage.md).
+
+```bash
+kubectl get storageclass
+
+NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
+gp2 (default) kubernetes.io/aws-ebs Delete WaitForFirstConsumer false 62m
+```
+
+Choose the appropriate StorageClass (e.g., `gp2`) when creating the TigerGraph cluster, ensuring optimized storage provisioning and management.
+
+> [!WARNING]
+> For specific EKS versions, you may encounter the following problem: TigerGraph Pods stay in the Pending state because their PVCs are stuck in the Pending state.
+
+- TigerGraph Pod status
+
+ ```bash
+ kubectl get pod -l tigergraph.com/cluster-name=${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+
+ NAME READY STATUS RESTARTS AGE
+ test-tg-cluster-0 0/1 Pending 0 5m27s
+ test-tg-cluster-1 0/1 Pending 0 5m27s
+ test-tg-cluster-2 0/1 Pending 0 5m27s
+ ```
+
+- TigerGraph PVC status
+
+ ```bash
+ kubectl get pvc -l tigergraph.com/cluster-name=${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+
+ NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
+ tg-data-test-tg-cluster-0 Pending gp2 110s
+ tg-data-test-tg-cluster-1 Pending gp2 110s
+ tg-data-test-tg-cluster-2 Pending gp2 110s
+ ```
+
+- Checking the PVC Events of one Pod
+
+ ```bash
+ kubectl describe pvc tg-data-test-tg-cluster-0 -n ${YOUR_NAMESPACE}
+
+ Name: tg-data-test-tg-cluster-0
+ Namespace: tigergraph
+ StorageClass: gp2
+ Status: Pending
+ Volume:
+ Labels: tigergraph.com/cluster-name=test-tg-cluster
+ tigergraph.com/cluster-pod=test-tg-cluster
+ Annotations: volume.beta.kubernetes.io/storage-provisioner: ebs.csi.aws.com
+ volume.kubernetes.io/selected-node: ip-172-31-22-5.us-west-1.compute.internal
+ volume.kubernetes.io/storage-provisioner: ebs.csi.aws.com
+ Finalizers: [kubernetes.io/pvc-protection]
+ Capacity:
+ Access Modes:
+ VolumeMode: Filesystem
+ Used By: test-tg-cluster-0
+ Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Normal WaitForFirstConsumer 7m54s persistentvolume-controller waiting for first consumer to be created before binding
+ Normal ExternalProvisioning 115s (x25 over 7m54s) persistentvolume-controller waiting for a volume to be created, either by external provisioner "ebs.csi.aws.com" or manually created by system administrator
+ ```
+
+  If you encounter the above issues, please resolve them using the following steps:
+
+  1. Make sure that the EBS CSI driver has been installed on the EKS cluster
+
+ ```bash
+ kubectl get deployment ebs-csi-controller -n kube-system
+ ```
+
+  2. If not, install the EBS CSI driver with the following command; you can then confirm the add-on status as shown below
+
+ > [!WARNING]
+ > Please ensure that the IAM role for the Amazon EBS CSI driver has been created. You can refer to the official AWS documentation [Creating the Amazon EBS CSI driver IAM role](https://docs.aws.amazon.com/eks/latest/userguide/csi-iam-role.html) for detailed instructions.
+
+ ```bash
+ aws eks create-addon --cluster-name $YOUR_EKS_CLUSTER_NAME --addon-name aws-ebs-csi-driver
+ ```
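+
+Once the add-on has been created, you can optionally confirm that it is active and that the controller is running (assuming the same cluster name as above):
+
+```bash
+aws eks describe-addon --cluster-name $YOUR_EKS_CLUSTER_NAME --addon-name aws-ebs-csi-driver --query "addon.status"
+kubectl get deployment ebs-csi-controller -n kube-system
+```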
+
+### Create a TigerGraph Cluster with Specific Options
+
+You can create a new TigerGraph cluster with specific options, such as size, high availability, version, license, and resource specifications. Here's an example:
+
+- Get and export free license
+
+ ```bash
+ export LICENSE=$(curl -L "ftp://ftp.graphtiger.com/lic/license3.txt" -o "/tmp/license3.txt" 2>/dev/null && cat /tmp/license3.txt)
+ ```
+
+- Create TigerGraph cluster with kubectl-tg plugin
+
+ ```bash
+ kubectl tg create --cluster-name ${YOUR_CLUSTER_NAME} --private-key-secret ${YOUR_SSH_KEY_SECRET_NAME} --size 3 --ha 2 --version 3.9.3 --license ${LICENSE} \
+ --storage-class gp2 --storage-size 100G --cpu 6000m --memory 16Gi --namespace ${YOUR_NAMESPACE}
+ ```
+
+- Create TigerGraph cluster with CR(Custom Resource) YAML manifest
+ > [!NOTE]
+ > Please replace the TigerGraph version (e.g., 3.9.3) and the Operator version (e.g., 0.0.9) with your desired versions.
+
+  To deploy with a CR YAML manifest, apply it with `kubectl apply -f`; for a complete example manifest, see [Configuring TigerGraph Clusters on K8s using TigerGraph CR](../07-reference/configure-tigergraph-cluster-cr-with-yaml-manifests.md).
+
+## Upgrade a TigerGraph Cluster
+
+> [!WARNING]
+> TigerGraph's exceptional performance comes with certain considerations regarding high availability during upgrading operations. Currently, TigerGraph does not provide dedicated high-availability upgrade support, and some downtime is involved.
+
+Upgrading a TigerGraph cluster is supported from a lower version to a higher version.
+
+> [!WARNING]
+> For TigerGraph 3.9.3 and later versions, the use of passwords to log in to Pods is disabled, which enhances security. If you plan to upgrade your TigerGraph cluster to version 3.9.3, it is essential to first upgrade the Operator to version 0.0.9.
+
+> [!WARNING]
+> Operator 0.0.9 has disabled TG downgrades from a higher version (e.g., 3.9.3) to any lower version (e.g., 3.9.2). Therefore, the upgrade job will fail if you attempt to downgrade.
+
+You can upgrade a TigerGraph cluster from a lower version to a higher version. Assuming the current version is 3.9.2 and you want to upgrade to 3.9.3, use the following command:
+
+```bash
+kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --version 3.9.3 --namespace ${YOUR_NAMESPACE}
+```
+
+If you prefer to upgrade the cluster using a CR (Custom Resource) YAML manifest, simply update the `spec.initTGConfig.version` and `spec.image` fields, and then apply it.
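+
+For reference, a minimal sketch of the fields to change in the CR (the image registry path here is a placeholder):
+
+```yaml
+spec:
+  image: docker.io/tigergraph/tigergraph-k8s:3.9.3
+  initTGConfig:
+    version: 3.9.3
+```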
+
+Ensure the successful upgrade with these commands:
+
+```bash
+kubectl rollout status --watch --timeout=900s statefulset/${YOUR_CLUSTER_NAME} --namespace ${YOUR_NAMESPACE}
+
+kubectl wait --for=condition=complete --timeout=15m job/${YOUR_CLUSTER_NAME}-upgrade-job --namespace ${YOUR_NAMESPACE}
+```
+
+## Scale a TigerGraph Cluster
+
+> [!WARNING]
+> TigerGraph's exceptional performance comes with certain considerations regarding high availability during scaling operations. Currently, TigerGraph does not provide dedicated high-availability scale support, and some downtime is involved.
+
+Before scaling out the cluster, ensure that the corresponding node pool is scaled out to provide sufficient resources for the new instances. Use the following command to scale the TigerGraph cluster:
+
+```bash
+kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --size 6 --ha 2 --namespace ${YOUR_NAMESPACE}
+```
+
+The above command scales the cluster to a size of 6 with a high availability factor of 2. If you prefer to use a CR (Custom Resource) YAML manifest for scaling, update the `spec.replicas` and `spec.initTGConfig.ha` fields accordingly.
+
+## Update Resources (CPU and Memory) of the TigerGraph Cluster
+
+To update the CPU and memory resources of the TigerGraph cluster, use the following command:
+
+```bash
+kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --cpu 8 --memory 16Gi --cpu-limit 8 --memory-limit 16Gi --namespace ${YOUR_NAMESPACE}
+```
+
+Alternatively, if you want to update the cluster using a CR (Custom Resource) YAML manifest, update the `spec.resources.requests` and `spec.resources.limits` fields accordingly.
+
+## Destroy the TigerGraph Cluster and the Kubernetes Operator
+
+### Destroy the TigerGraph Cluster
+
+To delete a TigerGraph cluster, use the following command. Note that this command does not remove Persistent Volume Claims (PVCs) and Persistent Volumes (PVs) associated with the cluster. To delete these components, manually delete the PVCs.
+
+- Delete the TigerGraph cluster and retain the Persistent Volumes (PVs)
+
+ ```bash
+ kubectl tg delete --cluster-name ${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+ ```
+
+- To delete the PVCs related to the specified cluster, use the following commands:
+
+ ```bash
+  # Identify the PVCs you want to delete by their labels.
+ kubectl get pvc -l tigergraph.com/cluster-name=${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+
+  # Delete the PVCs related to the specified cluster.
+ kubectl delete pvc -l tigergraph.com/cluster-name=${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+ ```
+
+### Uninstall TigerGraph Operator
+
+To uninstall the TigerGraph Kubernetes Operator within a specified namespace, use the following command:
+
+```bash
+kubectl tg uninstall -n ${YOUR_NAMESPACE}
+```
+
+### Uninstall the Custom Resource Definitions (CRDs)
+
+> [!NOTE]
+> Replace the variable `${OPERATOR_VERSION}` with the Operator version you installed.
+
+```bash
+kubectl delete -f https://dl.tigergraph.com/k8s/${OPERATOR_VERSION}/tg-operator-crd.yaml
+```
+
+## See also
+
+If you are interested in the details of deploying a TigerGraph cluster using the CR (Custom Resource) YAML manifest, refer to the following document:
+
+- [Configuring TigerGraph Clusters on K8s using TigerGraph CR](../07-reference/configure-tigergraph-cluster-cr-with-yaml-manifests.md)
diff --git a/k8s/docs/03-deploy/tigergraph-on-gke.md b/k8s/docs/03-deploy/tigergraph-on-gke.md
new file mode 100644
index 00000000..a732ced8
--- /dev/null
+++ b/k8s/docs/03-deploy/tigergraph-on-gke.md
@@ -0,0 +1,405 @@
+# Deploy TigerGraph on Google Cloud GKE
+
+This comprehensive document provides step-by-step instructions on deploying a TigerGraph cluster on Google Kubernetes Engine (GKE) using Kubernetes.
+
+- [Deploy TigerGraph on Google Cloud GKE](#deploy-tigergraph-on-google-cloud-gke)
+ - [Prerequisites](#prerequisites)
+ - [Deploy TigerGraph Operator](#deploy-tigergraph-operator)
+ - [Install cert-manager for GKE](#install-cert-manager-for-gke)
+ - [Install kubectl-tg plugin](#install-kubectl-tg-plugin)
+ - [Install CustomResourceDefinitions (Optional)](#install-customresourcedefinitions-optional)
+ - [Install TigerGraph Operator](#install-tigergraph-operator)
+ - [Deploy a TigerGraph cluster](#deploy-a-tigergraph-cluster)
+ - [Providing Private SSH Key Pair for Enhanced Security](#providing-private-ssh-key-pair-for-enhanced-security)
+ - [Specify the StorageClass name](#specify-the-storageclass-name)
+ - [Create TG cluster with specific options](#create-tg-cluster-with-specific-options)
+ - [Connect to a TigerGraph cluster](#connect-to-a-tigergraph-cluster)
+ - [Connect to a TigerGraph cluster Pod](#connect-to-a-tigergraph-cluster-pod)
+ - [Access TigerGraph Suite](#access-tigergraph-suite)
+ - [Access RESTPP API Service](#access-restpp-api-service)
+ - [Upgrade a TigerGraph cluster](#upgrade-a-tigergraph-cluster)
+ - [Scale a TigerGraph cluster](#scale-a-tigergraph-cluster)
+ - [Update the resources(CPU and Memory) of the TigerGraph cluster](#update-the-resourcescpu-and-memory-of-the-tigergraph-cluster)
+ - [Destroy the TigerGraph cluster and the Kubernetes Operator](#destroy-the-tigergraph-cluster-and-the-kubernetes-operator)
+ - [Destroy the TigerGraph cluster](#destroy-the-tigergraph-cluster)
+ - [Uninstall TigerGraph Operator](#uninstall-tigergraph-operator)
+ - [Uninstall CRD](#uninstall-crd)
+ - [See also](#see-also)
+
+## Prerequisites
+
+Before proceeding, ensure you have the following prerequisites in place:
+
+- [Helm](https://helm.sh/docs/intro/install/): Helm version >= 3.7.0. TigerGraph Kubernetes Operator is packaged as a Helm chart, so Helm must be installed.
+
+- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/): kubectl version >= 1.23. The kubectl-tg plugin relies on kubectl for managing Kubernetes clusters.
+
+- Create [GKE cluster](https://cloud.google.com/kubernetes-engine/docs/how-to/creating-a-zonal-cluster) with admin role permission.
+
+## Deploy TigerGraph Operator
+
+To deploy the TigerGraph Operator, follow these steps:
+
+### Install cert-manager for GKE
+
+The TigerGraph Operator uses [cert-manager](https://github.com/jetstack/cert-manager) for provisioning certificates for the webhook server. Cert-manager enables the Admission Webhooks feature.
+
+Admission webhooks are HTTP callbacks that receive admission requests and act on them. They are registered with Kubernetes and are called to validate or mutate a resource before it is stored.
+
+Follow these steps to install cert-manager:
+
+> [!WARNING]
+> Please check whether cert-manager has already been installed before executing the following commands.
+
+```bash
+kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.8.0/cert-manager.yaml
+# Verify installation of cert-manager
+kubectl wait deployment -n cert-manager cert-manager --for condition=Available=True --timeout=90s
+kubectl wait deployment -n cert-manager cert-manager-cainjector --for condition=Available=True --timeout=90s
+kubectl wait deployment -n cert-manager cert-manager-webhook --for condition=Available=True --timeout=90s
+```
+
+### Install kubectl-tg plugin
+
+The `kubectl-tg` plugin simplifies deploying and managing the Operator and TigerGraph clusters. Before installing `kubectl-tg`, ensure you meet the following requirements:
+
+- [helm](https://helm.sh/docs/helm/helm_install/): Helm version >= 3.7.0
+- [jq](https://jqlang.github.io/jq/download/): jq version >= 1.6
+- [yq](https://github.com/mikefarah/yq): yq version >= 4.18.1
+
+Here's an example of installing the latest kubectl-tg; you can replace `latest` with your desired version, such as 0.0.9:
+
+```bash
+wget https://dl.tigergraph.com/k8s/latest/kubectl-tg -O kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+```
+
+You can check the `kubectl-tg` version and access help information using the following commands:
+
+```bash
+kubectl tg version
+kubectl tg help
+```
+
+### Install CustomResourceDefinitions (Optional)
+
+This step is optional and can be skipped if you have privileged permissions in your Kubernetes environment. The required components will be automatically installed during the Operator installation process. However, if you prefer to install CustomResourceDefinitions (CRDs) independently, you can use the following command:
+
+```bash
+kubectl apply -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml
+```
+
+### Install TigerGraph Operator
+
+To simplify the Operator installation and TigerGraph cluster deployment, define environment variables:
+
+```bash
+export YOUR_NAMESPACE="tigergraph"
+export YOUR_CLUSTER_NAME="test-tg-cluster"
+export YOUR_SSH_KEY_SECRET_NAME="ssh-key-secret"
+```
+
+Now, you can install the TigerGraph Operator based on your requirements:
+
+A namespace-scoped operator watches and manages resources in a single Namespace, whereas a cluster-scoped operator watches and manages resources cluster-wide.
+
+- Install a namespace-scoped Operator:
+
+ ```bash
+ kubectl tg init --cluster-scope false --namespace ${YOUR_NAMESPACE}
+ ```
+
+- Install a cluster-scoped Operator (default behavior if not specified):
+
+ ```bash
+ kubectl tg init --cluster-scope true --namespace ${YOUR_NAMESPACE}
+ ```
+
+- Install the Operator with specific options (e.g., version, deployment size, CPU, memory, and more):
+
+ ```bash
+ kubectl tg init --cluster-scope false --version ${OPERATOR_VERSION} --operator-size 3 --operator-watch-namespace ${YOUR_NAMESPACE} --operator-cpu 1000m --operator-memory 1024Mi --namespace ${YOUR_NAMESPACE}
+ ```
+
+  For a comprehensive list of options, refer to the output of the `kubectl tg init --help` command.
+
+ ```bash
+ kubectl tg init --help
+ ```
+
+- Ensure that the operator has been successfully deployed:
+
+ ```bash
+ kubectl wait deployment tigergraph-operator-controller-manager --for condition=Available=True --timeout=120s -n ${YOUR_NAMESPACE}
+ ```
+
+## Deploy a TigerGraph cluster
+
+This section explains how to deploy a TigerGraph cluster on GKE using the kubectl-tg plugin and a Custom Resource (CR) YAML manifest.
+
+### Providing Private SSH Key Pair for Enhanced Security
+
+Starting from Operator version 0.0.4, users are required to provide their private SSH key pair for enhanced security before creating a cluster. Follow these steps:
+
+- Step 1: create a Private SSH Key Pair File
+
+ To enhance cluster security, create a private SSH key pair file:
+
+ ```bash
+ echo -e 'y\n' | ssh-keygen -b 4096 -t rsa -f $HOME/.ssh/tigergraph_rsa -q -N ''
+ ```
+
+- Step 2: create a Secret Object
+
+ > [!IMPORTANT]
+ > The namespace of the Secret object must be the same as that of the TigerGraph cluster.
+
+ Create a secret object based on the private SSH key file generated in step 1. Ensure that the key name of the secret for the private SSH key is `private-ssh-key`, and the key name for the public SSH key is `public-ssh-key`. Do not alter these key names:
+
+ ```bash
+ kubectl create secret generic ${YOUR_SSH_KEY_SECRET_NAME} --from-file=private-ssh-key=$HOME/.ssh/tigergraph_rsa --from-file=public-ssh-key=$HOME/.ssh/tigergraph_rsa.pub --namespace ${YOUR_NAMESPACE}
+ ```
+
+  For Operator versions 0.0.4 and above, when creating a cluster using the `kubectl tg create` command, you must set the `--private-key-secret` option to `${YOUR_SSH_KEY_SECRET_NAME}`.
+
+These steps enhance the security of your cluster by utilizing your private SSH key pair.
+
+### Specify the StorageClass name
+
+> [!NOTE]
+> Here the dynamic persistent volume storage is provided by GKE by default, if you want to use static persistent volume or use them from scratch, please refer to [How to use static & dynamic persistent volume storage](../07-reference/static-and-dynamic-persistent-volume-storage.md).
+
+Before creating the TigerGraph cluster with the Operator, specify the StorageClass, which defines available storage classes. Identify the name of the StorageClass:
+
+```bash
+kubectl get storageclass
+
+NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
+premium-rwo pd.csi.storage.gke.io Delete WaitForFirstConsumer true 173m
+standard kubernetes.io/gce-pd Delete Immediate true 173m
+standard-rwo (default) pd.csi.storage.gke.io Delete WaitForFirstConsumer true 173m
+```
+
+Choose the appropriate StorageClass (e.g., `standard`) when creating the TigerGraph cluster, ensuring optimized storage provisioning and management.
+
+### Create TG cluster with specific options
+
+To create a new TigerGraph cluster with specific options, use either the `kubectl-tg` plugin or a CR YAML manifest. Below are examples using the `kubectl-tg` plugin:
+
+You can find all available TigerGraph Docker image versions at [tigergraph-k8s](https://hub.docker.com/r/tigergraph/tigergraph-k8s/tags).
+
+The following command will create a new TigerGraph cluster with a free license:
+
+- Get and export free license
+
+ ```bash
+ export LICENSE=$(curl -L "ftp://ftp.graphtiger.com/lic/license3.txt" -o "/tmp/license3.txt" 2>/dev/null && cat /tmp/license3.txt)
+ ```
+
+- Create TigerGraph cluster with kubectl-tg plugin
+
+ ```bash
+ kubectl tg create --cluster-name ${YOUR_CLUSTER_NAME} --private-key-secret ${YOUR_SSH_KEY_SECRET_NAME} --size 3 --ha 2 --version 3.9.3 --license ${LICENSE} \
+ --storage-class standard --storage-size 100G --cpu 6000m --memory 16Gi --namespace ${YOUR_NAMESPACE}
+ ```
+
+- Alternatively, create a TigerGraph cluster with a CR YAML manifest:
+ > [!NOTE]
+ > Please replace the TigerGraph version (e.g., 3.9.3) and the Operator version (e.g., 0.0.9) with your desired versions.
+
+  To deploy with a CR YAML manifest, apply it with `kubectl apply -f`; for a complete example manifest, see [Configuring TigerGraph Clusters on K8s using TigerGraph CR](../07-reference/configure-tigergraph-cluster-cr-with-yaml-manifests.md).
+
+## Upgrade a TigerGraph cluster
+
+> [!WARNING]
+> TigerGraph's exceptional performance comes with certain considerations regarding high availability during upgrading operations. Currently, TigerGraph does not provide dedicated high-availability upgrade support, and some downtime is involved.
+
+Upgrading a TigerGraph cluster is supported from a lower version to a higher version.
+
+> [!WARNING]
+> For TigerGraph 3.9.3 and later versions, the use of passwords to log in to Pods is disabled, which enhances security. If you plan to upgrade your TigerGraph cluster to version 3.9.3, it is essential to first upgrade the Operator to version 0.0.9.
+
+> [!WARNING]
+> Operator 0.0.9 has disabled TG downgrades from a higher version (e.g., 3.9.3) to any lower version (e.g., 3.9.2). Therefore, the upgrade job will fail if you attempt to downgrade.
+
+Assuming the current version of the cluster is 3.9.2, you can upgrade it to version 3.9.3 with the following command:
+
+```bash
+kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --version 3.9.3 --namespace ${YOUR_NAMESPACE}
+```
+
+If you prefer using a CR YAML manifest, update the `spec.initTGConfig.version` and `spec.image` fields, and then apply it.
+
+Ensure the successful upgrade with these commands:
+
+```bash
+kubectl rollout status --watch --timeout=900s statefulset/${YOUR_CLUSTER_NAME} --namespace ${YOUR_NAMESPACE}
+
+kubectl wait --for=condition=complete --timeout=15m job/${YOUR_CLUSTER_NAME}-upgrade-job --namespace ${YOUR_NAMESPACE}
+```
+
+## Scale a TigerGraph cluster
+
+> [!WARNING]
+> TigerGraph's exceptional performance comes with certain considerations regarding high availability during scaling operations. Currently, TigerGraph does not provide dedicated high-availability scale support, and some downtime is involved.
+
+Before scaling the cluster, scale the corresponding node pool to provide sufficient resources for new instances. Use the following command to scale the TigerGraph cluster:
+
+```bash
+kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --size 6 --ha 2 --namespace ${YOUR_NAMESPACE}
+```
+
+The above command scales the cluster to a size of 6 with a high availability factor of 2. If you prefer to use a CR (Custom Resource) YAML manifest for scaling, update the `spec.replicas` and `spec.initTGConfig.ha` fields accordingly.
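+
+For reference, a minimal sketch of the corresponding CR fields:
+
+```yaml
+spec:
+  replicas: 6
+  initTGConfig:
+    ha: 2
+```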
+
+## Update the resources(CPU and Memory) of the TigerGraph cluster
+
+Modify the CPU and memory resources of your TigerGraph cluster using the following command:
+
+```bash
+kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --cpu 8 --memory 16Gi --cpu-limit 8 --memory-limit 16Gi --namespace ${YOUR_NAMESPACE}
+```
+
+For CR YAML manifests, update the `spec.resources.requests` and `spec.resources.limits` fields and apply the changes.
+
+## Destroy the TigerGraph cluster and the Kubernetes Operator
+
+### Destroy the TigerGraph cluster
+
+To delete a TigerGraph cluster, use the following command. Note that this command does not remove Persistent Volume Claims (PVCs) and Persistent Volumes (PVs) associated with the cluster. To delete these components, manually delete the PVCs.
+
+- Delete the TigerGraph cluster and retain the PVs:
+
+ ```bash
+ kubectl tg delete --cluster-name ${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+ ```
+
+- Delete the PVCs related to the specified cluster:
+
+ ```bash
+  # Identify the PVCs to delete by their labels.
+ kubectl get pvc -l tigergraph.com/cluster-name=${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+
+  # Delete the PVCs related to the specified cluster.
+ kubectl delete pvc -l tigergraph.com/cluster-name=${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+ ```
+
+### Uninstall TigerGraph Operator
+
+Uninstall the TigerGraph Kubernetes Operator within a specified namespace:
+
+```bash
+kubectl tg uninstall -n ${YOUR_NAMESPACE}
+```
+
+### Uninstall CRD
+
+Uninstall CRDs if needed:
+
+> [!NOTE]
+> Replace the variable `${OPERATOR_VERSION}` with the Operator version you installed.
+
+```bash
+kubectl delete -f https://dl.tigergraph.com/k8s/${OPERATOR_VERSION}/tg-operator-crd.yaml
+```
+
+## See also
+
+If you are interested in the details of deploying a TigerGraph cluster using the CR (Custom Resource) YAML manifest, refer to the following document:
+
+- [Configuring TigerGraph Clusters on K8s using TigerGraph CR](../07-reference/configure-tigergraph-cluster-cr-with-yaml-manifests.md)
diff --git a/k8s/docs/03-deploy/tigergraph-on-openshift.md b/k8s/docs/03-deploy/tigergraph-on-openshift.md
new file mode 100644
index 00000000..4da7b0f1
--- /dev/null
+++ b/k8s/docs/03-deploy/tigergraph-on-openshift.md
@@ -0,0 +1,609 @@
+# Deploying TigerGraph on Red Hat OpenShift
+
+This document provides detailed instructions for deploying a TigerGraph cluster on the Red Hat OpenShift platform.
+
+- [Deploying TigerGraph on Red Hat OpenShift](#deploying-tigergraph-on-red-hat-openshift)
+ - [Prerequisites](#prerequisites)
+ - [Deploying TigerGraph Operator](#deploying-tigergraph-operator)
+ - [Install cert-manager for OpenShift](#install-cert-manager-for-openshift)
+ - [Install kubectl-tg plugin](#install-kubectl-tg-plugin)
+ - [Install CRDs independently (Optional)](#install-crds-independently-optional)
+ - [Install TigerGraph Operator](#install-tigergraph-operator)
+ - [Deploy a TigerGraph Cluster](#deploy-a-tigergraph-cluster)
+ - [Change the podPidsLimit value of OpenShift](#change-the-podpidslimit-value-of-openshift)
+ - [Acquire special permission](#acquire-special-permission)
+ - [Providing a Private SSH Key Pair for Enhanced Security](#providing-a-private-ssh-key-pair-for-enhanced-security)
+ - [Specify the StorageClass name](#specify-the-storageclass-name)
+ - [Create TG cluster with specific options](#create-tg-cluster-with-specific-options)
+ - [Connect to a TigerGraph cluster](#connect-to-a-tigergraph-cluster)
+ - [Connecting to a TigerGraph cluster Pod](#connecting-to-a-tigergraph-cluster-pod)
+ - [Access TigerGraph Suite](#access-tigergraph-suite)
+ - [Access RESTPP API Service](#access-restpp-api-service)
+ - [Upgrade a TigerGraph cluster](#upgrade-a-tigergraph-cluster)
+ - [Scale a TigerGraph cluster](#scale-a-tigergraph-cluster)
+ - [Update the resources(CPU and Memory) of the TigerGraph cluster](#update-the-resourcescpu-and-memory-of-the-tigergraph-cluster)
+ - [Destroy the TigerGraph cluster and the Kubernetes Operator](#destroy-the-tigergraph-cluster-and-the-kubernetes-operator)
+ - [Destroy the TigerGraph cluster](#destroy-the-tigergraph-cluster)
+ - [Uninstall TigerGraph Operator](#uninstall-tigergraph-operator)
+ - [Uninstall CRD](#uninstall-crd)
+ - [See also](#see-also)
+
+## Prerequisites
+
+Before you begin, ensure you have the following prerequisites:
+
+- [Helm](https://helm.sh/docs/intro/install/): version >= 3.7.0. The TigerGraph Kubernetes Operator is packaged as a Helm chart, so you need Helm installed.
+
+- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/): version >= 1.23. The `kubectl-tg` plugin requires kubectl for running commands against Kubernetes clusters.
+
+- [OpenShift CLI](https://docs.openshift.com/container-platform/4.8/cli_reference/openshift_cli/getting-started-cli.html): Install the OpenShift CLI to acquire permissions within OpenShift.
+
+- Create an [OpenShift Kubernetes cluster](https://docs.openshift.com/container-platform/4.10/installing/index.html) with admin role permission. OpenShift Container Platform version 4 or above is required.
+
+## Deploying TigerGraph Operator
+
+To deploy the TigerGraph Operator, follow these steps:
+
+### Install cert-manager for OpenShift
+
+The TigerGraph Operator uses the Admission Webhooks feature and relies on [cert-manager](https://github.com/jetstack/cert-manager) for provisioning certificates for the webhook server.
+
+Admission webhooks are HTTP callbacks that receive admission requests and act on them. They are registered with Kubernetes and are called to validate or mutate a resource before it is stored.
+
+Follow these commands to install cert-manager:
+
+> [!WARNING]
+> Please check whether cert-manager has already been installed before executing the following commands.
+
+```bash
+kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.8.0/cert-manager.yaml
+# Verify installation of cert-manager
+kubectl wait deployment -n cert-manager cert-manager --for condition=Available=True --timeout=90s
+kubectl wait deployment -n cert-manager cert-manager-cainjector --for condition=Available=True --timeout=90s
+kubectl wait deployment -n cert-manager cert-manager-webhook --for condition=Available=True --timeout=90s
+```
+
+### Install kubectl-tg plugin
+
+kubectl-tg is a plugin for deploying and managing the Operator and TigerGraph clusters imperatively. Ensure you meet the following requirements before installing the kubectl-tg plugin:
+
+- [helm](https://helm.sh/docs/helm/helm_install/): version >= 3.7.0
+- [jq](https://jqlang.github.io/jq/download/): version >= 1.6
+- [yq](https://github.com/mikefarah/yq): version >= 4.18.1
+
+Here's an example of installing the latest kubectl-tg; you can replace `latest` with your desired version, such as 0.0.9:
+
+```bash
+wget https://dl.tigergraph.com/k8s/latest/kubectl-tg -O kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+```
+
+Display kubectl-tg version information:
+
+```bash
+kubectl tg version
+```
+
+Show help information:
+
+```bash
+kubectl tg help
+```
+
+### Install CRDs independently (Optional)
+
+This step is optional. You can skip it if you have privileged permissions in your Kubernetes environment. The required component will be automatically installed during the Operator installation process.
+
+CustomResourceDefinitions (CRDs) are non-namespaced entities accessible across all namespaces. Installing CRDs requires privileged permissions from the Kubernetes cluster. You may prefer to install CRDs independently from the Operator installation:
+
+```bash
+kubectl apply -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml
+```
+
+### Install TigerGraph Operator
+
+To simplify the Operator installation and TigerGraph cluster deployment, define environment variables:
+
+```bash
+export YOUR_NAMESPACE="tigergraph"
+export YOUR_CLUSTER_NAME="test-tg-cluster"
+export YOUR_SSH_KEY_SECRET_NAME="ssh-key-secret"
+export SERVICE_ACCOUNT_NAME="tg-service-account"
+```
+
+Install TigerGraph Operator using the following command:
+
+A namespace-scoped operator watches and manages resources in a single Namespace, whereas a cluster-scoped operator watches and manages resources cluster-wide.
+
+- Install a namespace-scoped Operator
+
+ ```bash
+ kubectl tg init --cluster-scope false --namespace ${YOUR_NAMESPACE}
+ ```
+
+- Install a cluster-scoped Operator (default behavior if not specified):
+
+ ```bash
+ kubectl tg init --cluster-scope true --namespace ${YOUR_NAMESPACE}
+ ```
+
+- Install the operator with specific options:
+
+ Customize options such as operator version, deployment size, CPU, memory, and the namespace to watch:
+
+ ```bash
+ kubectl tg init --cluster-scope false --version ${OPERATOR_VERSION} --operator-size 3 --operator-watch-namespace ${YOUR_NAMESPACE} --operator-cpu 1000m --operator-memory 1024Mi --namespace ${YOUR_NAMESPACE}
+ ```
+
+  For the full help manual, see the output of the `kubectl tg init --help` command.
+
+- Verify the operator's successful deployment:
+
+ ```bash
+ kubectl wait deployment tigergraph-operator-controller-manager --for condition=Available=True --timeout=120s -n ${YOUR_NAMESPACE}
+ ```
+
+## Deploy a TigerGraph Cluster
+
+This section explains how to deploy a TigerGraph cluster on OpenShift using the kubectl-tg plugin and CR (Custom Resource) YAML manifest.
+
+### Change the podPidsLimit value of OpenShift
+
+In a production environment, TigerGraph clusters require setting the podPidsLimit to 1 million. If your OpenShift cluster hasn't set this, use the following commands:
+
+```bash
+kubectl label machineconfigpool worker custom-crio=high-pid-limit
+kubectl label machineconfigpool worker custom-kubelet=small-pods
+
+eval "cat << EOF
+apiVersion: machineconfiguration.openshift.io/v1
+kind: KubeletConfig
+metadata:
+ name: worker-kubeconfig-fix
+spec:
+ machineConfigPoolSelector:
+ matchLabels:
+ custom-kubelet: small-pods
+ kubeletConfig:
+ podPidsLimit: 1024000
+EOF" | kubectl apply -f -
+
+eval "cat << EOF
+apiVersion: machineconfiguration.openshift.io/v1
+kind: ContainerRuntimeConfig
+metadata:
+ name: set-pids-limit
+spec:
+ machineConfigPoolSelector:
+ matchLabels:
+ custom-crio: high-pid-limit
+ containerRuntimeConfig:
+ pidsLimit: 1024000
+EOF" | kubectl apply -f -
+```
+
+- [Verify the value of podPidsLimit](https://access.redhat.com/solutions/5366631)
+
+1. Monitor /sys/fs/cgroup/pids/pids.current while the application is running to verify whether `java.lang.OutOfMemoryError: unable to create new native thread` or similar errors occur when it reaches 1024 (or 4096 in OCP 4.11+).
+2. For OCP 4.10 and previous releases, check if the CRI-O pids_limit is being set on the node where the application container is running:
+
+ ```bash
+ $ crio config | grep pids_limit
+ INFO[2022-01-31 12:14:27.407346183Z] Starting CRI-O, version: 1.21.4-4.rhaos4.8.git84fa55d.el8, git: ()
+ INFO Using default capabilities: CAP_CHOWN, CAP_DAC_OVERRIDE, CAP_FSETID, CAP_FOWNER, CAP_SETGID, CAP_SETUID, CAP_SETPCAP, CAP_NET_BIND_SERVICE, CAP_KILL
+ pids_limit = 4096
+ ```
+
+3. Verify that the kubelet podPidsLimit is set in /etc/kubernetes/kubelet.conf and that SupportPodPidsLimit (only in 4.10 and older) is set by running the following command:
+
+ ```bash
+ $ oc debug node/[node_name] -- cat /host/etc/kubernetes/kubelet.conf | jq '.podPidsLimit, .featureGates'
+ Starting pod/[node_name]-debug ...
+ To use host binaries, run `chroot /host`
+ Removing debug pod ...
+
+ 2048
+ {
+ "LegacyNodeRoleBehavior": false,
+ "NodeDisruptionExclusion": true,
+ "RotateKubeletServerCertificate": true,
+ "SCTPSupport": true,
+ "ServiceNodeExclusion": true,
+ "SupportPodPidsLimit": true
+ }
+ ```
+
+ In newer releases, it's not a json file, so use the following command instead:
+
+ ```bash
+ $ oc debug node/[node_name] -- cat /host/etc/kubernetes/kubelet.conf | grep podPidsLimit
+ podPidsLimit: 4096
+ ```
+
+ If not configured, the default (1024) applies (or 4096 in OCP 4.11+).
+
+4. Verify the labels for the ContainerRuntimeConfig (only in OCP 4.10 and previous release) and KubeletConfig were created and applied:
+
+ ```bash
+ $ oc get kubeletconfig,containerruntimeconfig
+ NAME AGE
+ kubeletconfig/worker-kubeconfig-fix 9d
+
+ NAME AGE
+ containerruntimeconfig/set-pids-limit 15d
+
+ $ oc get mcp/worker -o json | jq '.metadata.labels'
+ {
+ "custom-crio": "high-pid-limit",
+ "custom-kubelet": "small-pods",
+ "machineconfiguration.openshift.io/mco-built-in": "",
+ "pools.operator.machineconfiguration.openshift.io/worker": ""
+ }
+
+ $ oc get kubeletconfig/worker-kubeconfig-fix -o json | jq '.status.conditions[]'
+ {
+ "lastTransitionTime": "2022-02-10T04:46:17Z",
+ "message": "Success",
+ "status": "True",
+ "type": "Success"
+ }
+ ```
+
+### Acquire special permission
+
+Starting from TigerGraph version 3.9.0 and Operator version 0.0.4, significant security enhancements have been introduced in the TigerGraph Kubernetes (k8s) Docker image. These enhancements are designed to reinforce the overall security posture of the TigerGraph deployment. Specifically, two notable changes have been made:
+
+1. **SUDO Permission Removal**: The SUDO permission has been removed from the TigerGraph image. This change aligns with best practices in security by reducing unnecessary privileges within the containerized environment.
+
+2. **Static Private SSH Key Removal**: Static private SSH key files have been eliminated from the TigerGraph image. This removal further enhances the security of your TigerGraph deployment by reducing potential vulnerabilities associated with static keys.
+
+To ensure a seamless deployment of the TigerGraph cluster with these enhanced security measures, it is essential to perform an additional operation within the OpenShift environment. Failure to complete this step may lead to issues where the Operator successfully creates the StatefulSet, but the Pod for TigerGraph fails to generate due to insufficient permissions. Consequently, querying the status of the StatefulSet will yield the following warning events:
+
+```bash
+Normal SuccessfulCreate 119s statefulset-controller create Claim tg-data-test-cluster-0 Pod test-cluster-0 in StatefulSet test-cluster success
+Warning FailedCreate 37s (x15 over 119s) statefulset-controller create Pod test-cluster-0 in StatefulSet test-cluster failed error:
+pods "test-cluster-0" is forbidden: unable to validate against any security context constraint: [provider "anyuid": Forbidden: not usable by user or serviceaccount,
+provider restricted: .spec.securityContext.fsGroup: Invalid value: []int64{1000}: 1000 is not an allowed group,
+spec.containers[0].securityContext.runAsUser: Invalid value: 1000: must be in the ranges: [1000680000, 1000689999],
+provider "nonroot": Forbidden: not usable by user or serviceaccount,
+provider "hostmount-anyuid": Forbidden: not usable by user or serviceaccount,
+provider "machine-api-termination-handler": Forbidden: not usable by user or serviceaccount,
+provider "hostnetwork": Forbidden: not usable by user or serviceaccount,
+provider "hostaccess": Forbidden: not usable by user or serviceaccount,
+provider "node-exporter": Forbidden: not usable by user or serviceaccount,
+provider "privileged": Forbidden: not usable by user or serviceaccount]
+```
+
+- Create a SecurityContextConstraints
+
+ Execute the following command:
+
+  Define a SecurityContextConstraints resource and a service account (for example `${SERVICE_ACCOUNT_NAME}`) that is permitted to use it, then apply them with `kubectl apply -f`.
+
+> [!NOTE]
+> If you are using the kubectl-tg plugin to create the TigerGraph cluster, you can specify the service account name using the `--service-account-name` option.
+
+> [!WARNING]
+> If you choose to install a cluster-scoped Operator, it is essential to create the aforementioned service account for each namespace in which you intend to deploy the TigerGraph cluster.
+
+### Providing a Private SSH Key Pair for Enhanced Security
+
+Starting from Operator version 0.0.4, users are required to provide their private SSH key pair for enhanced security before creating a cluster. Follow these steps:
+
+- Step 1: create a private SSH key pair file:
+
+ ```bash
+ echo -e 'y\n' | ssh-keygen -b 4096 -t rsa -f $HOME/.ssh/tigergraph_rsa -q -N ''
+ ```
+
+- Step 2: Create a Secret Object
+
+ > [!IMPORTANT]
+ > The namespace of the Secret object must be the same as that of the TigerGraph cluster.
+
+ Create a secret object based on the private SSH key file generated in step 1. Ensure that the key name of the secret for the private SSH key is `private-ssh-key`, and the key name for the public SSH key is `public-ssh-key`. Do not alter these key names:
+
+ ```bash
+ kubectl create secret generic ${YOUR_SSH_KEY_SECRET_NAME} --from-file=private-ssh-key=$HOME/.ssh/tigergraph_rsa --from-file=public-ssh-key=$HOME/.ssh/tigergraph_rsa.pub --namespace ${YOUR_NAMESPACE}
+ ```
+
+  For Operator versions 0.0.4 and above, when creating a cluster using the `kubectl tg create` command, you must set the `--private-key-secret` option to `${YOUR_SSH_KEY_SECRET_NAME}`.
+
+These steps enhance the security of your cluster by utilizing your private SSH key pair.
+
+### Specify the StorageClass name
+
+Before creating the TigerGraph cluster with the Operator, specify the StorageClass, which defines the various "classes" of storage available. You can determine the name of the StorageClass with the following command:
+
+```bash
+kubectl get storageclass
+
+NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
+standard (default) kubernetes.io/gce-pd Delete WaitForFirstConsumer true 37m
+standard-csi pd.csi.storage.gke.io Delete WaitForFirstConsumer true 37m
+```
+
+With the StorageClass identified, you can proceed to create clusters using the create command. When specifying the `--storage-class` option, choose `standard` as its value.
+
+This process ensures that the appropriate StorageClass is assigned to your TigerGraph cluster creation, optimizing storage provisioning and management.
+
+### Create TG cluster with specific options
+
+To create a new TigerGraph cluster with specific options, use either the kubectl-tg plugin or a CR YAML manifest. Below are examples using the kubectl-tg plugin:
+
+You can find all available TigerGraph Docker image versions at [tigergraph-k8s](https://hub.docker.com/r/tigergraph/tigergraph-k8s/tags).
+
+The following command will create a new TigerGraph cluster with a free license:
+
+- Get and export free license
+
+ ```bash
+ export LICENSE=$(curl -L "ftp://ftp.graphtiger.com/lic/license3.txt" -o "/tmp/license3.txt" 2>/dev/null && cat /tmp/license3.txt)
+ ```
+
+- Create TigerGraph cluster with kubectl-tg plugin
+
+ ```bash
+ kubectl tg create --cluster-name ${YOUR_CLUSTER_NAME} --private-key-secret ${YOUR_SSH_KEY_SECRET_NAME} --size 3 --ha 2 --version 3.9.3 --license ${LICENSE} \
+ --storage-class standard --storage-size 100G --cpu 6000m --memory 16Gi --namespace ${YOUR_NAMESPACE}
+ ```
+
+- Alternatively, create a TigerGraph cluster with a CR YAML manifest:
+
+ ```bash
+ # A sketch of a TigerGraph CR mirroring the kubectl-tg options above. Field names follow the
+ # cluster template used elsewhere in these docs; adjust values for your environment.
+ cat <<EOF | kubectl apply -f -
+ apiVersion: graphdb.tigergraph.com/v1alpha1
+ kind: TigerGraph
+ metadata:
+   name: ${YOUR_CLUSTER_NAME}
+   namespace: ${YOUR_NAMESPACE}
+ spec:
+   replicas: 3
+   image: docker.io/tigergraph/tigergraph-k8s:3.9.3
+   imagePullPolicy: IfNotPresent
+   listener:
+     type: LoadBalancer
+   resources:
+     requests:
+       cpu: 6000m
+       memory: 16Gi
+   storage:
+     type: persistent-claim
+     volumeClaimTemplate:
+       storageClassName: standard
+       resources:
+         requests:
+           storage: 100G
+   initTGConfig:
+     ha: 2
+     license: ${LICENSE}
+     version: 3.9.3
+   initJob:
+     image: docker.io/tigergraph/tigergraph-k8s-init:0.0.9
+     imagePullPolicy: IfNotPresent
+ EOF
+ ```
+
+## Upgrade a TigerGraph cluster
+
+> [!WARNING]
+> TigerGraph's exceptional performance comes with certain considerations regarding high availability during upgrading operations. Currently, TigerGraph does not provide dedicated high-availability upgrade support, and some downtime is involved.
+
+Upgrading a TigerGraph cluster is supported from a lower version to a higher version.
+
+> [!WARNING]
+> For TigerGraph 3.9.3 and later versions, the use of passwords to log in to Pods is disabled, which enhances security. If you plan to upgrade your TigerGraph cluster to version 3.9.3, it is essential to first upgrade the Operator to version 0.0.9.
+
+> [!WARNING]
+> Operator 0.0.9 has disabled TG downgrades from a higher version (e.g., 3.9.3) to any lower version (e.g., 3.9.2). Therefore, the upgrade job will fail if you attempt to downgrade.
+
+Assuming the current version of the cluster is 3.9.2, you can upgrade it to version 3.9.3 with the following command:
+
+```bash
+kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --version 3.9.3 --namespace ${YOUR_NAMESPACE}
+```
+
+If you prefer using a CR YAML manifest, update the `spec.initTGConfig.version` and `spec.image` fields, and then apply it.
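+
+For example, a minimal sketch of the fields to change in the CR (the field paths are the ones named above; the image tag shown here assumes you are upgrading to 3.9.3):
+
+```yaml
+spec:
+  # Point the image at the target TigerGraph version
+  image: docker.io/tigergraph/tigergraph-k8s:3.9.3
+  initTGConfig:
+    # Keep this in sync with the image version
+    version: 3.9.3
+```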
+
+Ensure the successful upgrade with these commands:
+
+```bash
+kubectl rollout status --watch --timeout=900s statefulset/${YOUR_CLUSTER_NAME} --namespace ${YOUR_NAMESPACE}
+
+kubectl wait --for=condition=complete --timeout=15m job/${YOUR_CLUSTER_NAME}-upgrade-job --namespace ${YOUR_NAMESPACE}
+```
+
+## Scale a TigerGraph cluster
+
+> [!WARNING]
+> TigerGraph's exceptional performance comes with certain considerations regarding high availability during scaling operations. Currently, TigerGraph does not provide dedicated high-availability scale support, and some downtime is involved.
+
+Before scaling the cluster, scale the corresponding node pool to provide sufficient resources for new instances. Use the following command to scale the TigerGraph cluster:
+
+```bash
+kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --size 6 --ha 2 --namespace ${YOUR_NAMESPACE}
+```
+
+The above command scales the cluster to a size of 6 with a high availability factor of 2. If you prefer to use a CR (Custom Resource) YAML manifest for scaling, update the `spec.replicas` and `spec.initTGConfig.ha` fields accordingly.
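+
+For reference, a minimal sketch of the corresponding CR fields, using the same size and HA values as the command above:
+
+```yaml
+spec:
+  # Cluster size (number of TigerGraph pods)
+  replicas: 6
+  initTGConfig:
+    # High availability (replication) factor
+    ha: 2
+```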
+
+## Update the resources (CPU and Memory) of the TigerGraph cluster
+
+Modify the CPU and memory resources of your TigerGraph cluster using the following command:
+
+```bash
+kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --cpu 8 --memory 16Gi --cpu-limit 8 --memory-limit 16Gi --namespace ${YOUR_NAMESPACE}
+```
+
+For CR YAML manifests, update the `spec.resources.requests` and `spec.resources.limits` fields and apply the changes.
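+
+For example, a sketch of the corresponding CR fields, mirroring the values in the command above:
+
+```yaml
+spec:
+  resources:
+    requests:
+      cpu: 8
+      memory: 16Gi
+    limits:
+      cpu: 8
+      memory: 16Gi
+```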
+
+## Destroy the TigerGraph cluster and the Kubernetes Operator
+
+### Destroy the TigerGraph cluster
+
+To delete a TigerGraph cluster, use the following command. Note that this command does not remove Persistent Volume Claims (PVCs) and Persistent Volumes (PVs) associated with the cluster. To delete these components, manually delete the PVCs.
+
+- Delete the TigerGraph cluster and retain the PVs:
+
+ ```bash
+ kubectl tg delete --cluster-name ${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+ ```
+
+- Delete the PVCs related to the specified cluster:
+
+ ```bash
+ # Identify the PVCs to delete using the cluster-name label.
+ kubectl get pvc -l tigergraph.com/cluster-name=${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+
+ # Delete the PVCs related to the specified cluster.
+ kubectl delete pvc -l tigergraph.com/cluster-name=${YOUR_CLUSTER_NAME} -n ${YOUR_NAMESPACE}
+ ```
+
+### Uninstall TigerGraph Operator
+
+Uninstall the TigerGraph Kubernetes Operator within a specified namespace:
+
+```bash
+kubectl tg uninstall -n ${YOUR_NAMESPACE}
+```
+
+### Uninstall CRD
+
+Uninstall CRDs if needed:
+
+> [!NOTE]
+> Replace the variable `${OPERATOR_VERSION}` with the Operator version you installed.
+
+```bash
+kubectl delete -f https://dl.tigergraph.com/k8s/${OPERATOR_VERSION}/tg-operator-crd.yaml
+```
+
+## See also
+
+If you are interested in the details of deploying a TigerGraph cluster using the CR (Custom Resource) YAML manifest, refer to the following document:
+
+- [Configuring TigerGraph Clusters on K8s using TigerGraph CR](../07-reference/configure-tigergraph-cluster-cr-with-yaml-manifests.md)
diff --git a/k8s/docs/03-deploy/use-custom-containers-by-kubectl-tg.md b/k8s/docs/03-deploy/use-custom-containers-by-kubectl-tg.md
new file mode 100644
index 00000000..8fafb4dd
--- /dev/null
+++ b/k8s/docs/03-deploy/use-custom-containers-by-kubectl-tg.md
@@ -0,0 +1,134 @@
+Use InitContainers, SidecarContainers and CustomVolumes in kubectl-tg
+
+- [Basic knowledge](#basic-knowledge)
+- [Usage](#usage)
+ - [Creating initContainers,sidecarContainers and customVolumes](#creating-initcontainerssidecarcontainers-and-customvolumes)
+ - [Removing initContainers/sidecarContainers/customVolumes](#removing-initcontainerssidecarcontainerscustomvolumes)
+ - [Managing a TG Cluster with Custom Containers](#managing-a-tg-cluster-with-custom-containers)
+ - [See also](#see-also)
+
+Basic knowledge
+===============
+A Kubernetes Pod can contain multiple containers, including both init containers and app containers. When a Pod is created, the init containers run sequentially in the specified order. If any init container fails, the Pod's execution is halted (for more details, see [Init Containers](https://kubernetes.io/docs/concepts/workloads/pods/init-containers/)). After all init containers complete successfully, the app containers run concurrently.
+
+By default, each TigerGraph Pod defined by the TigerGraph CR has a single app container named "tigergraph", which runs all TigerGraph services within the Pod. The "InitContainers, SidecarContainers and CustomVolumes" feature lets users add their own initContainers and sidecarContainers to TigerGraph Pods, and define customVolumes that can be mounted in those initContainers or sidecarContainers.
+
+To grasp the concepts of InitContainers, Sidecar Containers, and Custom Volumes, please refer to the guide on [InitContainers, Sidecar Containers, and Custom Volumes](./custom-containers.md).
+
+Usage
+=====
+Creating initContainers,sidecarContainers and customVolumes
+-----
+To make use of this feature, follow these steps:
+
+1. Prepare a YAML file that includes the definitions for your `initContainers`, `sidecarContainers`, and `customVolumes`. This YAML file will be passed to the `--custom-containers` option.
+
+ - For `initContainers` and `sidecarContainers`, you can refer to the Kubernetes documentation at [Container](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#Container) for available fields and configurations.
+
+ - For `customVolumes`, the relevant fields and configurations can be found in the Kubernetes documentation at [Volumes](https://kubernetes.io/docs/concepts/storage/volumes/).
+
+Below is an illustrative example of a custom container YAML file:
+
+```yaml
+initContainers:
+ - image: alpine:3.17.2
+ name: init-hello
+ args:
+ - /bin/sh
+ - -c
+ - echo hello
+
+sidecarContainers:
+ - image: alpine:3.17.2
+ name: main-container
+ args:
+ - /bin/sh
+ - -c
+ - >
+ while true; do
+ echo "$(date) INFO hello from main-container" >> /var/log/myapp.log ;
+ sleep 1;
+ done
+ volumeMounts:
+ - name: tg-log
+ mountPath: /var/tglog
+ - name: log
+ mountPath: /var/log
+ readinessProbe:
+ exec:
+ command:
+ - sh
+ - -c
+ - if [[ -f /var/log/myapp.log ]];then exit 0; else exit 1;fi
+ initialDelaySeconds: 10
+ periodSeconds: 5
+
+ - name: sidecar-container
+ image: alpine:3.17.2
+ args:
+ - /bin/sh
+ - -c
+ - tail -fn+1 /var/log/myapp.log
+ volumeMounts:
+ - name: log
+ mountPath: /var/log
+
+customVolumes:
+ - name: log
+ emptyDir: {}
+```
+
+This comprehensive example showcases the configuration of `initContainers`, `sidecarContainers`, and a `customVolume` named "log". Adjust the contents according to your specific use case.
+
+Save the above YAML as `tg-custom-container.yaml`. To create a new cluster using this file, run:
+
+```bash
+kubectl tg create --cluster-name test-cluster --namespace ${NAMESPACE} \
+ --size 3 --ha 2 -k ssh-key-secret --version ${TG_CLUSTER_VERSION} \
+ --storage-class standard --storage-size 10G -l ${LICENSE} \
+ --custom-containers tg-custom-container.yaml
+```
+
+If you already have a cluster and want to add or update initContainers/sidecarContainers for it, run:
+
+```bash
+kubectl tg update --cluster-name test-cluster --namespace ${NAMESPACE} \
+ --custom-containers tg-custom-container.yaml
+```
+
+Removing initContainers/sidecarContainers/customVolumes
+-----------------------------------------------------
+
+To remove all of them, you can pass an empty file as an argument to the `--custom-containers` option:
+
+```bash
+touch empty.yaml
+
+kubectl tg update --cluster-name test-cluster --namespace ${NAMESPACE} \
+ --custom-containers empty.yaml
+```
+
+If you wish to remove specific containers or volumes, simply edit your configuration file and then use the `kubectl tg update` command to apply the changes.
+
+Managing a TG Cluster with Custom Containers
+-----------------------------------------
+
+Operating a TG cluster with custom containers is similar to managing a cluster without custom containers. You can utilize the `kubectl tg update` command to perform actions such as updating, upgrading, expanding, or shrinking the cluster.
+
+If you need to modify your `initContainers`, `sidecarContainers`, or `customVolumes`, follow these steps:
+
+1. Make the necessary adjustments to your custom container YAML file.
+
+2. Execute the following command to update the cluster with the new custom containers:
+
+```bash
+kubectl tg update --cluster-name test-cluster --namespace ${NAMESPACE} --custom-containers tg-custom-container.yaml
+```
+
+This command triggers a rolling update, ensuring that your custom containers are seamlessly updated within the cluster.
+
+## See also
+
+If you are interested in learning how to configure custom containers with YAML resources, please refer to the following documentation:
+
+- [Use custom containers with YAML resources](../03-deploy/custom-containers.md)
diff --git a/k8s/docs/04-manage/backup-and-restore/README.md b/k8s/docs/04-manage/backup-and-restore/README.md
new file mode 100644
index 00000000..de87f5c1
--- /dev/null
+++ b/k8s/docs/04-manage/backup-and-restore/README.md
@@ -0,0 +1,60 @@
+Backup and Restore Overview
+
+This document describes how to perform backup and restore on a TigerGraph cluster on Kubernetes.
+To back up and restore your data, you can use the `kubectl-tg` plugin or a YAML file that corresponds to the TigerGraphBackup/TigerGraphRestore Custom Resource (CR).
+
+Difference Between Managing Backup/Restore by YAML File and Using kubectl-tg Plugin
+---
+
+**Using YAML Files:**
+
+1. **Manual Configuration:** With YAML files, you manually craft the configuration settings for backup or restore operations.
+
+2. **Customization:** You can store multiple backup/restore configurations in separate YAML files, enabling customized setups for different scenarios.
+
+**Using kubectl-tg Plugin:**
+
+1. **Simplified Commands:** The `kubectl tg` plugin streamlines the process by providing pre-defined command options that directly create CRs with specified configurations.
+
+2. **Efficiency:** You avoid the need to create YAML files and write configurations manually, accelerating the setup of backup and restore operations.
+
+3. **CR Management:** The `kubectl tg` plugin operates directly on CRs, enabling you to manage and modify them conveniently through commands.
+
+Ultimately, both approaches achieve the same outcome, but the `kubectl tg` plugin simplifies the process by eliminating manual configuration steps and providing a more streamlined and efficient method for managing backup and restore operations.
+
+* See [Backup & Restore cluster by kubectl-tg plugin](./backup-restore-by-kubectl-tg.md) to know how to use `kubectl tg` for backup & restore.
+* See [Backup & Restore cluster by CR](./backup-restore-by-cr.md) to get the example YAML files for backup & restore.
+
+
+Usage scenarios
+---
+### Back up data
+You can create backups of your TigerGraph clusters and store the backup files in local storage or an S3 bucket. Refer to:
+* [Backup to Local Storage](./backup-restore-by-kubectl-tg.md#backup-to-local-storage)
+* [Backup to S3 Bucket](./backup-restore-by-kubectl-tg.md#backup-to-an-s3-bucket)
+
+You can create a backup schedule to back up the cluster periodically. Refer to:
+* [Creating and Managing Backup Schedules](./backup-restore-by-kubectl-tg.md#creating-and-managing-backup-schedules)
+* [TigerGraphBackupSchedule CR](./backup-restore-by-cr.md#tigergraphbackupschedule)
+
+For managing backup files and backup CRs, refer to:
+- [Listing Backup Custom Resources](./backup-restore-by-kubectl-tg.md#listing-backup-custom-resources)
+- [Displaying Backup Process Status](./backup-restore-by-kubectl-tg.md#displaying-backup-process-status)
+- [Delete Backup Custom Resource (CR)](./backup-restore-by-kubectl-tg.md#delete-backup-custom-resource-cr)
+- [Listing Backups](./backup-restore-by-kubectl-tg.md#listing-backups)
+- [Removing Backups](./backup-restore-by-kubectl-tg.md#removing-backups)
+
+### Restore data
+If you have created backups of your cluster in local storage or an S3 bucket, you can restore the cluster from a specific backup. Refer to:
+* [Restoring within the Same Cluster](./backup-restore-by-kubectl-tg.md#restoring-within-the-same-cluster)
+
+If you have created backups of your cluster in an S3 bucket, you can restore them in another cluster, which we call a cross-cluster restore. Refer to:
+* [Cross-Cluster Restore from Backup](./backup-restore-by-kubectl-tg.md#cross-cluster-restore-from-backup)
+
+If you want to clone your cluster, you can use a cross-cluster restore to achieve this. Refer to:
+* [Clone Cluster from Backup](./backup-restore-by-kubectl-tg.md#clone-cluster-from-backup)
+
+
+Troubleshoot
+---
+If you encounter any errors during the backup & restore process, please refer to [How to debug Backup & Restore](./troubleshoot.md) for troubleshooting guidance.
\ No newline at end of file
diff --git a/k8s/docs/04-manage/backup-and-restore/backup-restore-by-cr.md b/k8s/docs/04-manage/backup-and-restore/backup-restore-by-cr.md
new file mode 100644
index 00000000..6adaaec5
--- /dev/null
+++ b/k8s/docs/04-manage/backup-and-restore/backup-restore-by-cr.md
@@ -0,0 +1,531 @@
+Backup & Restore cluster by CR
+
+- [Creating an S3 Secret for Backup and Restore](#creating-an-s3-secret-for-backup-and-restore)
+- [TigerGraphBackup](#tigergraphbackup)
+ - [Backup to local storage](#backup-to-local-storage)
+ - [Backup to S3 bucket](#backup-to-s3-bucket)
+- [TigerGraphBackupSchedule](#tigergraphbackupschedule)
+ - [Schedule backup to local storage](#schedule-backup-to-local-storage)
+ - [Schedule backup to S3 bucket](#schedule-backup-to-s3-bucket)
+- [TigerGraphRestore](#tigergraphrestore)
+ - [Restore from local backup](#restore-from-local-backup)
+ - [Restore from backup in S3 bucket](#restore-from-backup-in-s3-bucket)
+ - [Cross-cluster restore in existing cluster](#cross-cluster-restore-in-existing-cluster)
+ - [Cluster version \>=3.9.2](#cluster-version-392)
+ - [Clone a cluster(Create a new cluster and do cross-cluster restore)](#clone-a-clustercreate-a-new-cluster-and-do-cross-cluster-restore)
+ - [Cluster version \>=3.9.2](#cluster-version-392-1)
+
+
+
+
+
+
+
+Creating an S3 Secret for Backup and Restore
+====
+When working with backup and restore operations involving S3 buckets, you need to create a Kubernetes Secret to securely store your AWS access credentials. Here's how you can create an S3 Secret:
+
+1. **Encode AWS Access Key ID and Secret Access Key**:
+
+ Before creating the Kubernetes Secret, you need to encode your AWS access key ID and secret access key in base64 format. You can use the following commands to do that:
+
+ ```bash
+ # Replace YOUR_ACCESS_KEY_ID with your actual AWS access key ID
+ echo -n "YOUR_ACCESS_KEY_ID" | base64
+
+ # Replace YOUR_SECRET_ACCESS_KEY with your actual AWS secret access key
+ echo -n "YOUR_SECRET_ACCESS_KEY" | base64
+ ```
+
+ Note down the base64 encoded strings generated for the access key ID and secret access key.
+
+2. **Create S3 Secret YAML**:
+
+ Create a YAML file (e.g., `s3-secret.yaml`) with the following content. Replace `YOUR_BASE64_ENCODED_ACCESS_KEY_ID` and `YOUR_BASE64_ENCODED_SECRET_ACCESS_KEY` with the actual base64 encoded values from step 1:
+
+ ```yaml
+ apiVersion: v1
+ kind: Secret
+ metadata:
+ name: s3-secret
+ type: Opaque
+ data:
+ accessKeyID: YOUR_BASE64_ENCODED_ACCESS_KEY_ID
+ secretAccessKey: YOUR_BASE64_ENCODED_SECRET_ACCESS_KEY
+ ```
+
+3. **Apply the Secret**:
+
+ Use the `kubectl apply` command within the same namespace as the cluster you intend to backup. This ensures that the secret is accessible to the backup and restore processes within that specific namespace.
+
+ ```bash
+ kubectl apply -f s3-secret.yaml -n YOUR_NAMESPACE
+ ```
+
+ This will create the Kubernetes Secret named `s3-secret` containing your AWS access credentials.
+
+By creating an S3 Secret in this manner, you ensure that your AWS access credentials are securely stored and can be easily referenced when needed for backup and restore tasks involving S3 buckets.
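+
+Alternatively, a `kubectl create secret generic` command produces an equivalent Secret without manual base64 encoding (kubectl encodes the values for you); the key names must still be `accessKeyID` and `secretAccessKey`:
+
+```bash
+kubectl create secret generic s3-secret \
+  --from-literal=accessKeyID=YOUR_ACCESS_KEY_ID \
+  --from-literal=secretAccessKey='YOUR_SECRET_ACCESS_KEY' \
+  -n YOUR_NAMESPACE
+```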
+
+TigerGraphBackup
+================
+> [!NOTE]
+> There are many examples covering different backup and restore scenarios. Some fields in the YAML-format CRs are optional and are marked with `# optional` above them. All fields without the optional mark are required.
+
+For optimal organization, we recommend using the naming convention `${CLUSTER-NAME}-backup-${TAG}` for your backup CR.
+
+Backup to local storage
+-----------------------
+Here's a YAML file for performing a backup to local storage. Save this content to a file (e.g., `backup-local.yaml`), and then run `kubectl apply -f backup-local.yaml -n YOUR_NAMESPACE` to create the backup.
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphBackup
+metadata:
+ name: test-cluster-backup-local
+spec:
+ # Specify which cluster to backup in the SAME NAMESPACE as the backup job
+ clusterName: test-cluster
+ # Specify where to store the backup data
+ destination:
+ storage: local
+ # Use this field if type is local
+ local:
+ path: /home/tigergraph/tigergraph/data/backup
+
+ # Configure the name of backup files and the path storing temporary files
+ backupConfig:
+ tag: local
+ # Optional: Set the path for temporary staging files
+ stagingPath: /home/tigergraph/tigergraph/data
+ # Optional: If 'incremental' is set to true, incremental backup will be performed
+ incremental: false
+ # Optional: Set the timeout value for the backup process (default is 18000 seconds)
+ timeout: 18000
+ # Optional: Specify the number of processes to use for compression (0 uses the number of CPU cores)
+ compressProcessNumber: 0
+ # Optional: (Requires operator version >= 0.0.9 and TigerGraph version >= 3.9.3)
+ # Choose the compression level for the backup: DefaultCompression/BestSpeed/BestCompression
+ compressLevel: DefaultCompression # Choose from DefaultCompression/BestSpeed/BestCompression
+```
+
+Backup to S3 bucket
+-------------------
+
+Here's a YAML file for performing a backup to an S3 bucket, using the previously created Secret named `s3-secret`. Save this content to a file (e.g., `backup-s3.yaml`), and then run `kubectl apply -f backup-s3.yaml -n YOUR_NAMESPACE` to create the backup.
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphBackup
+metadata:
+ name: test-cluster-backup-s3
+spec:
+ clusterName: test-cluster
+ destination:
+ storage: s3Bucket
+ s3Bucket:
+ # Specify the name of the S3 bucket you want to use
+ bucketName: operator-backup
+ # Specify the Secret containing the S3 access key and secret access key
+ secretKey:
+ name: s3-secret
+
+ # Configure the name of backup files and the path storing temporary files
+ backupConfig:
+ tag: s3
+ # Optional: Set the path for temporary staging files
+ stagingPath: /home/tigergraph/tigergraph/data
+ # Optional: If 'incremental' is set to true, incremental backup will be performed
+ incremental: false
+ # Optional: Set the timeout value for the backup process (default is 18000 seconds)
+ timeout: 18000
+ # Optional: Specify the number of processes to use for compression (0 uses the number of CPU cores)
+ compressProcessNumber: 0
+ # Optional: (Requires operator version >= 0.0.9 and TigerGraph version >= 3.9.3)
+ # Choose the compression level for the backup: DefaultCompression/BestSpeed/BestCompression
+ compressLevel: DefaultCompression # Choose from DefaultCompression/BestSpeed/BestCompression
+```
+
+TigerGraphBackupSchedule
+========================
+
+The field `.spec.schedule` uses the cron schedule expression. You can refer to [https://crontab.guru/](https://crontab.guru/).
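+
+For example, a few illustrative schedule values (standard five-field cron syntax):
+
+```yaml
+# every day at 00:00
+schedule: "0 0 * * *"
+# every hour, at minute 0
+# schedule: "0 * * * *"
+# every 30 minutes
+# schedule: "*/30 * * * *"
+```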
+
+The field `.spec.backupTemplate` is the same as the `.spec` of TigerGraphBackup.
+
+
+
+Schedule backup to local storage
+--------------------------------
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphBackupSchedule
+metadata:
+ name: test-cluster-schedule-daily
+spec:
+  # Cron job schedule
+ schedule: "0 0 * * *"
+ # Strategies for managing backups
+ # We will delete oldest backups according to the strategies automatically
+ strategy:
+ # We will only retain 20 backups
+ maxBackupFiles: 20
+ # A backup can only exist for 3 days
+ maxReservedDays: 3
+ maxRetry: 10
+  # optional: if pause is true, the cron job will be suspended
+ pause: false
+ backupTemplate:
+ # Specify which cluster to backup in the SAME NAMESPACE as the backup job
+ clusterName: test-cluster
+ # Specify where to store the backup data
+ destination:
+ storage: local
+ # Use this field if type is local
+ local:
+ path: /home/tigergraph/tigergraph/data/backup
+
+ # Configure the name of backup files and the path storing temporary files
+ backupConfig:
+ tag: daily
+ # optional
+ stagingPath: /home/tigergraph/tigergraph/data
+      # optional: if incremental is true, an incremental backup will be performed
+ incremental: false
+ # optional
+ timeout: 18000
+      # optional: specify the number of processes used for compression
+ compressProcessNumber: 0
+ # optional: (operator>=0.0.9 and tg>=3.9.3) specify the compress level for backup
+ compressLevel: DefaultCompression #choose from DefaultCompression/BestSpeed/BestCompression
+```
+
+Schedule backup to S3 bucket
+----------------------------
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphBackupSchedule
+metadata:
+ name: test-cluster-schedule-daily
+spec:
+  # Cron job schedule
+ schedule: "0 0 * * *"
+ # Strategies for managing backups
+ # We will delete oldest backups according to the strategies automatically
+ strategy:
+ # We will only retain 20 backups
+ maxBackupFiles: 20
+ # A backup can only exist for 3 days
+ maxReservedDays: 3
+ maxRetry: 10
+  # optional: if pause is true, the cron job will be suspended
+ pause: false
+ backupTemplate:
+ clusterName: test-cluster
+ destination:
+ storage: s3Bucket
+ s3Bucket:
+ # specify the bucket you want to use
+ bucketName: operator-backup
+ secretKey:
+ name: s3-secret
+ # Configure the name of backup files and the path storing temporary files
+ backupConfig:
+ tag: s3-daily
+ # optional
+ stagingPath: /home/tigergraph/tigergraph/data/backup-staging
+      # optional: if incremental is true, an incremental backup will be performed
+ incremental: false
+ # optional
+ timeout: 18000
+      # optional: specify the number of processes used for compression
+ compressProcessNumber: 0
+ # optional: (operator>=0.0.9 and tg>=3.9.3) specify the compress level for backup
+ compressLevel: DefaultCompression #choose from DefaultCompression/BestSpeed/BestCompression
+```
+
+TigerGraphRestore
+=================
+
+Restore from local backup
+-------------------------
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphRestore
+metadata:
+ name: restore-from-local
+spec:
+ restoreConfig:
+ # We can use tag to restore from backup in the same cluster
+ tag: daily-2021-11-04T120000
+ # optional
+ stagingPath: /home/tigergraph/tigergraph/data/restore-staging
+ # optional: (operator>=0.0.9 and tg>=3.9.3) should be >=0
+ decompressProcessNumber: 2
+ source:
+ storage: local
+ local:
+ path: /home/tigergraph/tigergraph/data/backup
+ # Specify the name of cluster
+ clusterName: test-cluster
+```
+
+Restore from backup in S3 bucket
+--------------------------------
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphRestore
+metadata:
+ name: restore-from-s3
+spec:
+ restoreConfig:
+ tag: daily-2021-11-04T120000
+ # optional
+ stagingPath: /home/tigergraph/tigergraph/data/restore-staging
+ # optional: (operator>=0.0.9 and tg>=3.9.3) should be >=0
+ decompressProcessNumber: 2
+ source:
+ storage: s3Bucket
+ s3Bucket:
+ # specify the bucket you want to use
+ bucketName: operator-backup
+ secretKey:
+ name: s3-secret
+ # Specify the name of cluster
+ clusterName: test-cluster
+```
+
+Cross-cluster restore in existing cluster
+-----------------------------------------
+
+We recommend using the `kubectl tg restore` command for this (see [Cross-Cluster Restore from Backup](./backup-restore-by-kubectl-tg.md#cross-cluster-restore-from-backup)), since it is complicated to obtain the backup metadata and embed it in the CR.
+
+If you still want to use a CR, run `kubectl tg backup list --meta` to get the metadata and put it in the field `.spec.restoreConfig.meta`:
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphRestore
+metadata:
+ name: tigergraphrestore-sample
+spec:
+ clusterName: test-cluster-new
+ source:
+ storage: s3Bucket
+ s3Bucket:
+ bucketName: operator-backup
+ secretKey:
+ name: s3-secret
+ restoreConfig:
+ meta: |
+ {
+ "Tag": "daily-2022-10-13T022218",
+ "Files": [
+ {
+ "Instance": {
+ "ServiceName": "GSE",
+ "Replica": 0,
+ "Partition": 1
+ },
+ "Name": "GSE_1_1.tar.lz4",
+ "Checksum": "ecbddb2312346506",
+ "RawSize": 946248,
+ "Size": 5287,
+ "TargetPaths": [
+ ""
+ ]
+ },
+ {
+ "Instance": {
+ "ServiceName": "GPE",
+ "Replica": 0,
+ "Partition": 1
+ },
+ "Name": "GPE_1_1.tar.lz4",
+ "Checksum": "282f94df17d3ea35",
+ "RawSize": 13286,
+ "Size": 751,
+ "TargetPaths": [
+ ""
+ ]
+ },
+ {
+ "Instance": {
+ "ServiceName": "GSQL",
+ "Replica": 0,
+ "Partition": 0
+ },
+ "Name": "GSQL.tar.lz4",
+ "Checksum": "97dbfd62825bfd3f",
+ "RawSize": 4522912,
+ "Size": 1687264,
+ "TargetPaths": null
+          }
+        ],
+        "Time": "2022-10-13 02:22:19"
+ }
+ stagingPath: /home/tigergraph/data
+```
+### Cluster version >=3.9.2
+If your TigerGraph cluster version is >= 3.9.2, the CR can be simplified: you don't need to put the metadata into it, you only need to specify the tag:
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphRestore
+metadata:
+ name: restore-from-s3
+spec:
+ restoreConfig:
+ tag: daily-2022-10-13T022218
+ # optional
+ stagingPath: /home/tigergraph/tigergraph/data/restore-staging
+ # optional: (operator>=0.0.9 and tg>=3.9.3) should be >=0
+ decompressProcessNumber: 2
+ source:
+ storage: s3Bucket
+ s3Bucket:
+ # specify the bucket you want to use
+ bucketName: operator-backup
+ secretKey:
+ name: s3-secret
+ # Specify the name of cluster
+ clusterName: test-cluster-new
+```
+
+Clone a cluster(Create a new cluster and do cross-cluster restore)
+-------------------------------------------------
+
+We recommend using the `kubectl tg restore` command for this (see [Clone Cluster from Backup](./backup-restore-by-kubectl-tg.md#clone-cluster-from-backup)), since it is complicated to obtain the backup metadata and the clusterTemplate of the original cluster and embed them in the CR.
+
+If you still want to use a CR, run `kubectl tg backup list --cluster-name source-cluster -n tigergraph --meta` to get the metadata and put it in the field `.spec.restoreConfig.meta`. Then use `kubectl tg export --cluster-name source-cluster -n tigergraph` to get the cluster template of the original cluster, and put it in the field `.spec.clusterTemplate`:
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphRestore
+metadata:
+ name: tigergraphrestore-sample
+spec:
+ clusterName: test-cluster-new
+ source:
+ storage: s3Bucket
+ s3Bucket:
+ bucketName: operator-backup
+ secretKey:
+ name: s3-secret
+ restoreConfig:
+ meta: |
+ {
+ "Tag": "daily-2022-10-13T022218",
+ "Files": [
+ {
+ "Instance": {
+ "ServiceName": "GSE",
+ "Replica": 0,
+ "Partition": 1
+ },
+ "Name": "GSE_1_1.tar.lz4",
+ "Checksum": "ecbddb2312346506",
+ "RawSize": 946248,
+ "Size": 5287,
+ "TargetPaths": [
+ ""
+ ]
+ },
+ {
+ "Instance": {
+ "ServiceName": "GPE",
+ "Replica": 0,
+ "Partition": 1
+ },
+ "Name": "GPE_1_1.tar.lz4",
+ "Checksum": "282f94df17d3ea35",
+ "RawSize": 13286,
+ "Size": 751,
+ "TargetPaths": [
+ ""
+ ]
+ },
+ {
+ "Instance": {
+ "ServiceName": "GSQL",
+ "Replica": 0,
+ "Partition": 0
+ },
+ "Name": "GSQL.tar.lz4",
+ "Checksum": "97dbfd62825bfd3f",
+ "RawSize": 4522912,
+ "Size": 1687264,
+ "TargetPaths": null
+          }
+        ],
+        "Time": "2022-10-13 02:22:19"
+ }
+ stagingPath: /home/tigergraph/data
+ clusterTemplate:
+ replicas: 3
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.3
+ imagePullPolicy: IfNotPresent
+ listener:
+ type: LoadBalancer
+ resources:
+ requests:
+ cpu: 2
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ storageClassName: standard
+ resources:
+ requests:
+ storage: 10G
+ initTGConfig:
+ ha: 1
+ license: "YOUR_LICENSE"
+ version: 3.9.3
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.9
+ imagePullPolicy: IfNotPresent
+```
+
+### Cluster version >=3.9.2
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphRestore
+metadata:
+ name: tigergraphrestore-sample
+spec:
+ clusterName: test-cluster-new
+ source:
+ storage: s3Bucket
+ s3Bucket:
+ bucketName: operator-backup
+ secretKey:
+ name: s3-secret
+ restoreConfig:
+ tag: daily-2022-10-13T022218
+ stagingPath: /home/tigergraph/data
+ clusterTemplate:
+ replicas: 3
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.3
+ imagePullPolicy: IfNotPresent
+ listener:
+ type: LoadBalancer
+ resources:
+ requests:
+ cpu: 2
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ storageClassName: standard
+ resources:
+ requests:
+ storage: 10G
+ initTGConfig:
+ ha: 1
+ license: "YOUR_LICENSE"
+ version: 3.9.3
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.9
+ imagePullPolicy: IfNotPresent
+```
+
diff --git a/k8s/docs/04-manage/backup-and-restore/backup-restore-by-kubectl-tg.md b/k8s/docs/04-manage/backup-and-restore/backup-restore-by-kubectl-tg.md
new file mode 100644
index 00000000..8a8ab1cd
--- /dev/null
+++ b/k8s/docs/04-manage/backup-and-restore/backup-restore-by-kubectl-tg.md
@@ -0,0 +1,916 @@
+Backup & Restore cluster by kubectl-tg plugin
+
+If you have experience with Custom Resources in Kubernetes (K8s), you can use CRs to initiate backup or restore processes. We provide a dedicated document detailing the steps for performing backup and restore using Custom Resources (CRs): [Backup & restore by CR](backup-restore-by-cr.md).
+
+- [Prerequisite](#prerequisite)
+- [Utilizing `kubectl tg` Command for Backup](#utilizing-kubectl-tg-command-for-backup)
+ - [Creating and Updating Backups](#creating-and-updating-backups)
+ - [Backup to Local Storage](#backup-to-local-storage)
+ - [Backup to an S3 Bucket](#backup-to-an-s3-bucket)
+ - [\[Preview\] Performing Incremental Backup](#preview-performing-incremental-backup)
+ - [Updating Backup Custom Resources](#updating-backup-custom-resources)
+ - [Changing Backup Types](#changing-backup-types)
+ - [Creating Another Backup](#creating-another-backup)
+ - [Listing Backup Custom Resources](#listing-backup-custom-resources)
+ - [Displaying Backup Process Status](#displaying-backup-process-status)
+ - [Delete Backup Custom Resource (CR)](#delete-backup-custom-resource-cr)
+ - [Listing Backups](#listing-backups)
+ - [Removing Backups](#removing-backups)
+- [Creating and Managing Backup Schedules](#creating-and-managing-backup-schedules)
+ - [Specifying Backup Schedule](#specifying-backup-schedule)
+ - [Creating Backup Schedules](#creating-backup-schedules)
+ - [Creating a Local Backup Schedule](#creating-a-local-backup-schedule)
+ - [Creating an S3 Backup Schedule](#creating-an-s3-backup-schedule)
+ - [Updating a Backup Schedule](#updating-a-backup-schedule)
+ - [Listing All Backup Schedules](#listing-all-backup-schedules)
+ - [Deleting a Backup Schedule](#deleting-a-backup-schedule)
+ - [Showing Backup Schedule Status](#showing-backup-schedule-status)
+ - [Pausing and Resuming a Backup Schedule](#pausing-and-resuming-a-backup-schedule)
+ - [Backup Strategy Overview](#backup-strategy-overview)
+- [Utilizing `kubectl tg` for Restore](#utilizing-kubectl-tg-for-restore)
+ - [Restore within the Same Cluster](#restore-within-the-same-cluster)
+ - [Cross-Cluster Restore from Backup](#cross-cluster-restore-from-backup)
+ - [Clone Cluster from Backup](#clone-cluster-from-backup)
+ - [Cross-Cluster Restore and Cluster Clone (Cluster Version \< 3.9.2)](#cross-cluster-restore-and-cluster-clone-cluster-version--392)
+ - [Restore an Existing Cluster from Backup Created by Another Cluster (Cluster version \< 3.9.2)](#restore-an-existing-cluster-from-backup-created-by-another-cluster-cluster-version--392)
+ - [Clone a Cluster (Cluster version \< 3.9.2)](#clone-a-cluster-cluster-version--392)
+ - [Show Status of Restore](#show-status-of-restore)
+ - [Delete Restore Job](#delete-restore-job)
+
+
+Prerequisite
+============
+
+The successful execution of the `kubectl tg backup|restore|backup-schedule` command relies on the presence of several dependencies: `kubectl`, `helm`, `jq`, and `yq`. It is imperative to ensure that all these components are properly installed on your system.
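+
+As a quick sanity check (not part of the plugin itself), you can verify that the dependencies are on your PATH with a short shell loop:
+
+```bash
+for tool in kubectl helm jq yq; do
+  command -v "$tool" >/dev/null || echo "missing dependency: $tool"
+done
+```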
+
+Furthermore, prior to using the backup command, it is essential to have the TigerGraph Kubectl Plugin installed (please refer to [Install kubectl-tg plugin](../../02-get-started/get_started.md#install-kubectl-tg-plugin)). Additionally, you must create your cluster as a prerequisite step.
+
+Utilizing `kubectl tg` Command for Backup
+==========================================
+
+To maintain coherence between the `kubectl-tg` command and custom resources presented in YAML format, the `--name` option is employed to specify the name of the custom resources to be created or managed.
+
+Creating and Updating Backups
+------------------------------
+
+```
+Usage:
+ kubectl tg backup [create|update] [OPTIONS]
+
+Options:
+ -h|--help : Display this message.
+ -n|--namespace : Define the namespace for TG cluster deployment. If not set, the
+ default namespace from the context will be used.
+ --name : (required) Specify the name of the backup.
+ -c|--cluster-name : Define the cluster name for TG cluster deployment. No default value.
+ --tag : Specify the tag for backup files. For example, if you specify
+ --tag daily, the backup file will be named daily-20xx-xx-xxTxxxxxx.
+ --staging-path : Specify the location to store temporary files.
+ --timeout : Set the backup timeout in seconds. Default: 18000.
+ --compress-process-number : Determine the number of concurrent processes used for compression
+ during backup. A value of 0 indicates that the number of compression processes will match the number of CPU cores on the nodes. The default value is 0.
+ --compress-level : Choose from options: BestSpeed, DefaultCompression, and
+ BestCompression. Only supported for TG clusters >=3.9.3.
+ --incremental : Perform incremental backup.
+ --full : Perform a full backup (full backup is the default behavior).
+ --destination : Specify the destination for storing backup files. Currently
+ supports local and S3 storage.
+ -y : Provide a positive response to all questions.
+
+ Configuration details for different destinations:
+ If the destination is local, you should provide:
+ --local-path : Specify the local path where backup files will be stored.
+ If the destination is S3:
+ --s3-bucket : Specify the name of the S3 Bucket.
+ --aws-secret : Provide the name of the AWS secret.
+ The secret should contain accessKeyID and secretAccessKey.
+```
+
+### Backup to Local Storage
+
+Use the following command to back up the cluster named `test-cluster` and store the backup files in local storage:
+
+```
+ kubectl tg backup create --name backup-to-local \
+ --cluster-name test-cluster --tag testlocal -n tigergraph \
+ --destination local --local-path /home/tigergraph/tigergraph/data/mybackup
+```
+
+You can also customize the timeout, staging path, compression level, and number of compression processes:
+
+```
+ kubectl tg backup create --name backup-to-local --cluster-name test-cluster \
+ --tag testlocal -n tigergraph --destination local \
+ --local-path /home/tigergraph/tigergraph/data/mybackup --staging-path /home/tigergraph/temp \
+ --timeout 18000 --compress-process-number 0 --compress-level BestSpeed
+```
+> [!NOTE]
+> 1. Use a subpath of `/home/tigergraph/tigergraph/data/` as the local path for backups, since this path is mounted on a PV. For example, you can use `/home/tigergraph/tigergraph/data/mybackup`. If you do not, you will lose your backup data when the pod restarts. Also, do not use the same path for the local path and the staging path. If you do not configure a staging path, the default staging path is `/home/tigergraph/tigergraph/data/backup`; if you set the local path to `/home/tigergraph/tigergraph/data/backup`, the backup will fail.
+> 2. Please remember which path you use, and use the same path if you want to restore the backup files you create.
+
+
+### Backup to an S3 Bucket
+
+Follow the steps below to back up a cluster named "test-cluster" and store the backup files in an S3 bucket. Make sure you provide the S3 bucket name, access key ID, and secret key for S3.
+
+1. First, create a Kubernetes secret containing the access key ID and secret key:
+
+ ```bash
+ kubectl create secret generic aws-secret \
+ --from-literal=accessKeyID=AWSACCESSKEY \
+ --from-literal=secretAccessKey='AWSSECRETKEY'
+ ```
+
+2. Next, create a backup to the S3 bucket:
+
+ ```bash
+ kubectl tg backup create --name backup-to-s3 -n tigergraph \
+ --cluster-name test-cluster --destination s3Bucket --tag testS3 \
+ --s3-bucket tgbackup \
+ --aws-secret aws-secret
+ ```
+
+You can also customize the following parameters: timeout, staging path, and the number of compression processes:
+
+```bash
+kubectl tg backup create --name backup-to-s3 -n tigergraph \
+ --cluster-name test-cluster --tag testS3 --destination s3Bucket \
+ --s3-bucket tgbackup \
+ --aws-secret aws-secret \
+ --staging-path /home/tigergraph/temp \
+ --timeout 18000 --compress-process-number 0 --compress-level BestSpeed
+```
+
+> [!NOTE]
+> Ensure that you have created the necessary Kubernetes secret containing the access key ID and secret key before initiating the backup process to the S3 bucket.
+
+### [Preview] Performing Incremental Backup
+> [!NOTE]
+> For TigerGraph version 3.9, performing an incremental backup requires the existence of at least one previous backup for the cluster. Without a prior full backup, attempting an incremental backup will result in failure. To verify the presence of a full backup, you can utilize the command `kubectl tg backup list`.
+
+To initiate an incremental backup, incorporate the `--incremental` option into the following command:
+
+```bash
+kubectl tg backup create --cluster-name test-cluster -n tigergraph --name incremental-backup \
+ --incremental --tag testlocal \
+ --destination local \
+ --local-path /home/tigergraph/tigergraph/data/mybackup
+```
+
+### Updating Backup Custom Resources
+
+If you have previously created a backup using the `kubectl tg backup create` command, you can modify the backup configuration by employing the `kubectl tg backup update` command. Once the `update` command is executed, the backup process will be triggered immediately with the updated settings.
+
+Suppose you've already generated a backup using the following command:
+
+```bash
+kubectl tg backup create --name backup-to-local \
+ --cluster-name test-cluster --tag testlocal -n tigergraph \
+ --destination local \
+ --local-path /home/tigergraph/backup --staging-path /home/tigergraph/temp \
+ --timeout 18000 --compress-process-number 0
+```
+
+To adjust the backup timeout, you can execute:
+
+```bash
+kubectl tg backup update --name backup-to-local -n tigergraph \
+--timeout 20000
+```
+
+Subsequently, the timeout value will be updated to 20000, and a backup process with the revised timeout setting will be immediately initiated.
+
+#### Changing Backup Types
+
+You have the flexibility to switch between full and incremental backups using the following commands:
+
+- To convert a full backup configuration to an incremental backup, use:
+
+ ```bash
+ kubectl tg backup update --name backup-to-local --incremental
+ ```
+
+- To transform an incremental backup configuration to a full backup, use:
+
+ ```bash
+ kubectl tg backup update --name incremental-backup --full
+ ```
+
+These commands allow you to seamlessly modify the backup type based on your evolving requirements.
+
+
+
+### Creating Another Backup
+
+If you have previously initiated a backup using the `kubectl tg backup create` command:
+
+```bash
+kubectl tg backup create --name backup-to-local \
+ --cluster-name test-cluster --tag testlocal -n tigergraph \
+ --destination local \
+ --local-path /home/tigergraph/backup
+```
+
+And you wish to create a new backup with the same configuration, you can execute:
+
+```bash
+kubectl tg backup update --name backup-to-local -n tigergraph
+```
+
+The system will prompt you to confirm whether you want to initiate the backup again. You should type "y" to proceed.
+
+Alternatively, you can employ the `-y` option, indicating "yes to all questions," to immediately start the backup:
+
+```bash
+kubectl tg backup update --name backup-to-local -n tigergraph -y
+```
+
+Listing Backup Custom Resources
+----
+To retrieve a list of all backup Custom Resources (CRs) within a specific namespace, utilize the following command:
+
+```bash
+kubectl get tgbackup --namespace tigergraph
+```
+
+This command will provide you with an overview of the backup CRs present in the designated namespace.
+
+
+Displaying Backup Process Status
+----
+Upon executing `kubectl tg backup create/update`, a backup job will be generated within the Kubernetes (k8s) environment. To facilitate monitoring, we offer the `kubectl tg backup status` command, allowing you to assess the status of the backup process. Should you encounter errors or warnings, refer to the [How to Debug Backup & Restore](#how-to-debug-backup--restore) section for troubleshooting guidance.
+
+To display the status of all backup processes within the `tigergraph` namespace, use the following command:
+
+```bash
+kubectl tg backup status --namespace tigergraph
+```
+
+The output will resemble the following:
+
+```
+NAME CLUSTER TAG STORAGE INCREMENTAL STARTTIME COMPLETIONTIME
+test-cluster-backup-daily test-cluster daily local 3d12h
+test-cluster-backup-local test-cluster local local 16s 5s
+```
+
+If the `COMPLETIONTIME` field is not empty, it indicates a successful backup process.
+
+For detailed information about a specific backup process, execute:
+
+```bash
+kubectl tg backup status --name test-cluster-backup-daily \
+ --namespace tigergraph
+```
+
+The output provides a comprehensive overview of the backup process, including configurations and status details. You'll find events that indicate the progress and outcome of the backup job.
+
+An example of the output:
+
+```
+kubectl tg backup status --cluster-name test-cluster --tag daily
+Name: test-cluster-backup-daily
+Namespace: default
+Labels:
+Annotations:
+API Version: graphdb.tigergraph.com/v1alpha1
+Kind: TigerGraphBackup
+Metadata:
+ Creation Timestamp: 2022-12-13T09:52:38Z
+ Generation: 1
+ ...
+ Resource Version: 905382
+ UID: 6c97ae4a-e7fb-49e1-8c45-e8e09286865b
+Spec:
+ Backup Config:
+ Compress Process Number: 0
+ Tag: daily
+ Timeout: 18000
+ Cluster Name: test-cluster
+ Destination:
+ Local:
+ Path: /home/tigergraph/backup
+ Storage: local
+Status:
+ Conditions:
+ Last Transition Time: 2022-12-16T13:44:24Z
+ Message: Failed to backup cluster
+ Reason: BackupFailed
+ Status: True
+ Type: Failed
+ Start Time: 2022-12-16T13:44:03Z
+ Target Ready: true
+Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Normal Target cluster ready 31m (x35 over 3d12h) TigerGraphBackup Target cluster is ready for backup
+ Warning Backup job failed 31m (x12 over 3d12h) TigerGraphBackup Failed to backup cluster test-cluster
+```
+
+You can identify the occurrence of events marked as "Backup job failed," which indicates that the respective backup task has encountered a failure.
+
+Delete Backup Custom Resource (CR)
+-------------------------------------
+
+To remove a backup Custom Resource (CR), execute the following command:
+
+```
+kubectl tg backup delete --name backup-to-local --namespace tigergraph
+```
+
+Listing Backups
+---------------
+
+To list available backups, utilize the command:
+
+```
+Usage:
+ kubectl tg backup list [OPTIONS]
+
+Options:
+ --cluster-name : (required) Set the name of the target cluster.
+ -n, --namespace : Set the namespace of the target cluster.
+ --tag : Specify the tag of the backup.
+ --json : Output in JSON format.
+ --meta : Retrieve the metadata of the backup.
+```
+
+
+To examine the existing backups for a particular cluster, use the following command to list all backups associated with `test-cluster`:
+
+```
+kubectl tg backup list --cluster-name test-cluster -n tigergraph
+```
+
+If you prefer to obtain the backup list in JSON format, use:
+
+```
+kubectl tg backup list --cluster-name test-cluster -n tigergraph --json
+```
+
+
+In the context of a cross-cluster restore, acquiring backup metadata is essential. To accomplish this, utilize the tag obtained from the `kubectl tg backup list` command. Run the following command:
+
+```
+kubectl tg backup list --cluster-name test-cluster -n tigergraph \
+ --tag tests3-2022-10-31T031005 --meta
+```
+
+This command will display the metadata in the standard output. If you wish to store this metadata in a file, execute:
+
+```
+kubectl tg backup list --cluster-name test-cluster -n tigergraph --tag tests3 --meta > metadata
+```
+
+
+
+Removing Backups
+------------------
+
+To eliminate backups that are no longer needed, follow these steps:
+
+Use the following command to remove specific backups associated with the "test-cluster" and located in the "tigergraph" namespace:
+
+```bash
+kubectl tg backup remove --cluster-name test-cluster --namespace tigergraph \
+ --tag daily-20xx-xx-xxTxxxxx
+```
+
+This command enables you to selectively remove backups based on their tags. Please ensure you accurately specify the relevant cluster name, namespace, and backup tag when executing this command.
+
+
+Creating and Managing Backup Schedules
+====
+The `kubectl tg backup-schedule` command enables you to create, update, monitor, list, delete, pause, and resume backup schedules for specific clusters. This comprehensive set of options empowers you to effortlessly manage your backup scheduling requirements.
+
+```
+Usage:
+ kubectl tg backup-schedule [create|update|status|list|delete|pause|resume] [OPTIONS]
+
+Commands:
+ create Create a backup schedule to schedule backup for specific cluster
+ update Update a backup schedule
+ status Show status of backup schedule
+ list List existing backup schedules
+ delete Delete a backup schedule (backups created by the schedule won't be deleted)
+ pause Pause the backup schedule
+ resume Resume the backup schedule
+
+Options:
+ -h|--help: show this message
+ -n|--namespace : set namespace to deploy TG cluster, if not set, use the default namespace in context
+ --name : (required)specify name of backup schedule
+ -c|--cluster-name : set cluster-name to deploy TG cluster, no default
+ --tag : specify the tag of backup files. e.g. if you specify --tag daily, the backup file will be daily-20xx-xx-xxTxxxxxx
+ --staging-path : specify where to store the temporary files
+ --timeout : the backup timeout in seconds,default: 18000
+ --compress-process-number : the number of concurrent process for compression during backup
+ value 0 means the number of processes used to compress equals
+ the number of the node's CPU cores. And the default value is 0
+ --compress-level : choose from BestSpeed,DefaultCompression and BestCompression. Only support TG cluster >=3.9.3
+ --schedule : specify the schedule of backup in cron format. e.g. '* * * * *' is backup every minute
+ --destination : set the destination to store backup files, support local and s3 now
+ --incremental : do incremental backup
+ --full : do full backup (full backup is performed by default)
+ --max-retry : set max times of retry for each backup
+ --max-backup-file : set the max number of files you want to retain
+ --max-reserved-day : set the max number of days you want to retain these backups
+ -y : yes to all questions
+
+ Followings are about the configuration of different destination:
+ If destination is local,you should provide:
+ --local-path : set the local path where to store backup files
+ If destination is s3:
+ --s3-bucket : S3 Bucket name
+ --aws-secret : name of secret for aws, the secret should contain accessKeyID and secretAccessKey
+```
+
+
+### Specifying Backup Schedule
+
+To define a backup schedule, utilize a cron expression to set the timing. You can conveniently generate cron expressions using tools like [https://crontab.guru/](https://crontab.guru/), which provides an intuitive interface for creating intricate schedules.
+
+For instance, if you desire to execute a backup once daily at 00:00, you would specify the following cron expression:
+
+```bash
+--schedule '0 0 * * *'
+```
+
+Please ensure to enclose the cron expression in single quotation marks (`'`) to prevent unintended filename expansion.
+
+
+### Creating Backup Schedules
+
+#### Creating a Local Backup Schedule
+
+ To create a schedule that performs daily backups for the "test-cluster" at 00:00, storing backup files locally, execute the following command:
+
+ ```bash
+ kubectl tg backup-schedule create --name backupsch-local \
+ --cluster-name test-cluster -n tigergraph \
+ --tag localdaily --schedule '0 0 * * *' \
+ --destination local --local-path /home/tigergraph/backup
+ ```
+#### Creating an S3 Backup Schedule
+
+ For a schedule that conducts hourly backups for the "test-cluster" at minute 0, storing backup files in an S3 bucket, proceed as follows:
+
+ First, create a secret in Kubernetes containing access key id and secret key:
+
+ ```bash
+ kubectl create secret generic aws-secret \
+ --from-literal=accessKeyID=AWSACCESSKEY \
+ --from-literal=secretAccessKey='AWSSECRETKEY'
+ ```
+
+ Next, establish the backup schedule:
+
+ ```bash
+ kubectl tg backup-schedule create --name backupsch-s3 \
+ --cluster-name test-cluster -n tigergraph \
+ --tag s3daily --schedule '0 * * * *' --destination s3Bucket\
+ --s3-bucket tgbackup \
+ --aws-secret aws-secret
+ ```
+
+By executing these commands, you'll set up automatic backup schedules tailored to your requirements.
+
+
+
+
+### Updating a Backup Schedule
+When updating a backup schedule, ensure you provide the correct name.
+
+For instance, to adjust the schedule for daily backups at 12:00, execute the following:
+
+```bash
+kubectl tg backup-schedule update --name backupsch-local \
+ --tag localdaily --schedule '0 12 * * *'
+```
+
+Please note that ongoing backup jobs remain unaffected by configuration changes. The new configuration will take effect during the subsequent schedule.
+
+
+### Listing All Backup Schedules
+
+To view a comprehensive list of all existing backup schedules within a specific namespace, employ the following command:
+
+```bash
+kubectl tg backup-schedule list --namespace tigergraph
+```
+
+### Deleting a Backup Schedule
+
+To remove a backup schedule, execute the following command:
+
+```bash
+kubectl tg backup-schedule delete --name backupsch-local \
+ --namespace tigergraph
+```
+
+### Showing Backup Schedule Status
+
+To retrieve the status of a backup schedule, use the following command:
+
+```bash
+kubectl tg backup-schedule status --name test-cluster-schedule-daily \
+ --namespace tigergraph
+```
+
+The output will provide insights into the status of the specified backup schedule, allowing you to monitor its progress and execution.
+
+
+```
+Name: test-cluster-schedule-daily
+Namespace: default
+Labels:
+Annotations:
+API Version: graphdb.tigergraph.com/v1alpha1
+Kind: TigerGraphBackupSchedule
+Metadata:
+ Creation Timestamp: 2022-12-20T02:40:10Z
+ Generation: 1
+ Resource Version: 1696649
+ UID: f8c95418-bcb3-495b-b5e4-5083789ce11a
+Spec:
+ Backup Template:
+ Backup Config:
+ Compress Process Number: 0
+ Tag: daily
+ Timeout: 18000
+ Cluster Name: test-cluster
+ Destination:
+ Local:
+ Path: /home/tigergraph/backup
+ Storage: local
+ Schedule: * * * * *
+Status:
+ Conditions:
+ Last Transition Time: 2022-12-20T02:42:01Z
+ Message: Backup job is active
+ Reason: BackupActive
+ Status: True
+ Type: Active
+ Job Counter:
+ Successful Jobs: 1
+ Last Schedule Time: 2022-12-20T02:42:00Z
+ Last Successful Time: 2022-12-20T02:41:11Z
+Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Normal Backup schedule created 2m1s TigerGraphBackupSchedule Create a new backup schedule success.
+ Normal Backup job succeed 60s TigerGraphBackupSchedule Last scheduled job succeed
+ Normal Backup job created 10s (x2 over 71s) TigerGraphBackupSchedule Schedule a new backup job
+```
+
+
+Indeed, the events associated with backup schedule executions provide valuable insights into the success or failure of the scheduled jobs. By examining these events, you can ascertain whether the backup schedules were executed as intended and if any issues arose during the process.
+
+
+### Pausing and Resuming a Backup Schedule
+
+You have the ability to temporarily halt a running backup schedule or resume a paused one using the following commands:
+
+To pause a currently active backup schedule:
+
+```bash
+kubectl tg backup-schedule pause --name backupsch-local -n tigergraph
+```
+
+This action will prevent the scheduling of the next backup job.
+
+To resume a paused backup schedule:
+
+```bash
+kubectl tg backup-schedule resume --name backupsch-local -n tigergraph
+```
+
+
+### Backup Strategy Overview
+
+It's important to note that the backup strategy feature is available for cluster versions equal to or greater than 3.9.0. This feature provides enhanced control over backup operations and file retention. Presently, you have three distinct options at your disposal to facilitate a comprehensive backup strategy:
+
+1. **`--max-retry`**: This parameter allows you to specify the maximum number of retry attempts for each backup job. It helps ensure that backup processes have a predefined limit of retries in the event of any unexpected issues.
+
+2. **`--max-backup-file`**: As time progresses, the accumulation of backup files can consume substantial disk space. You can utilize this parameter to determine the maximum number of backup files to retain. For instance, setting `--max-backup-file 10` will retain the latest 10 backup files according to the specified tag.
+
+3. **`--max-reserved-day`**: This parameter governs the maximum number of days that backups are retained. If a backup is created more than the defined number of days ago, it will be automatically deleted, thus optimizing storage management.
+
+For example, consider a backup schedule with the tag `daily`. If you set `--max-backup-file 10`, a cleanup process will run after each scheduled backup, ensuring that only the latest 10 backups with the `daily` tag are retained. Backups with different tags will remain unaffected.
+
+Furthermore, with `--max-reserved-day 7`, backups created more than 7 days ago (and possessing the `daily` tag) will be deleted, aligning with your defined retention strategy.
+
+By leveraging these options, you can meticulously manage your backup jobs and safeguard against excessive disk usage. This proactive approach to backup strategy aids in optimizing storage utilization while preserving the necessary backups for operational needs.
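+
+For instance, a schedule that combines all three options might look like the following sketch, which reuses the local-storage schedule example from earlier; adjust names, paths, and limits to your environment:
+
+```bash
+kubectl tg backup-schedule create --name backupsch-local \
+  --cluster-name test-cluster -n tigergraph \
+  --tag daily --schedule '0 0 * * *' \
+  --destination local --local-path /home/tigergraph/backup \
+  --max-retry 3 --max-backup-file 10 --max-reserved-day 7
+```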
+
+Utilizing `kubectl tg` for Restore
+====
+When you possess backups generated through the backup process or a backup schedule, you can restore your cluster to a previous state. You can initiate a restore from a backup created by the same cluster, and this feature supports both local storage and S3 buckets.
+
+It's important to highlight that we also offer cross-cluster restore, enabling you to restore Cluster B utilizing backups from Cluster A. As of now, this functionality exclusively supports S3 buckets.
+
+A crucial consideration is that the restore process is currently restricted to clusters featuring the same partition configuration as the cluster that originated the backup.
+
+| Scenarios | Is Partition changed? | Is HA changed? | Supported? | Example (x\*y means x partitions and y HA) |
+| ---- | ---- | ---- | ---- | ---- |
+| Clone an identical cluster | N | N | Y | Source cluster: 3\*2, Target cluster: 3\*2 |
+| Restore in a cluster with different partition | Y | N or Y | N | Source cluster: 3\*x, Target cluster: 2\*3 or 2\*2 |
+| Restore in a cluster with different HA | N | Y | Y | Source cluster: 3\*3, Target cluster: 3\*1 |
+
+
+
+```
+USAGE:
+ kubectl tg restore [OPTIONS]
+
+Options:
+ -h|--help: show this message
+ -n|--namespace : set namespace to deploy TG cluster, default namespace is current namespace
+ -c|--cluster-name : set cluster-name to deploy TG cluster, no default
+ --name: specify name of restore
+ --tag : specify the tag of backup files. you can use kubectl tg backup list to get all existing backups
+ --metadata : specify the metadata file of backup. you should provide this if you want a cross-cluster restore
+ --cluster-template : configure the cluster you want to create from exported CR
+ --staging-path : specify where to store the temporary files
+ --source : set the source to get backup files, support local and s3 now
+ The following options configure the different backup sources:
+ If the source is local, you should provide:
+ --local-path : set the local path where the backup files are stored
+ If the source is s3:
+ --s3-bucket : S3 Bucket name
+ --aws-secret : name of secret for aws, the secret should contain accessKeyID and secretAccessKey
+```
+### Restore within the Same Cluster
+
+Suppose you have previously created a backup for `test-cluster` using the `kubectl tg backup create` command. To initiate a restore within the same cluster, first retrieve the tags of all available backups:
+
+ ```bash
+ kubectl tg backup list --cluster-name test-cluster -n tigergraph
+ ```
+
+ The output will provide a list of backups along with their respective tags, types, versions, sizes, and creation timestamps. Choose the backup you intend to restore from based on your requirements.
+
+ For instance:
+
+ ```
+ +------------------------------+------+---------+--------+---------------------+
+ | TAG | TYPE | VERSION | SIZE | CREATED AT |
+ +------------------------------+------+---------+--------+---------------------+
+ | daily-2022-11-02T103601 | FULL | 3.9.0 | 1.7 MB | 2022-11-02 10:36:02 |
+ | daily-2022-11-02T104925 | FULL | 3.9.0 | 1.7 MB | 2022-11-02 10:49:25 |
+ | daily-2022-11-09T081545 | FULL | 3.9.0 | 1.7 MB | 2022-11-09 08:15:46 |
+ | daily-2022-11-09T081546 | FULL | 3.9.0 | 1.7 MB | 2022-11-09 08:15:53 |
+ +------------------------------+------+---------+--------+---------------------+
+ ```
+
+
+**Using a backup in local storage:**
+
+To restore your cluster using a backup stored in local storage, execute the following command:
+
+```bash
+kubectl tg restore --name restore-from-local \
+  --cluster-name test-cluster -n tigergraph --tag daily-2022-11-02T103601 \
+  --source local --local-path /home/tigergraph/backup
+```
+Replace `/home/tigergraph/backup` with the appropriate path to the backup stored in your local storage. This command will initiate the restore process and bring your cluster back to the state captured by the specified backup.
+
+**Using a backup in an S3 bucket:**
+
+First, create a Kubernetes secret containing your AWS access key ID and secret access key:
+
+```bash
+kubectl create secret generic aws-secret \
+ --from-literal=accessKeyID=AWSACCESSKEY \
+ --from-literal=secretAccessKey='AWSSECRETKEY'
+```
+
+Select a backup tag from the available backups and execute the following command to initiate a restore from an S3 bucket:
+```bash
+kubectl tg restore --name restore-from-s3 \
+ --namespace tigergraph --cluster-name test-cluster \
+ --tag tests3-2022-10-31T031005 \
+ --source s3Bucket --s3-bucket tg-backup \
+ --aws-secret aws-secret
+```
+
+Make sure to replace `tests3-2022-10-31T031005` with the desired backup tag and adjust `tg-backup` to your S3 bucket name. This command will trigger the restore process, bringing your cluster back to the chosen backup's state.
+
+
+### Cross-Cluster Restore from Backup
+> [!NOTE]
+> This section pertains to users utilizing TigerGraph cluster version 3.9.2 or higher. If you are operating on an earlier version, please consult the [Restore an Existing Cluster from Backup Created by Another Cluster (Cluster version < 3.9.2)](#restore-an-existing-cluster-from-backup-created-by-another-cluster-cluster-version--392) section for relevant instructions.
+
+Performing a cross-cluster restore, where you restore an existing cluster (target-cluster) using a backup created by another cluster (source-cluster), requires careful steps. Follow the instructions below for a successful cross-cluster restore:
+
+1. **Retrieve the Backup Tag from the Source Cluster:**
+
+ Obtain the backup tag from the source cluster (source-cluster) using the following command:
+
+ ```bash
+ kubectl tg backup list --cluster-name source-cluster --namespace tigergraph
+ ```
+
+2. **Use the Tag to Restore the Target Cluster:**
+
+ Create an AWS secret for authentication if you haven't done so already:
+
+ ```bash
+ kubectl create secret generic aws-secret \
+ --from-literal=accessKeyID=AWSACCESSKEY \
+ --from-literal=secretAccessKey='AWSSECRETKEY'
+ ```
+
+ Then, initiate the cross-cluster restore for the target cluster (target-cluster) using the obtained backup tag:
+
+ ```bash
+ kubectl tg restore --name cross-restore \
+ --namespace tigergraph --cluster-name target-cluster \
+ --tag tests3-2022-10-31T031005 \
+ --source s3Bucket --s3-bucket tg-backup \
+ --aws-secret aws-secret
+ ```
+
+Remember to adjust the cluster names, backup tag, S3 bucket name, and AWS credentials as needed for your specific setup. Cross-cluster restore is a powerful way to recover data and configurations across different clusters, ensuring data resilience and system stability.
+
+
+
+### Clone Cluster from Backup
+> [!NOTE]
+> This section pertains to users utilizing TigerGraph cluster version 3.9.2 or higher. If you are operating on an earlier version, please consult the [Clone a Cluster (Cluster version \< 3.9.2)](#clone-a-cluster-cluster-version--392)
+
+
+Creating a new cluster and restoring it from a backup created by another cluster, often referred to as "cloning", involves several steps. Follow these instructions to successfully clone a cluster using the `kubectl tg restore` command:
+
+1. **Retrieve the Cluster Configuration of the Source Cluster:**
+
+ Export the custom resource (CR) configuration of the source cluster (source-cluster) and save it to a YAML file, for example:
+
+ ```bash
+ kubectl tg export --cluster-name source-cluster -n tigergraph
+ ```
+ Assume the output file is /home/test-cluster_backup_1668069319.yaml.
+
+2. **Retrieve the Backup Tag:**
+
+ Obtain the backup tag associated with the desired backup from the source cluster:
+
+ ```bash
+ kubectl tg backup list --cluster-name source-cluster --namespace tigergraph
+ ```
+
+3. **Use the Configuration and Backup Tag to Create a Cluster Clone:**
+
+ Create an AWS secret for authentication if you haven't done so already:
+
+ ```bash
+ kubectl create secret generic aws-secret \
+ --from-literal=accessKeyID=AWSACCESSKEY \
+ --from-literal=secretAccessKey='AWSSECRETKEY'
+ ```
+
+ Initiate the cluster cloning process using the cluster configuration template and the backup tag:
+
+ ```bash
+ kubectl tg restore --name cross-restore \
+ --namespace tigergraph --cluster-name new-cluster \
+ --tag tests3-2022-10-31T031005 --cluster-template /home/test-cluster_backup_1668069319.yaml \
+ --source s3Bucket --s3-bucket tg-backup \
+ --aws-secret aws-secret
+ ```
+
+By following these steps, you can easily perform cross-cluster restore or clone a cluster using backup files created by another cluster. Be sure to replace placeholders such as `source-cluster`, `target-cluster`, `AWSACCESSKEY`, `AWSSECRETKEY`, and file paths with actual values specific to your environment.
+
+Once the process is complete, the new cluster (`new-cluster`) will be initialized and ready for use. The restore ensures that the new cluster matches the state of the source cluster captured by the backup. Cloning a cluster from a backup is a powerful way to quickly replicate environments and configurations for testing, development, or disaster recovery purposes.
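+
+To confirm that the clone has completed, you can check the status of the new cluster, for example:
+
+```bash
+kubectl tg status --cluster-name new-cluster -n tigergraph
+```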
+
+
+
+### Cross-Cluster Restore and Cluster Clone (Cluster Version < 3.9.2)
+
+Starting from TigerGraph cluster version 3.9.2, the process for cross-cluster restore and cluster cloning has been simplified. You only need the backup tag to specify the backup file that you want to restore. If you are using a cluster version earlier than 3.9.2, follow the instructions below:
+
+#### Restore an Existing Cluster from Backup Created by Another Cluster (Cluster version < 3.9.2)
+1. **Retrieve Backup Metadata for Source Cluster:**
+
+ Obtain the metadata of the backup from the source cluster (source-cluster) and save it to a file named `backup-metadata`. Run the following command:
+
+ ```bash
+ kubectl tg backup list --cluster-name source-cluster --namespace tigergraph \
+ --tag tests3-2022-10-31T031005 --meta > backup-metadata
+ ```
+
+2. **Create AWS Secret for Authentication:**
+
+ If you haven't done so already, create a Kubernetes secret containing your AWS credentials for authentication:
+
+ ```bash
+ kubectl create secret generic aws-secret \
+ --from-literal=accessKeyID=AWSACCESSKEY \
+ --from-literal=secretAccessKey='AWSSECRETKEY'
+ ```
+
+ Replace `AWSACCESSKEY` and `AWSSECRETKEY` with your actual AWS access key ID and secret access key.
+
+3. **Initiate Cross-Cluster Restore:**
+
+ Execute the following command to initiate the cross-cluster restore process for the target cluster (target-cluster) using the backup metadata obtained from the source cluster:
+
+ ```bash
+ kubectl tg restore --name cross-restore \
+ --namespace tigergraph --cluster-name target-cluster \
+ --metadata backup-metadata \
+ --source s3Bucket --s3-bucket tg-backup \
+ --aws-secret aws-secret
+ ```
+
+ This command will initiate the cross-cluster restore, ensuring that the target-cluster is brought back to the state captured by the backup from the source-cluster.
+
+Remember to adjust the cluster names, backup tag, S3 bucket name, and AWS credentials as needed for your specific setup. Cross-cluster restores are a powerful way to recover data and configurations across different clusters, ensuring data resilience and system stability.
+
+
+
+#### Clone a Cluster (Cluster version < 3.9.2)
+
+Creating a new cluster and restoring it from a backup created by another cluster, often referred to as "cloning," involves several steps. Follow these instructions to successfully clone a cluster using the `kubectl tg restore` command:
+
+1. **Export Configuration of Source Cluster:**
+
+ Obtain the custom resource (CR) configuration of the source cluster (source-cluster) and save it to a YAML file. Run the following command:
+
+ ```bash
+ kubectl tg export --cluster-name source-cluster -n tigergraph
+ ```
+ Assume the output file is /home/test-cluster_backup_1668069319.yaml.
+ This file will serve as the template for creating the new cluster.
+
+2. **Retrieve Backup Metadata for Source Cluster:**
+
+ Obtain the metadata of the backup from the source cluster (source-cluster) and save it to a file named `backup-metadata`. Run the following command:
+
+ ```bash
+ kubectl tg backup list --cluster-name source-cluster --namespace tigergraph \
+ --tag tests3-2022-10-31T031005 --meta > backup-metadata
+ ```
+
+3. **Create AWS Secret for Authentication:**
+
+ If you haven't done so already, create a Kubernetes secret containing your AWS credentials for authentication:
+
+ ```bash
+ kubectl create secret generic aws-secret \
+ --from-literal=accessKeyID=AWSACCESSKEY \
+ --from-literal=secretAccessKey='AWSSECRETKEY'
+ ```
+
+ Replace `AWSACCESSKEY` and `AWSSECRETKEY` with your actual AWS access key ID and secret access key.
+
+4. **Initiate Cluster Clone and Restore:**
+
+ Execute the following command to create a new cluster (new-cluster) based on the configuration template and restore it from the backup created by the source cluster:
+
+ ```bash
+ kubectl tg restore --name cross-restore \
+ --namespace tigergraph --cluster-name new-cluster \
+ --metadata backup-metadata --cluster-template /home/test-cluster_backup_1668069319.yaml \
+ --source s3Bucket --s3-bucket tg-backup \
+ --aws-secret aws-secret
+ ```
+
+ This command will create a new cluster named `new-cluster` based on the provided cluster template and restore its state from the specified backup.
+
+Once the process is complete, the new cluster (`new-cluster`) will be initialized and ready for use. The restore ensures that the new cluster matches the state of the source cluster captured by the backup.
+
+Remember to adjust the cluster names, backup tag, S3 bucket name, paths, and AWS credentials as needed for your specific setup. Cloning a cluster from a backup is a powerful way to quickly replicate environments and configurations for testing, development, or disaster recovery purposes.
+
+### Show Status of Restore
+
+To check the status of a restore process, you can use the following command:
+
+```bash
+kubectl tg restore status --name restore-from-local --namespace $NAMESPACE
+```
+
+This command will provide you with details about the ongoing or completed restore process. You can review the information in the output, including any events or messages related to the restore job. The status will indicate whether the restore was successful or if there were any issues.
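+
+If you need more detail than the status output provides, you can also inspect the pods that ran the restore job; as noted in the troubleshooting guide, they are named `${RESTORE_NAME}-restore-job-{SUFFIX}`:
+
+```bash
+kubectl get pods -n $NAMESPACE | grep restore-from-local-restore-job
+```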
+
+### Delete Restore Job
+
+If you want to delete a restore job, you can use the following command:
+
+```bash
+kubectl tg restore delete --name restore-from-local --namespace $NAMESPACE
+```
+
+This command will delete the specified restore job. Make sure to replace `restore-from-local` with the actual name of the restore job you want to delete, and provide the appropriate namespace using the `$NAMESPACE` variable.
+
+
diff --git a/k8s/docs/04-manage/backup-and-restore/troubleshoot.md b/k8s/docs/04-manage/backup-and-restore/troubleshoot.md
new file mode 100644
index 00000000..c12e35f2
--- /dev/null
+++ b/k8s/docs/04-manage/backup-and-restore/troubleshoot.md
@@ -0,0 +1,108 @@
+How to Debug Backup & Restore
+====
+
+General Guidelines
+----
+
+* It is important to avoid initiating multiple backup and restore jobs simultaneously for the same cluster.
+
+ Attempting to do so may result in the following outcomes:
+
+ * If a backup job is already in progress and you attempt to create another `TigerGraphBackup` to back up the identical cluster, the controller will await the completion of the ongoing job before generating a backup job for the new `TigerGraphBackup`.
+
+ * If a restore job is currently active and you create another `TigerGraphRestore` for the same cluster, the controller will wait for the ongoing job to finish before creating a restore job for the new `TigerGraphRestore`.
+
+ * In case a backup job is running and you create a new `TigerGraphRestore`, or if a restore job is ongoing and you create an additional `TigerGraphBackup`, the subsequently created job will encounter failure.
+
+* If the targeted cluster for backup or restore is not in a ready state (e.g., the cluster is in an uninitialized state, undergoing shrinkage, or undergoing an upgrade), the backup/restore controller will wait for the cluster to return to a normal state before creating the backup/restore job.
+
+* Up to three pods responsible for executing backup or restore operations will be maintained for each cluster. These pods can prove useful for debugging purposes.
+
+* If a backup process runs longer than the configured schedule interval, the next scheduled backup will be skipped. For instance, if your backup schedule is set to `0 * * * *` (a backup at the start of every hour) and a backup takes 1.5 hours, a backup job initiated at 00:00 will finish at 01:30, so the backup scheduled for 01:00 is skipped.
+
+Debug backup or restore job
+----
+When dealing with backup and restore jobs, it's important to be able to troubleshoot and diagnose any issues that may arise. Here's a guide on how to debug backup and restore operations:
+
+1. **List Pods**: To begin, you can list pods running backup in the specified namespace using the following command:
+
+ ```bash
+ kubectl get pods -n NAMESPACE -l tigergraph.com/backup-cluster=test-cluster
+ ```
+
+ This will give you an overview of pods running backup for test-cluster in the specified namespace. You can replace "test-cluster" with the name of your cluster.
+ ```bash
+ NAME READY STATUS RESTARTS AGE
+ test-cluster-backup-local-backup-job-7sbcs 0/1 Completed 0 2d
+ test-cluster-backup-local-backup-job-7xd58 0/1 Error 0 5d13h
+ ```
+
+2. **Identify Backup and Restore Pods**: Look for pods related to backup and restore operations. These pods are typically named `${BACKUP_NAME}-backup-job-{SUFFIX}` for backup jobs and `${RESTORE_NAME}-restore-job-{SUFFIX}` for restore jobs.
+
+3. **Check Pod Status**: Check the status of the pods. If a pod's status is "Error" or not in the "Running" state, it indicates an issue with the backup or restore process.
+
+4. **View Logs**: To view the logs of a specific pod, you can use the following command:
+
+ ```bash
+ kubectl logs $POD_NAME -n NAMESPACE
+ ```
+
+ Replace `$POD_NAME` with the name of the pod you want to inspect, and specify the appropriate namespace using the `-n NAMESPACE` flag. The logs may provide valuable information about any errors or issues that occurred during the backup or restore job.
+
+ ```bash
+ > kubectl logs test-cluster-backup-job-7xd58
+ Warning: Permanently added '[test-cluster-internal-service.default]:10022' (ED25519) to the list of known hosts.
+ Fri Dec 16 13:44:19 UTC 2022
+ Start configure backup
+ [ Info] Configuration has been changed. Please use 'gadmin config apply' to persist the changes.
+ [ Info] Configuration has been changed. Please use 'gadmin config apply' to persist the changes.
+ Use Local Storage
+ [ Info] Configuration has been changed. Please use 'gadmin config apply' to persist the changes.
+ [ Info] Configuration has been changed. Please use 'gadmin config apply' to persist the changes.
+ [ Info] Configuration has been changed. Please use 'gadmin config apply' to persist the changes.
+ Apply config
+ [Warning] No difference from staging config, config apply is skipped.
+ [ Info] Successfully applied configuration change. Please restart services to make it effective immediately.
+ Create backup
+ [ Error] NotReady (check backup dependency service online get error: NotReady (GPE is not available; NotReady (GSE is not available)))
+ ```
+5. **Troubleshoot Errors**: Examine the logs for any error messages or warnings. These messages can help you identify the root cause of the problem. Common issues include connectivity problems, resource limitations, or configuration errors. For instance, the logs above show that the backup failed because the GPE service was not ready.
+
+6. **Verify Configuration**: Double-check the configuration options provided for the backup or restore job. Ensure that paths, destinations, tags, and other settings are correctly specified.
+
+7. **Permissions and Secrets**: Ensure that any necessary permissions, access keys, or secrets (such as AWS credentials) are correctly configured and accessible to the pods.
+
+8. **Retry or Rerun**: If the issue is transient, you might consider retrying the backup or restore operation. You can also delete failed pods and trigger the job again (see the example after this list).
+
+9. **Documentation**: Refer to the official documentation for TigerGraph's backup and restore features for more detailed troubleshooting steps and specific error messages.
+
+By following these steps, you can effectively troubleshoot and resolve issues with backup and restore operations.
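+
+As an example of step 8, you might delete a failed backup pod (using one of the names from the listing above) before triggering the backup again:
+
+```bash
+# Remove the failed backup pod; replace the pod name and namespace with yours
+kubectl delete pod test-cluster-backup-local-backup-job-7xd58 -n NAMESPACE
+```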
+
+
+
+Debug backup schedule job
+----
+
+When debugging backup schedules in TigerGraph, you may encounter issues with the scheduled backup jobs. Here's a step-by-step guide on how to troubleshoot and debug backup schedule problems:
+
+1. **List Pods**: Start by listing pods running backup in the specified namespace to identify the pods related to backup schedule operations:
+
+ ```bash
+ kubectl get pods -n NAMESPACE -l tigergraph.com/backup-cluster=test-cluster
+ ```
+
+2. **Identify Schedule Pods**: Look for pods with names resembling `${BACKUP_SCHEDULE_NAME}-backup-cronjob-{SUFFIX}`. These pods are associated with the scheduled backup jobs created by the `TigerGraphBackupSchedule`.
+
+3. **Check Pod Status**: Examine the status of the pods. If a pod's status is not "Completed" or if it is in a non-running state, it indicates an issue with the backup schedule.
+
+4. **View All Logs**: To view the logs of a specific pod, including all containers within the pod, use the following command:
+
+ ```bash
+ kubectl logs $POD_NAME -n $NAMESPACE --all-containers=true
+ ```
+
+ Replace `$POD_NAME` with the name of the pod you want to inspect, and specify the appropriate namespace using the `-n NAMESPACE` flag. The `--all-containers=true` option ensures that logs from all containers within the pod are displayed.
+
+5. **Analyze Logs**: Carefully analyze the logs to identify any error messages, warnings, or anomalies. Look for clues that may point to the cause of the issue, such as connectivity problems, configuration errors, or resource limitations.
+
+By following these steps, you can effectively troubleshoot and resolve issues with backup schedule operations. If you encounter specific error messages or need further assistance, you can refer to the documentation or seek help.
\ No newline at end of file
diff --git a/k8s/docs/04-manage/operator-upgrade.md b/k8s/docs/04-manage/operator-upgrade.md
new file mode 100644
index 00000000..00275df3
--- /dev/null
+++ b/k8s/docs/04-manage/operator-upgrade.md
@@ -0,0 +1,123 @@
+# How to upgrade TigerGraph Kubernetes Operator
+
+This document provides step-by-step instructions for upgrading the TigerGraph Kubernetes Operator using the kubectl-tg plugin.
+
+## Install Operator 0.0.7 and TigerGraph 3.9.2
+
+If you have previously installed an older version of the Operator and a TigerGraph cluster, you can skip this section; it is only needed to set up an environment for verifying the Operator upgrade.
+
+### Install Operator 0.0.7
+
+```bash
+curl https://dl.tigergraph.com/k8s/0.0.7/kubectl-tg -o kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+
+kubectl tg init --operator-size 1 --operator-cpu 1000m --operator-memory 1024Mi -n tigergraph
+```
+
+### Install TigerGraph 3.9.2
+
+- Create a private SSH key secret for the TigerGraph pods
+
+```bash
+# create a new private key pair
+echo -e 'y\n' | ssh-keygen -b 4096 -t rsa -f $HOME/.ssh/tigergraph_rsa -q -N ''
+
+# Create a Kubernetes Secret with the SSH key files above
+kubectl create secret generic ssh-key-secret --from-file=private-ssh-key=$HOME/.ssh/tigergraph_rsa --from-file=public-ssh-key=$HOME/.ssh/tigergraph_rsa.pub --namespace YOUR_NAME_SPACE
+```
+
+- Create a TG cluster with the above secret name
+
+```bash
+kubectl tg create --cluster-name test-cluster --private-key-secret ssh-key-secret --license xxxxxxxxxxxxxxxxxxxxxxxxx \
+--size 3 --ha 2 --version 3.9.2 \
+--storage-class standard --storage-size 100G --cpu 4000m --memory 8Gi -n tigergraph
+```
+
+You can also load graph data and verify it after upgrading the Operator.
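+
+For example, one hedged way to verify that the cluster is serving requests (assuming the REST service follows the `<cluster-name>-rest-external-service` naming convention used elsewhere in these docs):
+
+```bash
+# Forward the REST++ port locally and hit the echo endpoint
+kubectl port-forward svc/test-cluster-rest-external-service 9000:9000 -n tigergraph &
+curl http://localhost:9000/echo
+```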
+
+## Upgrade Operator and CRD
+
+Starting from Operator version 0.0.4, upgrading the Operator is supported. The examples below upgrade to Operator version 0.0.9; adjust the target version according to your requirements.
+
+### Install the latest kubectl-tg plugin
+
+```bash
+curl https://dl.tigergraph.com/k8s/latest/kubectl-tg -o kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+```
+
+### Upgrade CRD to the latest version
+
+Use the following command to upgrade the Custom Resource Definition (CRD) to the latest version:
+
+```bash
+kubectl apply -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml
+```
+
+### Upgrade Operator to the latest version
+
+The following command will upgrade the operator version to 0.0.9:
+
+```bash
+kubectl tg upgrade --namespace ${YOUR_NAMESPACE} --operator-version 0.0.9
+```
+
+If you only need to update the Operator's configuration without changing its version, use the following command:
+
+```bash
+kubectl tg upgrade --version ${OPERATOR_VERSION} --operator-size 3 --operator-watch-namespace ${YOUR_NAMESPACE} --operator-cpu 1000m --operator-memory 1024Mi --namespace ${YOUR_NAMESPACE}
+```
+
+## How to upgrade for mandatory change
+
+Some Operator releases introduce mandatory changes. For example, Operator 0.0.4 added a mandatory option for the secret name of the private SSH key.
+After upgrading the Operator from 0.0.3 to 0.0.4, update operations will therefore fail with an error like the following:
+
+```bash
+kubectl tg update --cluster-name test-cluster --memory 5Gi -n tigergraph
+The CR of cluster test-cluster is exported to /home/graphsql/e2e/test-cluster_backup_1679469286.yaml before update cluster, you can use this file for recovery
+Warning: resource tigergraphs/test-cluster is missing the kubectl.kubernetes.io/last-applied-configuration annotation which is required by kubectl apply. kubectl apply should only be used on resources created declaratively by either kubectl create --save-config or kubectl apply. The missing annotation will be patched automatically.
+Error from server (the secret name of private ssh key is required for TG version 3.8.0): error when applying patch:
+.......
+to:
+Resource: "graphdb.tigergraph.com/v1alpha1, Resource=tigergraphs", GroupVersionKind: "graphdb.tigergraph.com/v1alpha1, Kind=TigerGraph"
+Name: "test-cluster", Namespace: "tigergraph"
+for: "STDIN": admission webhook "vtigergraph.kb.io" denied the request: the secret name of private ssh key is required for TG version 3.8.0
+```
+
+Follow these steps after upgrading from Operator 0.0.3 to 0.0.4 for mandatory changes:
+
+- Delete cluster
+
+> [!WARNING]
+> Do not delete the PVCs of the cluster manually.
+
+```bash
+kubectl tg delete --cluster-name test-cluster -n tigergraph
+```
+
+- Create a private ssh key secret
+
+```bash
+echo -e 'y\n' | ssh-keygen -b 4096 -t rsa -f $HOME/.ssh/tigergraph_rsa -q -N ''
+
+kubectl create secret generic ssh-key-secret --from-file=private-ssh-key=$HOME/.ssh/tigergraph_rsa --from-file=public-ssh-key=$HOME/.ssh/tigergraph_rsa.pub --namespace tigergraph
+```
+
+- Recreate the cluster with the created private ssh key
+
+> [!WARNING]
+> The cluster size, HA factor, and cluster name must be the same as before.
+
+```bash
+kubectl tg create --cluster-name test-cluster --license xxxxxxxxxxxxxxxxxxxxxxxxx \
+--size 3 --ha 2 --version 3.8.0 \
+--storage-class standard --storage-size 10G --cpu 2000m --memory 6Gi -n tigergraph --private-key-secret ssh-key-secret
+```
+
+## How to upgrade for optional change
+
+If you don't require the new optional configuration of the CRD, no extra steps are needed. However,
+if you wish to use the new optional configuration, you can simply update the cluster as needed.
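+
+For example, if the optional configuration you want is the node scheduling support added in Operator 0.0.7, a hedged sketch would be (the `affinity-config.yaml` file is an illustrative placeholder):
+
+```bash
+kubectl tg update --cluster-name test-cluster -n tigergraph --affinity affinity-config.yaml
+```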
diff --git a/k8s/docs/05-troubleshoot/README.md b/k8s/docs/05-troubleshoot/README.md
new file mode 100644
index 00000000..fb8b72f6
--- /dev/null
+++ b/k8s/docs/05-troubleshoot/README.md
@@ -0,0 +1,10 @@
+# Troubleshoot TigerGraph in Kubernetes
+
+This document provides solutions to common issues that may arise during the deployment and management of a TigerGraph cluster in Kubernetes. For more detailed information on troubleshooting specific topics, please refer to the following documentation:
+
+- [TigerGraph Operator Troubleshoot](./operator-installation.md)
+- [TigerGraph Cluster Deployment Troubleshoot](./cluster-deployment.md)
+- [TigerGraph Cluster Management Troubleshoot](./cluster-management.md)
+- [TigerGraph Cluster Rolling Update Troubleshoot](./rolling-update.md)
+
+These resources will help you address and resolve any challenges you encounter while working with TigerGraph in a Kubernetes environment.
diff --git a/k8s/docs/05-troubleshoot/cluster-deployment.md b/k8s/docs/05-troubleshoot/cluster-deployment.md
new file mode 100644
index 00000000..9613564e
--- /dev/null
+++ b/k8s/docs/05-troubleshoot/cluster-deployment.md
@@ -0,0 +1,565 @@
+# TigerGraph Cluster Deployment Troubleshooting
+
+This document provides guidance on troubleshooting common issues encountered during the deployment of a TigerGraph cluster in Kubernetes.
+
+## Troubleshoot Steps
+
+In the following steps, it is assumed that the operator has already been successfully installed within the `tigergraph` namespace, and that the cluster has been named `test-cluster`. However, please ensure to make appropriate adjustments based on your specific circumstances and environment.
+
+### Check the pods of the TG Cluster
+
+Ensure that all the pods of the cluster are running and that the READY figure is 1/1. Starting from Operator version 0.0.7, sidecar containers are supported; if you have X sidecar containers, the READY figure should be (1+X)/(1+X).
+
+```bash
+kubectl get pod -l tigergraph.com/cluster-name=test-cluster,tigergraph.com/cluster-pod=test-cluster -n tigergraph
+
+NAME READY STATUS RESTARTS AGE
+test-cluster-0 1/1 Running 0 11h
+test-cluster-1 1/1 Running 0 11h
+test-cluster-2 1/1 Running 0 11h
+```
+
+If the status of a Pod is not `Running`, it might be in `Pending` or `PullImageError` state. In such cases, you can check detailed information about the specific pod using:
+
+```bash
+kubectl describe pod test-cluster-0 -n tigergraph
+```
+
+- Insufficient CPU or Memory
+
+ If a pod is in a `Pending` state, it might be due to insufficient CPU or memory resources. You can identify this issue by checking the pod's status:
+
+ ```bash
+ kubectl get pod -l app=test-cluster,tigergraph.com/cluster-pod=test-cluster -n tigergraph
+
+ NAME READY STATUS RESTARTS AGE
+ test-cluster-0 0/1 Pending 0 58s
+ test-cluster-1 0/1 Pending 0 58s
+ test-cluster-2 0/1 Pending 0 58s
+ ```
+
+ Inspect the details of the pending pod to find the root cause at the bottom of the output:
+
+ ```bash
+ kubectl describe pod test-cluster-0 -n tigergraph
+
+ Name: test-cluster-0
+ Namespace: tigergraph
+ Priority: 0
+ Node:
+ Labels: app=test-cluster
+ controller-revision-hash=test-cluster-6c8cc9c557
+ statefulset.kubernetes.io/pod-name=test-cluster-0
+ tigergraph.com/cluster-pod=test-cluster
+ Annotations: openshift.io/scc: privileged
+ Status: Pending
+ IP:
+ IPs:
+ Controlled By: StatefulSet/test-cluster
+ Containers:
+ tg:
+ Image: docker.io/tigergrah/tigergraph-k8s:3.8.0
+ Ports: 9000/TCP, 14240/TCP, 22/TCP
+ Host Ports: 0/TCP, 0/TCP, 0/TCP
+ Requests:
+ cpu: 16
+ memory: 32Gi
+ ......
+ Medium:
+ SizeLimit:
+ kube-api-access-mnsw5:
+ Type: Projected (a volume that contains injected data from multiple sources)
+ TokenExpirationSeconds: 3607
+ ConfigMapName: kube-root-ca.crt
+ ConfigMapOptional:
+ DownwardAPI: true
+ ConfigMapName: openshift-service-ca.crt
+ ConfigMapOptional:
+ QoS Class: Burstable
+ Node-Selectors:
+ Tolerations: node.kubernetes.io/memory-pressure:NoSchedule op=Exists
+ node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
+ node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
+ Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Warning FailedScheduling 65s (x3 over 2m23s) default-scheduler 0/4 nodes are available: 1 node(s) had taint {node-role.kubernetes.io/master: }, that the pod didn't tolerate, 3 Insufficient cpu, 3 Insufficient memory.
+ ```
+
+ You may encounter messages like `Insufficient CPU` or `Insufficient memory`. In this case, you should adjust the resource allocation for the cluster using the following command:
+
+ ```bash
+ kubectl tg update --cluster-name test-cluster --cpu 4 --memory 8Gi -n tigergraph
+ ```
+
+- Nodes don't match nodeSelector/affinity
+
+ Starting from Operator version 0.0.7, nodeSelector, affinity, and tolerations are supported. If you provide rules in the Custom Resource (CR), and your pod is in a `Pending` state, you can check for the following events:
+
+ ```bash
+ kubectl describe pod test-cluster-0 -n tigergraph
+ ```
+
+ Look for events like:
+
+ ```bash
+ Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Warning FailedScheduling 101s (x2 over 2m17s) default-scheduler 0/6 nodes are available: 1 Insufficient cpu, 5 node(s) didn't match Pod's node affinity/selector. preemption: 0/6 nodes are available: 1 No preemption victims found for incoming pod, 5 Preemption is not helpful for scheduling.
+ ```
+
+ This indicates that the nodes cannot meet your affinity or nodeSelector rules. You can update your rules using the following command:
+
+ ```bash
+ kubectl tg update --cluster-name test-cluster -n tigergraph --affinity affinity-config.yaml
+ ```
+
+- Incorrect docker image
+
+ If the TigerGraph Docker image version is incorrect, the pod status may be `ErrImagePull` or `ImagePullBackOff`. You can identify this issue by checking the pod status:
+
+ ```bash
+ kubectl get pod -l tigergraph.com/cluster-pod=test-cluster -n tigergraph
+
+ NAME READY STATUS RESTARTS AGE
+ test-cluster-0 0/1 ErrImagePull 0 63s
+ test-cluster-1 0/1 ImagePullBackOff 0 63s
+ test-cluster-2 0/1 ImagePullBackOff 0 63s
+ ```
+
+ Check the detailed error by examining the pod's events:
+
+ ```bash
+ kubectl describe pod test-cluster-0 -n tigergraph
+
+ Name: test-cluster-0
+ Namespace: tigergraph
+ Priority: 0
+ Node: tg-k8s-openshift-777-rdj74-worker-d-pvrm2/10.0.128.2
+ Start Time: Mon, 27 Feb 2023 03:15:39 +0000
+ Labels: app=test-cluster
+ controller-revision-hash=test-cluster-598bdbb6cb
+ statefulset.kubernetes.io/pod-name=test-cluster-0
+ tigergraph.com/cluster-pod=test-cluster
+ .......
+ Controlled By: StatefulSet/test-cluster
+ Containers:
+ tg:
+ Container ID:
+ Image: docker.io/tigergrah/tigergraph-k8s:3.8.5
+ Image ID:
+ Ports: 9000/TCP, 14240/TCP, 22/TCP
+ Host Ports: 0/TCP, 0/TCP, 0/TCP
+ State: Waiting
+ Reason: ImagePullBackOff
+ Ready: False
+ Restart Count: 0
+ Requests:
+ cpu: 2
+ memory: 8Gi
+ ......
+ Environment:
+ SERVICE_NAME: Optional: false
+ POD_PREFIX: Optional: false
+ NAMESPACE: Optional: false
+ CLUSTER_SIZE: Optional: false
+ Mounts:
+ /home/tigergraph/tigergraph/data from tg-data (rw)
+ /tmp/init_tg_cfg from config-volume (rw,path="init_tg_cfg")
+ /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-vskns (ro)
+ Conditions:
+ Type Status
+ Initialized True
+ Ready False
+ ContainersReady False
+ PodScheduled True
+ Volumes:
+ tg-data:
+ Type: PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)
+ ClaimName: tg-data-test-cluster-0
+ ReadOnly: false
+ config-volume:
+ Type: ConfigMap (a volume populated by a ConfigMap)
+ Name: test-cluster-init-config
+ Optional: false
+ probe-data:
+ Type: EmptyDir (a temporary directory that shares a pod's lifetime)
+ Medium:
+ SizeLimit:
+ kube-api-access-vskns:
+ Type: Projected (a volume that contains injected data from multiple sources)
+ TokenExpirationSeconds: 3607
+ ConfigMapName: kube-root-ca.crt
+ ConfigMapOptional:
+ DownwardAPI: true
+ ConfigMapName: openshift-service-ca.crt
+ ConfigMapOptional:
+ QoS Class: Burstable
+ Node-Selectors:
+ Tolerations: node.kubernetes.io/memory-pressure:NoSchedule op=Exists
+ node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
+ node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
+ Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Normal Scheduled 2m38s default-scheduler Successfully assigned tigergraph/test-cluster-0 to tg-k8s-openshift-777-rdj74-worker-d-pvrm2
+ Normal SuccessfulAttachVolume 2m34s attachdetach-controller AttachVolume.Attach succeeded for volume "pvc-96c90faf-3019-416a-ace9-200502f67b65"
+ Normal AddedInterface 2m30s multus Add eth0 [10.130.0.33/23] from openshift-sdn
+ Normal Pulling 71s (x4 over 2m29s) kubelet Pulling image "docker.io/tigergrah/tigergraph-k8s:3.8.5"
+ Warning Failed 71s (x4 over 2m29s) kubelet Failed to pull image "docker.io/tigergrah/tigergraph-k8s:3.8.5": rpc error: code = Unknown desc = reading manifest 3.8.5 in docker.io/tigergrah/tigergraph-k8s: manifest unknown: manifest unknown
+ Warning Failed 71s (x4 over 2m29s) kubelet Error: ErrImagePull
+ Warning Failed 59s (x6 over 2m29s) kubelet Error: ImagePullBackOff
+ Normal BackOff 44s (x7 over 2m29s) kubelet Back-off pulling image "docker.io/tigergrah/tigergraph-k8s:3.8.5"
+ ```
+
+  Look for messages indicating issues with the image, such as `Error: ErrImagePull`. You should correct the image version using the following command:
+
+ ```bash
+ kubectl tg update --cluster-name test-cluster --version 3.9.0 -n tigergraph
+ ```
+
+- Incorrect PVC with non-existent StorageClass
+
+ If you specified a non-existent or unusable StorageClass when creating a cluster, the cluster's pods may be stuck in a `Pending` state. To diagnose the issue, first check the pod statuses:
+
+ ```bash
+ kubectl get pod -l app=test-cluster,tigergraph.com/cluster-pod=test-cluster -n tigergraph
+
+ NAME READY STATUS RESTARTS AGE
+ test-cluster-0 0/1 Pending 0 2m16s
+ test-cluster-1 0/1 Pending 0 2m16s
+ test-cluster-2 0/1 Pending 0 2m16s
+ ```
+
+ If the pods are in a `Pending` state, inspect the details of one of the pods to find the root cause:
+
+ ```bash
+ kubectl describe pod test-cluster-0 -n tigergraph
+
+ Name: test-cluster-0
+ Namespace: tigergraph
+ Priority: 0
+ Node:
+ Labels: app=test-cluster
+ controller-revision-hash=test-cluster-598bdbb6cb
+ statefulset.kubernetes.io/pod-name=test-cluster-0
+ tigergraph.com/cluster-pod=test-cluster
+ Annotations: openshift.io/scc: privileged
+ Status: Pending
+ IP:
+ IPs:
+ Controlled By: StatefulSet/test-cluster
+ Containers:
+ tg:
+ Image: docker.io/tigergrah/tigergraph-k8s:3.8.5
+ Ports: 9000/TCP, 14240/TCP, 22/TCP
+ Host Ports: 0/TCP, 0/TCP, 0/TCP
+ Requests:
+ cpu: 2
+ memory: 8Gi
+ ......
+ Environment:
+ SERVICE_NAME: Optional: false
+ POD_PREFIX: Optional: false
+ NAMESPACE: Optional: false
+ CLUSTER_SIZE: Optional: false
+ Mounts:
+ /home/tigergraph/tigergraph/data from tg-data (rw)
+ /tmp/init_tg_cfg from config-volume (rw,path="init_tg_cfg")
+ /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-8zb5z (ro)
+ Conditions:
+ Type Status
+ PodScheduled False
+ Volumes:
+ tg-data:
+ Type: PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)
+ ClaimName: tg-data-test-cluster-0
+ ReadOnly: false
+ config-volume:
+ Type: ConfigMap (a volume populated by a ConfigMap)
+ Name: test-cluster-init-config
+ Optional: false
+ probe-data:
+ Type: EmptyDir (a temporary directory that shares a pod's lifetime)
+ Medium:
+ SizeLimit:
+ kube-api-access-8zb5z:
+ Type: Projected (a volume that contains injected data from multiple sources)
+ TokenExpirationSeconds: 3607
+ ConfigMapName: kube-root-ca.crt
+ ConfigMapOptional:
+ DownwardAPI: true
+ ConfigMapName: openshift-service-ca.crt
+ ConfigMapOptional:
+ QoS Class: Burstable
+ Node-Selectors:
+ Tolerations: node.kubernetes.io/memory-pressure:NoSchedule op=Exists
+ node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
+ node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
+ Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Warning FailedScheduling 58s (x4 over 3m8s) default-scheduler 0/4 nodes are available: 4 pod has unbound immediate PersistentVolumeClaims.
+ ```
+
+  From the pod's events, you can see that the root cause is `0/4 nodes are available: 4 pod has unbound immediate PersistentVolumeClaims`. This error indicates that the StorageClass does not exist or that the PV capacity is insufficient.
+
+ Check the storage configuration of the cluster.
+
+ ```bash
+ kubectl get tg test-cluster -n tigergraph -o json|jq .spec.storage
+
+ {
+ "type": "persistent-claim",
+ "volumeClaimTemplate": {
+ "accessModes": [
+ "ReadWriteOnce"
+ ],
+ "resources": {
+ "requests": {
+ "storage": "10G"
+ }
+ },
+ "storageClassName": "test-storage-class",
+ "volumeMode": "Filesystem"
+ }
+ }
+ ```
+
+ Check the PVC status of the cluster
+
+ ```bash
+ kubectl get pvc -l tigergraph.com/cluster-name=test-cluster -n tigergraph
+ ```
+
+  Ensure the StorageClass exists and that the PV capacity is sufficient:
+
+ ```bash
+  kubectl get storageclass
+
+ NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
+ standard (default) kubernetes.io/gce-pd Delete WaitForFirstConsumer true 34m
+ standard-csi pd.csi.storage.gke.io Delete WaitForFirstConsumer true
+ ```
+
+  In this test case, the StorageClass `test-storage-class` does not exist; recreate the cluster with one of the StorageClass names from the list above (see the sketch at the end of this item).
+
+  If no pods are found in the step above, you can check the status of the StatefulSet:
+
+ ```bash
+ kubectl get statefulset test-cluster -n tigergraph
+
+ NAME READY AGE
+ test-cluster 3/3 11h
+ ```
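+
+  A hedged sketch of the fix, recreating the cluster with a StorageClass that actually exists (reuse the size, HA, version, and resource values from your original deployment; the license placeholder is illustrative):
+
+  ```bash
+  # Delete the broken cluster, then recreate it pointing --storage-class at an existing StorageClass
+  kubectl tg delete --cluster-name test-cluster -n tigergraph
+
+  kubectl tg create --cluster-name test-cluster --private-key-secret ssh-key-secret --license xxxxxxxxxxxxxxxxxxxxxxxxx \
+    --size 3 --ha 2 --version 3.9.0 \
+    --storage-class standard --storage-size 100G --cpu 4000m --memory 8Gi -n tigergraph
+  ```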
+
+- EBS CSI driver not installed (EKS only)
+
+  Some EKS versions do not install the aws-ebs-csi-driver add-on by default. If you encounter the following issue when creating a TigerGraph cluster with dynamic persistent volumes, check this first.
+
+  After deploying the TigerGraph cluster, all of the TigerGraph pods are in Pending status, and all of the PVCs attached to the TigerGraph StatefulSet are also in Pending status.
+
+ ```bash
+ # please replace the cluster name and namespace with yours.
+ kubectl get pods -l tigergraph.com/cluster-name=test-cluster --namespace tigergraph
+
+ NAME READY STATUS RESTARTS AGE
+ test-cluster-0 0/1 Pending 0 32s
+ test-cluster-1 0/1 Pending 0 32s
+ test-cluster-2 0/1 Pending 0 32s
+
+ kubectl get pvc -l tigergraph.com/cluster-name=test-cluster --namespace tigergraph
+
+ NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS AGE
+ tg-data-test-cluster-0 Pending gp2 37s
+ tg-data-test-cluster-1 Pending gp2 37s
+ tg-data-test-cluster-2 Pending gp2 37s
+
+ kubectl describe pvc -l tigergraph.com/cluster-name=test-cluster --namespace tigergraph
+
+ Name: tg-data-test-cluster-0
+ Namespace: tigergraph
+ StorageClass: gp2
+ Status: Pending
+ Volume:
+ Labels: tigergraph.com/cluster-name=test-cluster
+ tigergraph.com/cluster-pod=test-cluster
+ Annotations: volume.beta.kubernetes.io/storage-provisioner: ebs.csi.aws.com
+ volume.kubernetes.io/selected-node: ip-172-31-20-181.us-west-1.compute.internal
+ volume.kubernetes.io/storage-provisioner: ebs.csi.aws.com
+ Finalizers: [kubernetes.io/pvc-protection]
+ Capacity:
+ Access Modes:
+ VolumeMode: Filesystem
+ Used By: test-cluster-0
+ Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Normal WaitForFirstConsumer 8m9s persistentvolume-controller waiting for first consumer to be created before binding
+ Normal ExternalProvisioning 2m35s (x25 over 8m9s) persistentvolume-controller waiting for a volume to be created, either by external provisioner "ebs.csi.aws.com" or manually created by system administrator
+ ```
+
+  Check whether the aws-ebs-csi-driver add-on is installed, and install it if needed, with the following commands:
+
+ > [!WARNING]
+ > Please ensure that the IAM role for the Amazon EBS CSI driver has been created. You can refer to the official AWS documentation [Creating the Amazon EBS CSI driver IAM role](https://docs.aws.amazon.com/eks/latest/userguide/csi-iam-role.html) for detailed instructions.
+
+ ```bash
+ kubectl get deployment ebs-csi-controller -n kube-system
+
+ aws eks create-addon --cluster-name ${YOUR_K8S_CLUSTER_NAME} --addon-name aws-ebs-csi-driver
+ ```
+
+### Check the initialization job of the TG Cluster
+
+If you've successfully created the StatefulSet and cluster pods for your TigerGraph cluster but the cluster's status is abnormal, for example the liveness and readiness probes stay unready for an extended period, you can follow these steps to troubleshoot the issue.
+
+- Ensure the initialization job has been created:
+
+ ```bash
+ kubectl get job -l tigergraph.com/cluster-job=test-cluster-init-job -n tigergraph
+ ```
+
+- If the initialization job exists and its COMPLETIONS is not 1/1, check the status of the job's pods:
+
+ ```bash
+ kubectl get pod -l job-name=test-cluster-init-job -n tigergraph
+
+ NAME READY STATUS RESTARTS AGE
+ test-cluster-init-job-p9lqr 0/1 Completed 0 12h
+ ```
+
+ If the pod status is incomplete, investigate further by checking the logs of the error pod (if it exists) to identify the root cause of the initialization job failure:
+
+ ```bash
+  # Equivalent to: kubectl logs job/test-cluster-init-job -n tigergraph
+ kubectl logs -l job-name=test-cluster-init-job -n tigergraph
+
+ Defaulted container "cluster-installer" out of: cluster-installer, init-tigergraph (init)
+ [ Info] Generating config files to all machines
+ [ Info] Successfully applied configuration change. Please restart services to make it effective immediately.
+ [ Info] Initializing KAFKA
+ [ Info] Starting EXE
+ [ Info] Starting CTRL
+ [ Info] Starting ZK ETCD DICT KAFKA ADMIN GSE NGINX GPE RESTPP KAFKASTRM-LL KAFKACONN TS3SERV GSQL TS3 IFM GUI
+ [ Info] Applying config
+ [Warning] No difference from staging config, config apply is skipped.
+ [ Info] Successfully applied configuration change. Please restart services to make it effective immediately.
+ [ Info] Cluster is initialized successfully
+ ```
+
+ Examine the logs for any error messages that might provide insights into the failure. It's essential to address these issues to ensure a successful initialization.
+
+- Check the cluster status by logging into a pod
+
+ If all the previous steps have been completed successfully, you can log into one of the cluster pods to check the detailed errors of the cluster using the `gadmin status -v` command. This can help identify any ongoing issues with the cluster:
+
+ ```bash
+ kubectl tg connect --cluster-name test-cluster -n tigergraph
+
+ tigergraph@test-cluster-0:~$ gadmin status
+ +--------------------+-------------------------+-------------------------+
+ | Service Name | Service Status | Process State |
+ +--------------------+-------------------------+-------------------------+
+ | ADMIN | Online | Running |
+ | CTRL | Online | Running |
+ | DICT | Online | Running |
+ | ETCD | Online | Running |
+ | EXE | Online | Running |
+ | GPE | Warmup | Running |
+ | GSE | Warmup | Running |
+ | GSQL | Online | Running |
+ | GUI | Online | Running |
+ | IFM | Online | Running |
+ | KAFKA | Online | Running |
+ | KAFKACONN | Online | Running |
+ | KAFKASTRM-LL | Online | Running |
+ | NGINX | Online | Running |
+ | RESTPP | Online | Running |
+ | TS3 | Online | Running |
+ | TS3SERV | Online | Running |
+ | ZK | Online | Running |
+ +--------------------+-------------------------+-------------------------+
+ ```
+
+ The gadmin status command provides detailed information about the status of various TigerGraph services and processes. Review the output to check for any services or processes that are not running correctly.
+
+ If the liveness check of the pod continues to fail, you can use a single command to get the cluster status:
+
+ ```bash
+ kubectl exec -it test-cluster-0 -n tigergraph -- /home/tigergraph/tigergraph/app/cmd/gadmin status -v
+
+ +--------------------+-------------------------+-------------------------+-------------------------+
+ | Service Name | Service Status | Process State | Process ID |
+ +--------------------+-------------------------+-------------------------+-------------------------+
+ | ADMIN#1 | Online | Running | 44484 |
+ | ADMIN#2 | Online | Running | 9536 |
+ | ADMIN#3 | Online | Running | 3099 |
+ | CTRL#1 | Online | Running | 79 |
+ | CTRL#2 | Online | Running | 637 |
+ | CTRL#3 | Online | Running | 74 |
+ | DICT#1 | Online | Running | 43741 |
+ | DICT#2 | Online | Running | 8504 |
+ | DICT#3 | Online | Running | 2347 |
+ | ETCD#1 | Online | Running | 43731 |
+ | ETCD#2 | Online | Running | 8494 |
+ | ETCD#3 | Online | Running | 2337 |
+ | EXE_1 | Online | Running | 59 |
+ | EXE_2 | Online | Running | 512 |
+ | EXE_3 | Online | Running | 56 |
+ | GPE_1#1 | Warmup | Running | 44534 |
+ | GPE_1#2 | Warmup | Running | 9586 |
+ | GSE_1#1 | Warmup | Running | 44495 |
+ | GSE_1#2 | Warmup | Running | 9547 |
+ | GSQL#1 | Online | Running | 44802 |
+ | GSQL#2 | Online | Running | 9756 |
+ | GSQL#3 | Online | Running | 3385 |
+ | GUI#1 | Online | Running | 45096 |
+ | GUI#2 | Online | Running | 9919 |
+ | GUI#3 | Online | Running | 3698 |
+ | IFM#1 | Online | Running | 44997 |
+ | IFM#2 | Online | Running | 9874 |
+ | IFM#3 | Online | Running | 3573 |
+ | KAFKA#1 | Online | Running | 240 |
+ | KAFKA#2 | Online | Running | 1097 |
+ | KAFKA#3 | Online | Running | 238 |
+ | KAFKACONN#1 | Online | Running | 44615 |
+ | KAFKACONN#2 | Online | Running | 9663 |
+ | KAFKACONN#3 | Online | Running | 3196 |
+ | KAFKASTRM-LL_1 | Online | Running | 44562 |
+ | KAFKASTRM-LL_2 | Online | Running | 9611 |
+ | KAFKASTRM-LL_3 | Online | Running | 3142 |
+ | NGINX#1 | Online | Running | 44499 |
+ | NGINX#2 | Online | Running | 9553 |
+ | NGINX#3 | Online | Running | 3110 |
+ | RESTPP#1 | Online | Running | 44540 |
+ | RESTPP#2 | Online | Running | 9596 |
+ | RESTPP#3 | Online | Running | 3127 |
+ | TS3SERV#1 | Online | Running | 44721 |
+ | TS3_1 | Online | Running | 44875 |
+ | TS3_2 | Online | Running | 9793 |
+ | TS3_3 | Online | Running | 3466 |
+ | ZK#1 | Online | Running | 108 |
+ | ZK#2 | Online | Running | 729 |
+ | ZK#3 | Online | Running | 103 |
+ +--------------------+-------------------------+-------------------------+-------------------------+
+ ```
+
+### Port conflicts with the NodePort listener type
+
+If you create or update a cluster with `LISTENER_TYPE=NodePort` and specify `rest-node-port` or `gui-node-port` values that conflict with ports already in use, you will receive an error message. To resolve this issue, specify available ports for these services:
+
+```bash
+# Create a cluster with --listener-type NodePort while another TG cluster is already using the default ports 30090 and 30240
+kubectl tg create --cluster-name tg-cluster-2 --listener-type NodePort --rest-node-port 30090 --gui-node-port 30240
+
+# Check the CR, it indicates the provided port is already allocated.
+kubectl describe tigergraph.graphdb.tigergraph.com/tg-cluster-2
+Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Normal Create init ConfigMap 20s TigerGraph Create a new init ConfigMap success
+ Normal Create env ConfigMap 20s TigerGraph Create a new env ConfigMap success
+ Warning Failed to create external rest service 10s (x11 over 20s) TigerGraph Failed to create external service: Service "tg-cluster-2-rest-external-service" is invalid: spec.ports[0].nodePort: Invalid value: 30090: provided port is already allocated
+```
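+
+For example, you can first list the NodePorts that are already in use and then recreate the cluster with free ones (the port values below are illustrative):
+
+```bash
+# See which services already claim NodePorts
+kubectl get svc --all-namespaces | grep NodePort
+
+# Remove the conflicting cluster and recreate it with ports that do not appear in the list above
+kubectl tg delete --cluster-name tg-cluster-2
+kubectl tg create --cluster-name tg-cluster-2 --listener-type NodePort --rest-node-port 30093 --gui-node-port 30243
+```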
diff --git a/k8s/docs/05-troubleshoot/cluster-management.md b/k8s/docs/05-troubleshoot/cluster-management.md
new file mode 100644
index 00000000..362172d5
--- /dev/null
+++ b/k8s/docs/05-troubleshoot/cluster-management.md
@@ -0,0 +1,463 @@
+# TigerGraph Cluster Management Troubleshooting
+
+This document provides solutions for common issues that may arise during the management of a TigerGraph cluster in Kubernetes.
+
+## Troubleshooting Steps for updating cluster
+
+- Verify the CPU and memory resources of the cluster Custom Resource (CR) have been updated:
+
+ ```bash
+ kubectl get tg test-cluster -o json -n tigergraph|jq .spec.resources
+
+ {
+ "requests": {
+ "cpu": "2",
+ "memory": "8Gi"
+ }
+ }
+ ```
+
+- Ensure the CPU and memory resources of the cluster's StatefulSet have been updated:
+
+ ```bash
+ kubectl get statefulset test-cluster -o json -n tigergraph|jq .spec.template.spec.containers[0].resources
+ {
+ "requests": {
+ "cpu": "2",
+ "memory": "8Gi"
+ }
+ }
+ ```
+
+ If the resources haven't been updated, the cluster might be in another process, such as upgrading or scaling. In this case, check the cluster's status to determine the ongoing process. If the resource update is not initiated, you may need to wait for the last operation to complete:
+
+ ```bash
+ kubectl tg status --cluster-name test-cluster -n tigergraph
+
+ Name: test-cluster
+ Namespace: tigergraph
+ Labels:
+ Annotations:
+ API Version: graphdb.tigergraph.com/v1alpha1
+ Kind: TigerGraph
+ Metadata:
+ Creation Timestamp: 2023-02-27T07:47:28Z
+ Generation: 2
+ ......
+ Spec:
+ Image: docker.io/tigergrah/tigergraph-k8s:3.8.0
+ Image Pull Policy: Always
+ Image Pull Secrets:
+ Name: tigergraph-image-pull-secret
+ Init Job:
+ Image: docker.io/tigergrah/tigergraph-k8s-init:0.0.3
+ Image Pull Policy: Always
+ Image Pull Secrets:
+ Name: tigergraph-image-pull-secret
+ Init TG Config:
+ App Root: /home/tigergraph/tigergraph/app
+ Data Root: /home/tigergraph/tigergraph/data
+ Ha: 2
+ License:
+ Log Root: /home/tigergraph/tigergraph/log
+ Password: tigergraph
+ Privatekey: /home/tigergraph/.ssh/tigergraph_rsa
+ Temp Root: /home/tigergraph/tigergraph/tmp
+ Username: tigergraph
+ Version: 3.8.0
+ Listener:
+ Type: LoadBalancer
+ Replicas: 3
+ Resources:
+ Requests:
+ Cpu: 16
+ Memory: 32Gi
+ Storage:
+ Type: persistent-claim
+ Volume Claim Template:
+ Access Modes:
+ ReadWriteOnce
+ Resources:
+ Requests:
+ Storage: 10G
+ Storage Class Name: standard
+ Volume Mode: Filesystem
+ Status:
+ Cluster Topology:
+ test-cluster-0:
+ gui
+ restpp
+ test-cluster-1:
+ gui
+ restpp
+ test-cluster-2:
+ gui
+ restpp
+ Conditions:
+ Last Probe Time: 2023-02-27T08:29:48Z
+ Status: Unknown
+ Type: UpdateRoll
+ Last Probe Time: 2023-02-27T08:29:48Z
+ Message: Hello GSQL
+ Status: True
+ Type: test-cluster-0-rest-Available
+ Last Probe Time: 2023-02-27T08:29:48Z
+ Message: Hello GSQL
+ Status: True
+ Type: test-cluster-1-rest-Available
+ Last Probe Time: 2023-02-27T08:29:48Z
+ Message: Get "": dial tcp 10.131.0.15:9000: connect: connection refused
+ Status: Unknown
+ Type: test-cluster-2-rest-Available
+ Image: docker.io/tigergrah/tigergraph-k8s:3.8.0
+ Listener:
+ Type: LoadBalancer
+ Replicas: 3
+ Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Normal InitConfigMap created 42m TigerGraph Creating a new init configmap success
+ Normal EnvConfigMap created 42m TigerGraph Creating a new env configmap success
+ Normal external service created 42m TigerGraph Create a new external rest service success.
+ Normal external service created 42m TigerGraph Create a new external gui service success.
+ Normal internal service created 42m TigerGraph Create a new internal service success.
+ Normal StatefulSet created 42m TigerGraph Create a new StatefulSet success.
+ Normal Init job created 42m TigerGraph Create a new init job success.
+ Normal Update StatefulSet 4s TigerGraph Update a StatefulSet success.
+ ```
+
+### Potential failure of update
+
+- If the updated resources (CPU or memory) exceed the available resources of the Kubernetes (K8s) cluster, the pods of the cluster will remain in a pending state. In such cases, you need to adjust the cluster configuration to allocate suitable resources:
+
+ ```bash
+ kubectl get pod -l app=test-cluster,tigergraph.com/cluster-pod=test-cluster -n tigergraph
+
+ NAME READY STATUS RESTARTS AGE
+ test-cluster-0 0/1 Pending 0 58s
+ test-cluster-1 0/1 Pending 0 58s
+ test-cluster-2 0/1 Pending 0 58s
+ ```
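+
+  For example, a hedged sketch using the update command shown in the deployment troubleshooting guide (pick CPU and memory values that your nodes can actually satisfy):
+
+  ```bash
+  kubectl tg update --cluster-name test-cluster --cpu 4 --memory 8Gi -n tigergraph
+  ```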
+
+- License updates via the K8s Operator are only supported starting from Operator version 0.0.7. You can perform a license update using one of the following methods:
+  - Use the `kubectl tg update --license ${LICENSE}` command (supported since version 0.0.7).
+ - Use the License function in the GUI to access the Admin Portal for updates.
+ - Use `gadmin config entry license` to update the license in the pod. Refer to the TigerGraph Docs for more information.
+
+- Resizing Persistent Volumes (PV) via the K8s Operator is not yet supported. Please refer to the [manual documentation](../07-reference/expand-persistent-volume.md) for instructions.
+
+## Troubleshooting Steps for upgrading cluster
+
+- Verify that the Docker image of the cluster CR has been updated:
+
+ ```bash
+ kubectl get tg test-cluster -o json -n tigergraph|jq .spec.image
+
+ "docker.io/tigergrah/tigergraph-k8s:3.8.0"
+ ```
+
+- Ensure the Docker image of the cluster's StatefulSet has been updated:
+
+ ```bash
+ kubectl get statefulset test-cluster -o json -n tigergraph|jq .spec.template.spec.containers[0].image
+
+ "docker.io/tigergrah/tigergraph-k8s:3.9.0"
+ ```
+
+ If the images haven't been updated, the cluster might be in another process, such as resource updating or scaling. Check the cluster's status to determine the ongoing process. If the cluster upgrade is not initiated, you may need to wait for the last operation to complete:
+
+ ```bash
+ kubectl tg status --cluster-name test-cluster -n tigergraph
+ ```
+
+- Ensure the upgrade process completes successfully. Verify the following:
+
+ - The cluster status should be "Normal," and the actual image version should match the expected version:
+
+ ```bash
+ kubectl tg status --cluster-name test-cluster -n tigergraph
+ ```
+
+ - Check the status and version of the TigerGraph cluster by executing the following commands:
+
+ ```bash
+ kubectl exec -it test-cluster-0 -n tigergraph -- /home/tigergraph/tigergraph/app/cmd/gadmin version
+ kubectl exec -it test-cluster-0 -n tigergraph -- /home/tigergraph/tigergraph/app/cmd/gadmin status -v
+ ```
+
+### Potential failure of upgrading
+
+- If you configure an incorrect upgrade version, the cluster's pods will end up in the ErrImagePull or ImagePullBackOff state. In that case, update the cluster configuration with the correct version:
+
+ ```bash
+ kubectl get pod -l tigergraph.com/cluster-pod=test-cluster -n tigergraph
+
+ NAME READY STATUS RESTARTS AGE
+ test-cluster-0 0/1 ErrImagePull 0 63s
+ test-cluster-1 0/1 ImagePullBackOff 0 63s
+ test-cluster-2 0/1 ImagePullBackOff 0 63s
+
+ kubectl tg update --cluster-name test-cluster --version ${CORRECT_VERSION} -n tigergraph
+ ```
+
+- Rolling back to a lower TigerGraph version is not supported
+
+  Applies to Operator version 0.0.9 (note that this limitation exists in all versions of the Operator and may persist in future releases).
+
+  A special case arises when attempting to downgrade the TigerGraph cluster from a higher version to a lower one, for example from 3.9.2 to 3.9.1. The rolling update itself will initially proceed, but the UpgradePost job then fails because downgrading is disabled, so you cannot simply revert to a previous version for recovery.
+
+  If you do need to revert to the previous version, a two-step process is required: delete the cluster, then recreate it with the identical cluster name. This rolls the cluster back to the desired version and restores normal operation.
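+
+  A rough outline of that recovery path, assuming the cluster CR is kept in a file named `test-cluster.yaml` (hypothetical) and that the persistent volumes survive the deletion so the recreated cluster can reload the existing data:
+
+  ```bash
+  # Delete the cluster CR; confirm beforehand that the PVCs/data are retained
+  kubectl delete tg test-cluster -n tigergraph
+
+  # Recreate the cluster with the identical name, pointing back at the previous version
+  kubectl apply -f test-cluster.yaml -n tigergraph
+  ```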
+
+## Troubleshooting Steps for scaling cluster
+
+### Expansion
+
+- Ensure that the pods of the cluster have been scaled up to the expected size:
+
+ ```bash
+ # test-cluster is the name of the cluster
+ # The example below tries to scale the cluster size from 3 to 5
+ kubectl get pod -l tigergraph.com/cluster-pod=test-cluster -n tigergraph
+
+ NAME READY STATUS RESTARTS AGE
+ test-cluster-0 1/1 Running 0 17m
+ test-cluster-1 1/1 Running 0 17m
+ test-cluster-2 1/1 Running 0 17m
+ test-cluster-3 0/1 ContainerCreating 0 8s
+ test-cluster-4 0/1 ContainerCreating 0 7s
+ ```
+
+- Ensure the expansion job is running or has completed successfully:
+
+ ```bash
+  # replace test-cluster with your tigergraph cluster name
+ kubectl get job -l job-name=test-cluster-expand-job -n tigergraph
+
+ NAME COMPLETIONS DURATION AGE
+ test-cluster-expand-job 0/1 4m13s 4m13s
+ ```
+
+  If the expansion job fails, you can check the logs of the job.
+
+ ```bash
+  # replace test-cluster with your tigergraph cluster name
+ kubectl get pod -l job-name=test-cluster-expand-job -n tigergraph
+
+ NAME READY STATUS RESTARTS AGE
+ test-cluster-expand-job-6jk42 1/1 Running 0 5m38s
+
+ kubectl logs test-cluster-expand-job-6jk42 -n tigergraph
+ Could not create directory '/.ssh' (Permission denied).
+ Failed to add the host to the list of known hosts (/.ssh/known_hosts).
+ [ Info] Starting EXE
+ [ Info] Starting CTRL
+ [ Info] Generating config files to all machines
+ [ Info] Successfully applied configuration change. Please restart services to make it effective immediately.
+ [ Info] Initializing KAFKA
+ [ Info] Starting EXE
+ [ Info] Starting CTRL
+ [ Info] Starting ZK ETCD DICT KAFKA ADMIN GSE NGINX GPE RESTPP KAFKASTRM-LL KAFKACONN TS3SERV GSQL TS3 IFM GUI
+ [ Info] Applying config
+ [Warning] No difference from staging config, config apply is skipped.
+ [ Info] Successfully applied configuration change. Please restart services to make it effective immediately.
+ [ Info] Cluster is initialized successfully
+ [ Info] Configuration has been changed. Please use 'gadmin config apply' to persist the changes.
+ [Warning] No difference from staging config, config apply is skipped.
+ [ Info] Successfully applied configuration change. Please restart services to make it effective immediately.
+ [ Info] Stopping ZK ETCD DICT KAFKA ADMIN GSE NGINX GPE RESTPP KAFKASTRM-LL KAFKACONN TS3SERV GSQL TS3 IFM GUI
+ [ Info] Stopping CTRL
+ [ Info] Stopping EXE
+ [ Info] Starting EXE
+ [ Info] Starting CTRL
+ [ Info] Starting ZK ETCD DICT KAFKA ADMIN GSE NGINX GPE RESTPP KAFKASTRM-LL KAFKACONN TS3SERV GSQL TS3 IFM GUI
+ Could not create directory '/.ssh' (Permission denied).
+ Failed to add the host to the list of known hosts (/.ssh/known_hosts).
+ [ Info] Starting EXE
+ [ Info] Starting CTRL
+ [ Info] Generating config files to all machines
+ [ Info] Successfully applied configuration change. Please restart services to make it effective immediately.
+ [ Info] Initializing KAFKA
+ [ Info] Starting EXE
+ [ Info] Starting CTRL
+ [ Info] Starting ZK ETCD DICT KAFKA ADMIN GSE NGINX GPE RESTPP KAFKASTRM-LL KAFKACONN TS3SERV GSQL TS3 IFM GUI
+ [ Info] Applying config
+ [Warning] No difference from staging config, config apply is skipped.
+ [ Info] Successfully applied configuration change. Please restart services to make it effective immediately.
+ [ Info] Cluster is initialized successfully
+ [ Info] Configuration has been changed. Please use 'gadmin config apply' to persist the changes.
+ [Warning] No difference from staging config, config apply is skipped.
+ [ Info] Successfully applied configuration change. Please restart services to make it effective immediately.
+ [ Info] Stopping ZK ETCD DICT KAFKA ADMIN GSE NGINX GPE RESTPP KAFKASTRM-LL KAFKACONN TS3SERV GSQL TS3 IFM GUI
+ [ Info] Stopping CTRL
+ [ Info] Stopping EXE
+ [ Info] Starting EXE
+ [ Info] Starting CTRL
+ [ Info] Starting ZK ETCD DICT KAFKA ADMIN GSE NGINX GPE RESTPP KAFKASTRM-LL KAFKACONN TS3SERV GSQL TS3 IFM GUI
+ Could not create directory '/.ssh' (Permission denied).
+ Failed to add the host to the list of known hosts (/.ssh/known_hosts).
+ hostlist is: m4:test-cluster-3.test-cluster-internal-service,m5:test-cluster-4.test-cluster-internal-service
+ You have entered:
+
+ m4 : test-cluster-3.test-cluster-internal-service
+ m5 : test-cluster-4.test-cluster-internal-service
+
+ Replication number will be changed to 2. The previous value is 2
+ [ Info] [Mon Feb 27 09:34:54 UTC 2023] Validate cluster change requests
+ [ Info] [Mon Feb 27 09:34:54 UTC 2023] Export gsql/gui data
+ [ Info] [Mon Feb 27 09:34:54 UTC 2023] Export graph data, time cost will be proportional to data size
+ ```
+
+- Ensure that the expansion operation has been performed successfully:
+
+ - Check the status of the cluster CR:
+
+ ```bash
+ kubectl tg status --cluster-name test-cluster -n tigergraph
+ ```
+
+ - Check the cluster status by executing gadmin:
+
+ ```bash
+ kubectl exec -it test-cluster-0 -n tigergraph -- /home/tigergraph/tigergraph/app/cmd/gadmin status -v
+ ```
+
+#### Potential failure of expansion
+
+- If the K8s cluster's resources (CPU or Memory) are insufficient to expand the TigerGraph cluster, you have two options based on the Operator version:
+
+ - For Operator versions 0.0.3 and earlier, recreate the cluster with the same cluster name, which will load the remaining cluster data for recovery.
+
+ - For Operator versions 0.0.4 and higher, update the size to match the K8s cluster's available resources or reset the cluster to the previous configuration.
+
+- The expansion job fails again after retrying three times
+
+  - If you backed up the cluster before expansion, you can restore it from the backup package directly.
+
+  - If there is no backup package, a complex manual recovery is required, so it is best to back up the cluster before expansion.
+
+- Repeated pod restarts after shrink/expand operations, because GPE is not ready in time and the health check fails. After an expansion or shrink finishes, services such as GPE may take a while to switch from Warmup to Running. This process can exceed 40 seconds, which is the upper limit of the health check. If the services of one or more TigerGraph nodes, but not all of them, are still in the Warmup state, their pods will be removed and rebuilt. This can lead to repeated pod restarts, roughly 5 to 12 times depending on the amount of data, until the TG service status of all pods is normal. At present, the Operator cannot limit this waiting time, detect the maximum warmup time, or make the waiting time configurable.
+
+  Solution:
+
+ - Wait for all services to return to normal status before taking any action.
+
+ ```bash
+ kubectl describe pods tg-cluster-1
+
+ Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Warning Unhealthy 31m kubelet Readiness probe failed: service status of GPE_2#1 is Down, exit error.
+ [Warning] Status might not be up-to-date due to sync data error; failed to get latest offset, err is "kafka server: Request was for a topic or partition that does not exist on this broker."
+ Warning Unhealthy 30m kubelet Readiness probe failed: service status of GPE_2#1 is Down, exit error.
+ Warning Unhealthy 29m (x9 over 74m) kubelet Readiness probe failed: service status of GPE_2#1 should not be Warmup, exit error
+ kubectl describe pods tg-cluster-0
+ Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Warning Unhealthy 30m (x5 over 31m) kubelet Readiness probe failed: service status of GPE_1#1 is Down, exit error.
+ Warning Unhealthy 30m (x7 over 72m) kubelet Readiness probe failed: service status of GPE_1#1 should not be Warmup, exit error
+ kubectl describe pods tg-cluster-2
+ Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Warning Unhealthy 30m (x5 over 31m) kubelet Readiness probe failed: service status of GPE_3#1 is Down, exit error.
+ Warning Unhealthy 30m (x7 over 78m) kubelet Readiness probe failed: service status of GPE_3#1 should not be Warmup, exit error
+ kubectl describe pods tg-cluster-3 | tail -10
+ Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Warning Unhealthy 31m kubelet Readiness probe failed: service status of GPE_4#1 is Down, exit error.
+ [Warning] Status might not be up-to-date due to sync data error; failed to get latest offset, err is "kafka server: Request was for a topic or partition that does not exist on this broker."
+ Warning Unhealthy 31m kubelet Readiness probe failed: service status of GPE_4#1 is Down, exit error.
+ Warning Unhealthy 30m (x8 over 80m) kubelet Readiness probe failed: service status of GPE_4#1 should not be Warmup, exit error
+ ```
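+
+  One low-effort way to wait, assuming the cluster from the example above is named `tg-cluster` and runs in the current namespace, is to block until every pod reports Ready again:
+
+  ```bash
+  # Readiness passes once GPE/GSE leave the Warmup state on every pod
+  kubectl wait --for=condition=Ready pod -l tigergraph.com/cluster-pod=tg-cluster --timeout=30m
+  ```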
+
+### Shrinking
+
+- Ensure that the shrinking job is running or has completed successfully:
+
+ ```bash
+  # replace test-cluster with your tigergraph cluster name
+ kubectl get job test-cluster-shrink-pre-job -n tigergraph
+
+ NAME COMPLETIONS DURATION AGE
+ test-cluster-shrink-pre-job 0/1 21s 21s
+ ```
+
+ If the shrinking job fails, you can check the job logs:
+
+ ```bash
+ kubectl get pod -l job-name=test-cluster-shrink-pre-job -n tigergraph
+
+ NAME READY STATUS RESTARTS AGE
+ test-cluster-shrink-pre-job-jzlhm 1/1 Running 0 2m11s
+
+ kubectl logs test-cluster-shrink-pre-job-jzlhm -n tigergraph
+
+ Warning: Permanently added 'test-cluster-0.test-cluster-internal-service.tigergraph' (ED25519) to the list of known hosts.
+ hostlist is: m4:test-cluster-3.test-cluster-internal-service,m5:test-cluster-4.test-cluster-internal-service
+ You have entered:
+
+ m5 : test-cluster-4.test-cluster-internal-service
+ m4 : test-cluster-3.test-cluster-internal-service
+
+ Replication number will be changed to 2. The previous value is 2
+ [ Info] [Mon Feb 27 10:06:21 UTC 2023] Validate cluster change requests
+ [ Info] [Mon Feb 27 10:06:21 UTC 2023] Export gsql/gui data
+ [ Info] [Mon Feb 27 10:06:21 UTC 2023] Export graph data, time cost will be proportional to data size
+ ```
+
+- Ensure that the shrinking operation has been performed successfully:
+
+ - Check the status of the cluster CR:
+
+ ```bash
+ kubectl tg status --cluster-name test-cluster -n tigergraph
+ ```
+
+ - Check the cluster status by executing gadmin:
+
+ ```bash
+ kubectl exec -it test-cluster-0 -n tigergraph -- /home/tigergraph/tigergraph/app/cmd/gadmin status -v
+ ```
+
+#### Potential failure of shrinking
+
+- The shrinking job fails again after retrying three times
+
+ - If you have a backup of the cluster before shrinking, you can restore it directly using the backup package.
+
+  - If there is no backup package, manual recovery is a complex process, so it is recommended to back up the cluster before shrinking.
+
+## Troubleshooting Steps for External Service Accessibility
+
+### TigerGraph GUI
+
+> [!IMPORTANT]
+> TigerGraph 3.6.3 doesn't support session affinity in an HA environment
+
+In TigerGraph 3.6.3, Session Affinity is not supported. Direct external access through the Service (SVC) may lead to the following issues:
+
+1. The TigerGraph GUI application may repeatedly prompt an authentication error after login because authentication occurs on one node, and requests sent to the GUI may be directed to other nodes.
+2. Sending HTTP/HTTPS requests to the GUI service may yield similar results.
+
+ ```bash
+ curl -H "Cookie: TigerGraphApp=5eaf48c3-cb0b-4c78-9f27-7251c53dc0bc" http://34.29.233.2:14240/api/loading-jobs/ldbc_snb/meta
+ {"error":true,"message":"You are not authorized to use this API.","results":{}}%
+ ```
+
+- Solution
+
+1. Use the latest version of TigerGraph that supports Session Affinity.
+
+2. To enable Session Affinity, edit the service (SVC) configuration:
+
+ ```bash
+ kubectl edit svc tg-cluster-1-gui-external-service
+ # edit the sessionAffinity as below
+ sessionAffinity: ClientIP
+ sessionAffinityConfig:
+ clientIP:
+ timeoutSeconds: 1800
+
+ ```
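+
+If you prefer not to edit the Service interactively, the same change can be applied as a one-off patch; a sketch, assuming the GUI external service from the example above (`tg-cluster-1-gui-external-service`):
+
+```bash
+# Enable ClientIP session affinity with a 30-minute timeout
+kubectl patch svc tg-cluster-1-gui-external-service --type merge \
+  -p '{"spec":{"sessionAffinity":"ClientIP","sessionAffinityConfig":{"clientIP":{"timeoutSeconds":1800}}}}'
+```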
diff --git a/k8s/docs/05-troubleshoot/operator-installation.md b/k8s/docs/05-troubleshoot/operator-installation.md
new file mode 100644
index 00000000..0bf831f0
--- /dev/null
+++ b/k8s/docs/05-troubleshoot/operator-installation.md
@@ -0,0 +1,140 @@
+# TigerGraph Operator Installation Troubleshooting
+
+This document outlines common issues and provides solutions for troubleshooting TigerGraph Operator installation in a Kubernetes environment.
+
+## Troubleshooting Steps
+
+In the following steps, we assume that the TigerGraph Operator has been installed in the `tigergraph` namespace. Please adapt the commands according to your specific setup.
+
+- Verify Operator Installation
+
+ Ensure that the TigerGraph Operator has been successfully installed. If not, install the operator first.
+
+ ```bash
+ kubectl get deployment tigergraph-operator-controller-manager -o wide -n tigergraph
+
+ NAME READY UP-TO-DATE AVAILABLE AGE CONTAINERS IMAGES SELECTOR
+ tigergraph-operator-controller-manager 1/1 1 1 22m manager,kube-rbac-proxy docker.io/tigergrah/tigergraph-k8s-operator:0.0.3,gcr.io/kubebuilder/kube-rbac-proxy:v0.8.0 control-plane=controller-manager
+ ```
+
+  From the output of the above command, you can see that the Operator version is 0.0.3 (docker.io/tigergrah/tigergraph-k8s-operator:0.0.3). You can also use the following helm command to get the current version of the Operator:
+
+ ```bash
+ helm list -n tigergraph
+
+ NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION
+ tg-operator tigergraph 1 2023-02-26 13:16:15.701059001 +0000 UTC deployed tg-operator-0.0.3
+
+ ```
+
+- If the Operator has been installed, check whether it is running normally:
+
+ ```bash
+ kubectl get pods -l control-plane=controller-manager -n tigergraph
+
+ NAME READY STATUS RESTARTS AGE
+ tigergraph-operator-controller-manager-c8d65bcd9-t2mkz 2/2 Running 0 12m
+ ```
+
+  If the status is `Running` and the `READY` field shows `2/2`, the Operator deployment is running normally.
+
+- If the Operator is not running normally, follow these steps:
+ - Get the Pod Name of the Operator
+
+ ```bash
+ kubectl get pods -l control-plane=controller-manager -n tigergraph
+
+ NAME READY STATUS RESTARTS AGE
+ tigergraph-operator-controller-manager-c8d65bcd9-t2mkz 2/2 Running 0 12m
+ ```
+
+ - Make sure to replace the pod name with the one from the previous command's output:
+
+ ```bash
+ kubectl describe pod tigergraph-operator-controller-manager-c8d65bcd9-t2mkz -n tigergraph
+ ```
+
+ - To identify the root cause of the Operator pod restarting, check its logs:
+
+ ```bash
+ kubectl logs tigergraph-operator-controller-manager-c8d65bcd9-t2mkz -f -n tigergraph
+ ```
+
+- If you run multiple Operator pods for high availability, check the logs of these pods to find the Operator leader, and then examine the leader's logs.
+
+ - Get pod name of Operator
+
+ ```bash
+ kubectl get pods -l control-plane=controller-manager -n tigergraph
+ NAME READY STATUS RESTARTS AGE
+ tigergraph-operator-controller-manager-869b885466-5qwkp 2/2 Running 0 28h
+ tigergraph-operator-controller-manager-869b885466-6cq8w 2/2 Running 0 28h
+ tigergraph-operator-controller-manager-869b885466-6jpnq 2/2 Running 0 28h
+ ```
+
+ - Check logs of these pods
+
+ ```bash
+ kubectl logs tigergraph-operator-controller-manager-869b885466-5qwkp -n tigergraph
+ kubectl logs tigergraph-operator-controller-manager-869b885466-6cq8w -n tigergraph
+ kubectl logs tigergraph-operator-controller-manager-869b885466-6jpnq -n tigergraph
+ ```
+
+  - The leader's logs should include output like the following:
+
+ ```bash
+ I0509 07:57:53.476671 1 leaderelection.go:248] attempting to acquire leader lease tigergraph/9d6fe668.tigergraph.com...
+ 2023-05-09T07:57:53.476Z INFO starting metrics server {"path": "/metrics"}
+ 2023-05-09T07:57:53.476Z INFO controller-runtime.webhook.webhooks starting webhook server
+ 2023-05-09T07:57:53.476Z INFO controller-runtime.certwatcher Updated current TLS certificate
+ 2023-05-09T07:57:53.477Z INFO controller-runtime.webhook serving webhook server {"host": "", "port": 9443}
+ 2023-05-09T07:57:53.477Z INFO controller-runtime.certwatcher Starting certificate watcher
+ I0509 07:57:53.498264 1 leaderelection.go:258] successfully acquired lease tigergraph/9d6fe668.tigergraph.com
+ ```
+
+## Potential failures
+
+- Before installing the operator, ensure that cert-manager has been installed. Failure to do so may result in the following error during operator installation:
+
+ ```bash
+ Error: INSTALLATION FAILED: unable to build kubernetes objects from release manifest: [resource mapping not found for name: "tigergraph-operator-serving-cert" namespace: "tigergraph" from "": no matches for kind "Certificate" in version "cert-manager.io/v1"
+ ```
+
+- To verify that cert-manager is installed and running normally, follow these steps:
+
+ - Check cert-manager Deployments:
+
+ ```bash
+ kubectl get deployment -n cert-manager cert-manager
+
+ kNAME READY UP-TO-DATE AVAILABLE AGE
+ cert-manager 1/1 1 1 5m27s
+
+ kubectl get deployment -n cert-manager cert-manager-cainjector
+
+ NAME READY UP-TO-DATE AVAILABLE AGE
+ cert-manager-cainjector 1/1 1 1 5m27s
+
+ kubectl get deployment -n cert-manager cert-manager-webhook
+
+ NAME READY UP-TO-DATE AVAILABLE AGE
+ cert-manager-webhook 1/1 1 1 5m28s
+ ```
+
+ - If cert-manager is not installed, install it:
+
+ ```bash
+ kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.8.0/cert-manager.yaml
+ ```
+
+ - Ensure cert-manager is running normally:
+
+ ```bash
+ kubectl wait deployment -n cert-manager cert-manager --for condition=Available=True --timeout=90s
+ kubectl wait deployment -n cert-manager cert-manager-cainjector --for condition=Available=True --timeout=90s
+ kubectl wait deployment -n cert-manager cert-manager-webhook --for condition=Available=True --timeout=90s
+ ```
diff --git a/k8s/docs/05-troubleshoot/rolling-update.md b/k8s/docs/05-troubleshoot/rolling-update.md
new file mode 100644
index 00000000..4e471993
--- /dev/null
+++ b/k8s/docs/05-troubleshoot/rolling-update.md
@@ -0,0 +1,434 @@
+# TigerGraph Cluster Rolling Update Troubleshoot
+
+This document provides solutions for common issues that may arise during a rolling update of a TigerGraph cluster.
+
+## Prerequisite knowledge
+
+### Cluster Operations Triggering Rolling Updates
+
+A rolling update of a TigerGraph cluster on Kubernetes can be triggered by the following cluster operations:
+
+- Updating the resources (CPU and Memory) of TigerGraph Pods
+- Adding or updating the init or sidecar container for TigerGraph Pods
+- Adding or updating Node/Pod Affinity
+- Updating the TigerGraph Docker image version to trigger the TigerGraph cluster upgrade
+- Changing the default configuration of the TigerGraph Pod and container when upgrading the Operator version. For example, in Operator version 0.0.7, the TerminationGracePeriodSeconds of the TG Pod was changed from 1 minute to 6 minutes; this change triggers a rolling update after the Operator is upgraded successfully.
+
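+For example, one way to trigger the first operation above (a resource update) is to patch the cluster CR directly; a minimal sketch, assuming a cluster named `test-cluster` in the `tigergraph` namespace and that you are not using `kubectl tg update` for this change:
+
+```bash
+# Raise the requested CPU/memory of the TigerGraph pods; the Operator then
+# rolls the StatefulSet to apply the new resources, one pod at a time
+kubectl patch tg test-cluster -n tigergraph --type merge \
+  -p '{"spec":{"resources":{"requests":{"cpu":"8","memory":"16Gi"}}}}'
+```
+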
+### Rolling update strategy of StatefulSet
+
+The Kubernetes Operator uses a StatefulSet to manage TigerGraph Pods during rolling updates. Key points of the rolling update strategy include:
+
+- The StatefulSet controller deletes and recreates each Pod in the StatefulSet.
+- Pods are updated in order of termination, from the largest ordinal to the smallest.
+- Each Pod is updated one at a time.
+- The Kubernetes control plane waits until an updated Pod is in the Running and Ready state before updating its predecessor. This can occasionally result in the rolling update process getting stuck.
+
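+To follow a rolling update as it proceeds, standard StatefulSet tooling is enough; a short sketch, assuming the StatefulSet shares the cluster name `test-cluster`:
+
+```bash
+# The StatefulSet is named after the cluster, so its rollout can be tracked directly
+kubectl rollout status statefulset/test-cluster -n tigergraph
+
+# Alternatively, watch the pods cycle through Terminating -> Pending -> Running
+kubectl get pods -l tigergraph.com/cluster-pod=test-cluster -n tigergraph -w
+```
+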
+### Factors Affecting TigerGraph Pod's Running and Ready State
+
+- The requested resources (CPU, Memory, and Persistent Volume) of the TigerGraph Pod must be met for it to be in the Running state.
+- The PostStart Handler of the TigerGraph container must run without errors for the Pod to be considered Ready.
+- The readiness check of the TigerGraph container must pass for the Pod to be Ready.
+- The liveness check of the TigerGraph container must pass for the Pod to be Ready.
+- All Init containers of the TigerGraph pod must complete successfully; otherwise, the TigerGraph container will not start running.
+
+## How to Recover from Rolling Update Failure
+
+### Pod Stuck in Pending State Due to Unmet Resource Needs
+
+The Pod may be stuck in the Pending state due to unmet resource needs. A typical example is the following:
+
+```bash
+kubectl get pods
+NAME READY STATUS RESTARTS AGE
+test-cluster-0 1/1 Running 1 (11m ago) 42h
+test-cluster-1 0/1 Pending 0 41h
+test-cluster-2 1/1 Running 1 (11m ago) 42h
+tigergraph-operator-controller-manager-9868c59f6-w58mh 2/2 Running 3 (10m ago) 41h
+```
+
+You can check the Pod events to find which specific resource needs are unmet:
+
+```bash
+$ kubectl describe pod test-cluster-1
+Name: test-cluster-1
+Namespace: tigergraph
+Priority: 0
+Node:
+Labels: controller-revision-hash=test-cluster-7f7b4c5599
+ statefulset.kubernetes.io/pod-name=test-cluster-1
+ tigergraph.com/cluster-name=test-cluster
+ tigergraph.com/cluster-pod=test-cluster
+Annotations:
+Status: Pending
+IP:
+IPs:
+Controlled By: StatefulSet/test-cluster
+Init Containers:
+ init-tigergraph:
+ Image: alpine:3.17.2
+ Port:
+ Host Port:
+ Command:
+ sh
+ -c
+ chown -R 1000:1000 /home/tigergraph/tigergraph/data
+ Environment:
+ Mounts:
+ /home/tigergraph/tigergraph/data from tg-data (rw)
+ /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-n9jg7 (ro)
+Containers:
+ tigergraph:
+ Image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ Ports: 9000/TCP, 14240/TCP, 22/TCP
+ Host Ports: 0/TCP, 0/TCP, 0/TCP
+ Requests:
+ cpu: 2
+ memory: 6Gi
+ ...
+Conditions:
+ Type Status
+ PodScheduled False
+Volumes:
+ tg-data:
+ Type: PersistentVolumeClaim (a reference to a PersistentVolumeClaim in the same namespace)
+ ClaimName: tg-data-test-cluster-1
+ ReadOnly: false
+ config-volume:
+ Type: ConfigMap (a volume populated by a ConfigMap)
+ Name: test-cluster-init-config
+ Optional: false
+ probe-data:
+ Type: EmptyDir (a temporary directory that shares a pod's lifetime)
+ Medium:
+ SizeLimit:
+ private-key-volume:
+ Type: Secret (a volume populated by a Secret)
+ SecretName: ssh-key-secret
+ Optional: false
+ tg-log:
+ Type: EmptyDir (a temporary directory that shares a pod's lifetime)
+ Medium:
+ SizeLimit:
+ kube-api-access-n9jg7:
+ Type: Projected (a volume that contains injected data from multiple sources)
+ TokenExpirationSeconds: 3607
+ ConfigMapName: kube-root-ca.crt
+ ConfigMapOptional:
+ DownwardAPI: true
+QoS Class: Burstable
+Node-Selectors:
+Tolerations: node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
+ node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
+Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Warning FailedScheduling 17h (x298 over 41h) default-scheduler 0/1 nodes are available: 1 Insufficient cpu. preemption: 0/1 nodes are available: 1 No preemption victims found for incoming pod.
+ Warning FailedScheduling 11m default-scheduler 0/1 nodes are available: 1 Insufficient cpu. preemption: 0/1 nodes are available: 1 No preemption victims found for incoming pod.
+```
+
+From the events above, you can see that CPU resources are insufficient. In such cases, add new nodes first; the rolling update will resume once the resource requirements are fulfilled. Alternatively, you can reduce the requested CPU or memory resources.
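+
+To see how much schedulable CPU and memory each node actually has left before deciding between adding nodes and lowering the requests, the standard node views are usually enough; a quick check, assuming only a working `kubectl` context (and a metrics server for the second command):
+
+```bash
+# Compare requested resources against each node's allocatable capacity
+kubectl describe nodes | grep -A 8 "Allocated resources"
+
+# If the metrics server is installed, this gives a quick live overview
+kubectl top nodes
+```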
+
+### TG container is in PostStartHookError or CrashLoopBackOff
+
+The PostStart handler script must execute successfully when the TigerGraph container restarts. If it does not, the TigerGraph pod will enter the PostStartHookError state, or CrashLoopBackOff after multiple failures. A typical failure looks like this:
+
+```bash
+kubectl get pods
+NAME READY STATUS RESTARTS AGE
+test-cluster-0 1/1 Running 0 9m
+test-cluster-1 0/1 PostStartHookError 0 (4s ago) 23s
+test-cluster-2 1/1 Running 0 88s
+test-cluster-init-job-4sjjm 0/1 Completed 0 6m12s
+tigergraph-operator-controller-manager-6745f8c5bc-jfw9r 2/2 Running
+$ kubectl get pods
+NAME READY STATUS RESTARTS AGE
+test-cluster-0 1/1 Running 0 9m27s
+test-cluster-1 0/1 CrashLoopBackOff 1 (14s ago) 50s
+test-cluster-2 1/1 Running 0 115s
+test-cluster-init-job-4sjjm 0/1 Completed 0 6m39s
+tigergraph-operator-controller-manager-6745f8c5bc-jfw9r 2/2 Running 0 0
+```
+
+You can check the logs of the PostStart handler script first with the following command (**requires TG Docker image version 3.9.3 and Operator version 0.0.9**):
+
+> [!NOTE]
+> The failure examples below are intentionally simulated; in practice you are unlikely to hit this problem.
+
+```bash
+$ kubectl logs test-cluster-1
+Defaulted container "tigergraph" out of: tigergraph, init-tigergraph (init)
+[Sun Jul 23 03:59:38 UTC 2023] tigergraph container is running now
+[Sun Jul 23 03:59:52 UTC 2023] the config file /home/tigergraph/.tg.cfg is not exist
+
+$ kubectl logs test-cluster-1
+Defaulted container "tigergraph" out of: tigergraph, init-tigergraph (init)
+Name: test-cluster-1.test-cluster-internal-service.tigergraph.svc.cluster.local
+Address: 10.244.0.196
+
+Server: 10.96.0.10
+Address: 10.96.0.10#53
+
+Name: test-cluster-2.test-cluster-internal-service.tigergraph.svc.cluster.local
+Address: 10.244.0.198
+
+[Wed Jul 12 04:36:02 UTC 2023] tigergraph container is running now
+[Wed Jul 12 04:36:17 UTC 2023] try to start all services on the current node
+[ Info] Starting EXE
+[Wed Jul 12 04:36:18 UTC 2023] start service EXE_3 of current node successfully
+[ Info] Starting CTRL
+[Wed Jul 12 04:36:18 UTC 2023] failed to start service CTRL of all nodes for 1 times
+[ Info] Starting CTRL
+[Wed Jul 12 04:37:18 UTC 2023] failed to start service CTRL of all nodes for 2 times
+[ Info] Starting CTRL
+[Wed Jul 12 04:38:18 UTC 2023] failed to start service CTRL of all nodes for 3 times
+[Wed Jul 12 04:38:18 UTC 2023] failed to start all services of current node
+```
+
+The PostStart handler script attempts to start all services on the current Pod. If it fails, the TigerGraph container is restarted. In such cases, you need to identify the root cause by examining the logs of the current container and addressing the specific issues first.
+
+You can use the following commands to log in to the TigerGraph pod and review the logs of the failed services.
+
+- Log into the TigerGraph cluster pod
+
+ ```bash
+  # the following command is equivalent to `kubectl exec -it test-cluster-0 -- bash`
+ kubectl tg connect --cluster-name test-cluster --namespace tigergraph
+ ```
+
+- Check the TigerGraph services status
+
+  ```bash
+  gadmin status
+  # for example, if GSE is down, you can check its log path.
+ gadmin log GSE
+ ```
+
+- Start the down service
+
+ ```bash
+ gadmin start GSE
+ ```
+
+### Readiness or Liveness Check of TigerGraph Container Fails
+
+The liveness check will monitor the listener port of the executor. If it remains down after four retry attempts, the container will be terminated and then restarted.
+
+Therefore, if the executor cannot be started as expected, you should log in to the Pod to review the error logs of the executor and address those issues first.
+
+The readiness check of the TigerGraph container assesses the service status of the current TG container. If an unexpected issue causes the readiness check to fail, the rolling update will remain stalled until the readiness check succeeds.
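+
+If you want to see exactly what the probes run, they can be read back from the Pod spec; a small sketch, assuming the stuck Pod is `test-cluster-1` in the `tigergraph` namespace:
+
+```bash
+# Print the readiness and liveness probe definitions of the TigerGraph container
+kubectl get pod test-cluster-1 -n tigergraph \
+  -o jsonpath='{.spec.containers[0].readinessProbe}{"\n"}{.spec.containers[0].livenessProbe}{"\n"}'
+```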
+
+For example, during a rolling update you can intentionally stop all services to trigger a readiness check failure, which leaves the rolling update stuck at the Pod named `test-cluster-1`:
+
+```bash
+kubectl get pods
+NAME READY STATUS RESTARTS AGE
+test-cluster-0 0/1 Running 0 39m
+test-cluster-1 0/1 Running 7 (18m ago) 32m
+test-cluster-2 0/1 Running 0 33m
+test-cluster-init-job-62qwz 0/1 Completed 0 37m
+tigergraph-operator-controller-manager-6745f8c5bc-kfssx 2/2 Running 0 39m
+```
+
+Check the events of Pod test-cluster-1:
+
+```bash
+$ kubectl describe pod test-cluster-1
+Name: test-cluster-1
+Namespace: tigergraph
+Priority: 0
+Node: tg-control-plane/172.18.0.2
+Start Time: Sun, 23 Jul 2023 05:00:10 +0000
+Labels: controller-revision-hash=test-cluster-568859648
+ statefulset.kubernetes.io/pod-name=test-cluster-1
+ tigergraph.com/cluster-name=test-cluster
+ tigergraph.com/cluster-pod=test-cluster
+ tigergraph.com/gui-service=true
+ tigergraph.com/nginx-service=true
+ tigergraph.com/restpp-service=true
+Annotations:
+Status: Running
+IP: 10.244.0.39
+IPs:
+ IP: 10.244.0.39
+Controlled By: StatefulSet/test-cluster
+Init Containers:
+ init-tigergraph:
+ Container ID: containerd://0f1b4141969080be757a8cbf1d4e62122dc2846ea0968416a790b95cadcacc3f
+ Image: alpine:3.17.2
+ Image ID: docker.io/library/alpine@sha256:ff6bdca1701f3a8a67e328815ff2346b0e4067d32ec36b7992c1fdc001dc8517
+ Port:
+ Host Port:
+ Command:
+ sh
+ -c
+ chown -R 1000:1000 /home/tigergraph/tigergraph/data
+ State: Terminated
+ Reason: Completed
+ Exit Code: 0
+ Started: Sun, 23 Jul 2023 05:00:11 +0000
+ Finished: Sun, 23 Jul 2023 05:00:11 +0000
+ Ready: True
+ Restart Count: 0
+ Environment:
+ Mounts:
+ /home/tigergraph/tigergraph/data from tg-data (rw)
+ /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-cftws (ro)
+Containers:
+ tigergraph:
+ Container ID: containerd://f47028197d9376f558a088b5b88cb34e0f00f6639a433d115b29b493b54c2e87
+ Image: docker.io/tginternal/tigergraph-k8s:3.9.2-post-start
+ Image ID: docker.io/tginternal/tigergraph-k8s@sha256:dd3dd058fbef7eae77cf51e622c467d290ceeaf9644b8392b5b0eec4920b84de
+ Ports: 9000/TCP, 14240/TCP, 22/TCP
+ Host Ports: 0/TCP, 0/TCP, 0/TCP
+ State: Running
+ Started: Sun, 23 Jul 2023 05:19:38 +0000
+ Last State: Terminated
+ Reason: Error
+ Exit Code: 143
+ Started: Sun, 23 Jul 2023 05:13:13 +0000
+ Finished: Sun, 23 Jul 2023 05:14:33 +0000
+ Ready: False
+ Restart Count: 7
+ Requests:
+ cpu: 2
+ memory: 7Gi
+...
+Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Normal Scheduled 34m default-scheduler Successfully assigned tigergraph/test-cluster-1 to tg-control-plane
+ Normal Pulled 34m kubelet Container image "alpine:3.17.2" already present on machine
+ Normal Started 34m kubelet Started container init-tigergraph
+ Normal Created 34m kubelet Created container init-tigergraph
+ Normal Pulled 34m kubelet Successfully pulled image "docker.io/tginternal/tigergraph-k8s:3.9.2-post-start" in 2.034685698s
+ Normal Pulled 32m kubelet Successfully pulled image "docker.io/tginternal/tigergraph-k8s:3.9.2-post-start" in 338.940713ms
+ Warning FailedPreStopHook 31m (x2 over 32m) kubelet Exec lifecycle hook ([/bin/bash -c
+ if [ "$(ls -A /home/tigergraph/tigergraph/data/|grep -v lost|tail -1)" ]; then
+ export PATH=/home/tigergraph/tigergraph/app/cmd:$PATH
+ PROCESS_ALL=$(ghostname|awk '{$1=""}1'| \
+ awk '{for(x=1;x<=NF;x++)if(x % 2)printf "%s", $x (x == NF || x == (NF-1)?"\n":" ")}')
+ if [ $? != 0 ]; then exit 0; fi
+ gadmin stop $PROCESS_ALL -y
+ fi]) for Container "tigergraph" in Pod "test-cluster-1_tigergraph(7f81bbfb-c425-493f-881a-2e1cf502d44d)" failed - error: command '/bin/bash -c
+ if [ "$(ls -A /home/tigergraph/tigergraph/data/|grep -v lost|tail -1)" ]; then
+ export PATH=/home/tigergraph/tigergraph/app/cmd:$PATH
+ PROCESS_ALL=$(ghostname|awk '{$1=""}1'| \
+ awk '{for(x=1;x<=NF;x++)if(x % 2)printf "%s", $x (x == NF || x == (NF-1)?"\n":" ")}')
+ if [ $? != 0 ]; then exit 0; fi
+ gadmin stop $PROCESS_ALL -y
+ fi' exited with 1: , message: "ExternalError (Failed to get the APP root from config; The file ~/.tg.cfg either does not exist or is a broken link. Please create a new symlink at this location and point it to the tg.cfg file located in the 'configs' directory of System.DataRoot. This can be done using the following command: ln -s /path/to/System.DataRoot/configs/tg.cfg ~/.tg.cfg; open /home/tigergraph/.tg.cfg: no such file or directory)\n"
+ Normal Killing 31m (x2 over 32m) kubelet FailedPostStartHook
+ Normal Pulling 31m (x3 over 34m) kubelet Pulling image "docker.io/tginternal/tigergraph-k8s:3.9.2-post-start"
+ Normal Pulled 31m kubelet Successfully pulled image "docker.io/tginternal/tigergraph-k8s:3.9.2-post-start" in 315.864405ms
+ Normal Created 31m (x3 over 34m) kubelet Created container tigergraph
+ Normal Started 31m (x3 over 34m) kubelet Started container tigergraph
+ Warning BackOff 18m (x41 over 31m) kubelet Back-off restarting failed container
+ Warning Unhealthy 13m kubelet Readiness probe failed: command "/bin/bash -c \n\t\t\t\t\t\t\t\t\t\t\texport PATH=/home/tigergraph/tigergraph/app/cmd:$PATH\n\t\t\t\t\t\t\t\t\t\t\tcommand gadmin > /dev/null 2>&1\n\t\t\t\t\t\t\t\t\t\t\tif [ $? != 0 ]; then exit 0; fi\n\t\t\t\t\t\t\t\t\t\t\tDATA_ROOT=$(gadmin config get system.dataroot --file ~/.tg.cfg)\n\t\t\t\t\t\t\t\t\t\t\tCFG_FILE=${DATA_ROOT}/configs/tg.cfg\n\t\t\t\t\t\t\t\t\t\t\tif [ ! -f $CFG_FILE ]; then\n\t\t\t\t\t\t\t\t\t\t\t exit 0\n\t\t\t\t\t\t\t\t\t\t\tfi\n\t\t\t\t\t\t\t\t\t\t\tPROCESS_ALL=$(ghostname|awk '{$1=\"\"}1'| awk '{for(x=1;x<=NF;x++)if(x % 2)printf \"%s\", \\\n\t\t\t\t\t\t\t\t\t\t\t$x (x == NF || x == (NF-1)?\"\\n\":\" \")}')\n\t\t\t\t\t\t\t\t\t\t\tif [ $? != 0 ]; then exit 0; fi\n\t\t\t\t\t\t\t\t\t\t\tgadmin status -v $PROCESS_ALL|grep -v Online|head -n -1|tail -n +4 | \\\n\t\t\t\t\t\t\t\t\t\t\tawk '{print $2,$4,$6}'| while read -r service_info;\n\t\t\t\t\t\t\t\t\t\t\tdo\n\t\t\t\t\t\t\t\t\t\t\t service_name=$(echo $service_info|awk '{print $1}')\n\t\t\t\t\t\t\t\t\t\t\t service_status=$(echo $service_info|awk '{print $2}')\n\t\t\t\t\t\t\t\t\t\t\t process_status=$(echo $service_info|awk '{print $3}')\n\t\t\t\t\t\t\t\t\t\t\t if [ \"$service_status\" != \"Warmup\" ]; then\n\t\t\t\t\t\t\t\t\t\t\t\techo \"service status of $service_name is $service_status, exit error.\"\n\t\t\t\t\t\t\t\t\t\t\t\texit 1\n\t\t\t\t\t\t\t\t\t\t\t else\n\t\t\t\t\t\t\t\t\t\t\t\tif [[ $service_name =~ ^GPE.* ]] || [[ $service_name =~ ^GSE.* ]]; then\n\t\t\t\t\t\t\t\t\t\t\t\t if ! test -f ~/tigergraph/data/gstore/0/part/config.yaml; then\n\t\t\t\t\t\t\t\t\t\t\t\t\tcontinue\n\t\t\t\t\t\t\t\t\t\t\t\t else\n\t\t\t\t\t\t\t\t\t\t\t\t\techo \"service status of $service_name should not be Warmup, exit error\"\n\t\t\t\t\t\t\t\t\t\t\t\t\texit 1\n\t\t\t\t\t\t\t\t\t\t\t\t fi\n\t\t\t\t\t\t\t\t\t\t\t\telse\n\t\t\t\t\t\t\t\t\t\t\t\t echo \"service status of $service_name is $service_status, not Online, exit error\"\n\t\t\t\t\t\t\t\t\t\t\t\t exit 1\n\t\t\t\t\t\t\t\t\t\t\t\tfi\n\t\t\t\t\t\t\t\t\t\t\t fi\n\t\t\t\t\t\t\t\t\t\t\tdone\n\t\t\t\t\t\t\t\t\t\t\t" timed out
+ Warning Unhealthy 3m54s (x35 over 12m) kubelet Readiness probe failed: service status of ADMIN#2 is Down, exit error.
+[Warning] Status might not be up-to-date due to sync data error; failed to get latest offset, err is "kafka: client has run out of available brokers to talk to (Is your cluster reachable?)"
+```
+
+You can execute the following command to start all services that are down; the rolling update will then continue:
+
+```bash
+kubectl exec -it test-cluster-0 -- /home/tigergraph/tigergraph/app/cmd/gadmin start all --auto-restart
+Defaulted container "tigergraph" out of: tigergraph, init-tigergraph (init)
+[ Info] Starting EXE
+[ Info] Starting CTRL
+[ Info] Starting ZK ETCD DICT KAFKA ADMIN GSE NGINX GPE RESTPP KAFKASTRM-LL KAFKACONN TS3SERV GSQL TS3 IFM GUI
+```
+
+If the above command fails due to other issues, you can rerun it until all services are online.
+
+```bash
+kubectl exec -it test-cluster-0 -- /home/tigergraph/tigergraph/app/cmd/gadmin status -v
+Defaulted container "tigergraph" out of: tigergraph, init-tigergraph (init)
++--------------------+-------------------------+-------------------------+-------------------------+
+| Service Name | Service Status | Process State | Process ID |
++--------------------+-------------------------+-------------------------+-------------------------+
+| ADMIN#1 | Online | Running | 2118 |
+| ADMIN#2 | Online | Running | 380177 |
+| ADMIN#3 | Online | Running | 73481 |
+| CTRL#1 | Online | Running | 849 |
+| CTRL#2 | Online | Running | 375698 |
+| CTRL#3 | Online | Running | 72151 |
+| DICT#1 | Online | Running | 988 |
+| DICT#2 | Online | Running | 376859 |
+| DICT#3 | Online | Running | 72319 |
+| ETCD#1 | Online | Running | 955 |
+| ETCD#2 | Online | Running | 376824 |
+| ETCD#3 | Online | Running | 72307 |
+| EXE_1 | Online | Running | 820 |
+| EXE_2 | Online | Running | 28447 |
+| EXE_3 | Online | Running | 41917 |
+| GPE_1#1 | Warmup | Running | 2167 |
+| GPE_1#2 | Warmup | Running | 380278 |
+| GSE_1#1 | Warmup | Running | 2128 |
+| GSE_1#2 | Warmup | Running | 380186 |
+| GSQL#1 | Online | Running | 2399 |
+| GSQL#2 | Online | Running | 8187 |
+| GSQL#3 | Online | Running | 73919 |
+| GUI#1 | Online | Running | 2555 |
+| GUI#2 | Online | Running | 8338 |
+| GUI#3 | Online | Running | 74265 |
+| IFM#1 | Online | Running | 2499 |
+| IFM#2 | Online | Running | 8287 |
+| IFM#3 | Online | Running | 74125 |
+| KAFKA#1 | Online | Running | 1056 |
+| KAFKA#2 | Online | Running | 377205 |
+| KAFKA#3 | Online | Running | 72386 |
+| KAFKACONN#1 | Warmup | Running | 2224 |
+| KAFKACONN#2 | Online | Running | 362949 |
+| KAFKACONN#3 | Online | Running | 73630 |
+| KAFKASTRM-LL_1 | Online | Running | 2186 |
+| KAFKASTRM-LL_2 | Online | Running | 8073 |
+| KAFKASTRM-LL_3 | Online | Running | 73526 |
+| NGINX#1 | Online | Running | 2142 |
+| NGINX#2 | Online | Running | 380188 |
+| NGINX#3 | Online | Running | 73501 |
+| RESTPP#1 | Online | Running | 2169 |
+| RESTPP#2 | Online | Running | 380319 |
+| RESTPP#3 | Online | Running | 73512 |
+| TS3SERV#1 | Online | Running | 2311 |
+| TS3_1 | Online | Running | 2441 |
+| TS3_2 | Online | Running | 8233 |
+| TS3_3 | Online | Running | 73988 |
+| ZK#1 | Online | Running | 864 |
+| ZK#2 | Online | Running | 376115 |
+| ZK#3 | Online | Running | 72166 |
++--------------------+-------------------------+-------------------------+-------------------------+
+```
+
+The rolling update process will continue and eventually succeed.
+
+```bash
+kubectl get pods -w
+NAME READY STATUS RESTARTS AGE
+test-cluster-0 0/1 Running 0 48m
+test-cluster-1 0/1 Running 7 (27m ago) 42m
+test-cluster-2 0/1 Running 0 43m
+test-cluster-init-job-62qwz 0/1 Completed 0 47m
+tigergraph-operator-controller-manager-6745f8c5bc-kfssx 2/2 Running 0 48m
+test-cluster-1 1/1 Running 7 (27m ago) 42m
+test-cluster-0 1/1 Running 0 49m
+test-cluster-2 1/1 Running 0 44m
+test-cluster-0 1/1 Terminating 0 49m
+test-cluster-0 0/1 Terminating 0 49m
+test-cluster-0 0/1 Terminating 0 49m
+test-cluster-0 0/1 Terminating 0 49m
+test-cluster-0 0/1 Pending 0 0s
+test-cluster-0 0/1 Pending 0 0s
+test-cluster-0 0/1 Init:0/1 0 0s
+test-cluster-0 0/1 PodInitializing 0 1s
+test-cluster-0 0/1 PodInitializing 0 13s
+test-cluster-0 0/1 Running 0 2m6s
+test-cluster-0 1/1 Running 0 2m7s
+```
+
+> [!IMPORTANT]
+> There are some best practices you can use for rolling update failover.
+
+1. If the rolling update becomes stuck in the Pending or PostStartHookError state, identify the root cause by examining the Pod events and the TigerGraph container logs.
+2. Avoid executing `gadmin stop all -y` during a rolling update, as its behavior is undefined.
+3. If the rolling update becomes stuck due to a service shutdown, execute `gadmin start all --auto-restart`.
+4. If `gadmin start all --auto-restart` fails to start all services, log in to the Pods and fix the specific services that are down first.
diff --git a/k8s/docs/06-FAQs/README.md b/k8s/docs/06-FAQs/README.md
new file mode 100644
index 00000000..e5c770b9
--- /dev/null
+++ b/k8s/docs/06-FAQs/README.md
@@ -0,0 +1,85 @@
+# TigerGraph FAQs on Kubernetes
+
+## Are hardware (on-premise) licenses valid on Kubernetes, and how do I renew the license?
+
+If you have deployed a TigerGraph cluster on-premise with a hardware license, you can't reuse that license on Kubernetes.
+
+The hardware license is invalid on Kubernetes; you need to apply for a special license for TigerGraph on K8s.
+
+The easiest way to update the license is to use `kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --license ${LICENSE}`. Alternatively, you can log in to one TigerGraph pod and execute `gadmin config set System.License && gadmin config apply -y`.
+
+## Does the Operator support downgrading TigerGraph?
+
+Downgrading TigerGraph is not recommended and its behavior is undefined. If you accidentally performed a downgrade, you can find the recovery steps in the [troubleshoot section](../05-troubleshoot/cluster-management.md).
+
+## Does TigerGraph Operator support resizing the persistent volume for an existing TigerGraph cluster on K8s?
+
+At present, Kubernetes offers automatic resizing for persistent volumes, but not for volumes linked to StatefulSets. In addition, some CSI drivers do not support `ALLOWVOLUMEEXPANSION`, so the Operator does not resize volumes automatically. You can refer to [persistent volume resizing](../07-reference/expand-persistent-volume.md) to do it manually.
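+
+For orientation only, the manual route boils down to expanding each PVC where the StorageClass allows it; a hedged sketch, assuming a StorageClass with `allowVolumeExpansion: true` and PVCs named `tg-data-<pod-name>` as created by the Operator (the linked document is authoritative and may include additional steps):
+
+```bash
+# Expand one pod's data volume; repeat for every tg-data-* PVC in the cluster
+kubectl patch pvc tg-data-test-cluster-0 -n tigergraph --type merge \
+  -p '{"spec":{"resources":{"requests":{"storage":"20G"}}}}'
+```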
+
+## Does the TigerGraph cluster support high availability during cluster management operations such as resource updates, upgrades, scaling, and backup?
+
+TigerGraph does not currently offer high availability during upgrade and scaling operations; high availability is maintained for other operations.
+
+It is strongly recommended to start a backup before any upgrade or scaling activity. This precaution keeps your data safe and preserves system integrity.
+
+## How do I know the status of cluster management? Do I need to confirm the cluster status before modifying the TigerGraph cluster CR configuration?
+
+TigerGraph itself does not maintain a record of the cluster status throughout its lifecycle, but that status is pivotal for the TigerGraph Operator: it determines which operations can be executed at any given moment.
+
+To facilitate this, the TigerGraph Operator manages the cluster status based on the user's operational configurations, ensuring that the cluster and the Operator interact cohesively and that the desired operations are executed accurately.
+
+By keeping track of the cluster status, the Operator can schedule operations safely and efficiently.
+
+Currently, the TigerGraph Operator divides the cluster status into six states; new states may be added as requirements evolve.
+
+The TigerGraph cluster states in the Operator are as follows:
+
+| State of TigerGraph on Kubernetes | Description |
+|----------|----------|
+| Normal | The TigerGraph cluster is in a ready state; any cluster operation is allowed |
+| Initialization | Preparing the TigerGraph pods and initializing the TigerGraph cluster |
+| Update | The TigerGraph cluster is in a rolling update, which indicates that CPU, memory, or other pod configurations are being updated |
+| Upgrade | The TigerGraph cluster is upgrading: pulling the new image version and running the upgrade job |
+| Expand | The TigerGraph cluster is scaling up: preparing new pods and running the expansion job |
+| Shrink | The TigerGraph cluster is scaling down: running the shrink job and scaling down pods |
+
+You can execute the following command to check the status of the TigerGraph cluster on Kubernetes:
+
+```bash
+kubectl get tg ${TIGERGRAPH_CLUSTER_NAME} -o yaml -n ${NAMESPACE}|yq .status
+clusterSize: 3
+clusterTopology:
+ test-cluster-0:
+ - gui
+ - nginx
+ - restpp
+ test-cluster-1:
+ - gui
+ - nginx
+ - restpp
+ test-cluster-2:
+ - gui
+ - nginx
+ - restpp
+conditions:
+ - lastProbeTime: "2023-08-23T08:37:00Z"
+ status: "True"
+ type: Normal
+ - lastProbeTime: "2023-08-24T05:46:24Z"
+ message: Hello GSQL
+ status: "True"
+ type: test-cluster-0-rest-Available
+ - lastProbeTime: "2023-08-24T05:46:24Z"
+ message: Hello GSQL
+ status: "True"
+ type: test-cluster-1-rest-Available
+ - lastProbeTime: "2023-08-24T05:46:24Z"
+ message: Hello GSQL
+ status: "True"
+ type: test-cluster-2-rest-Available
+ha: 2
+image: docker.io/tginternal/tigergraph-k8s:3.9.2
+listener:
+ type: LoadBalancer
+replicas: 3
+```
diff --git a/k8s/docs/07-reference/configure-tigergraph-cluster-cr-with-yaml-manifests.md b/k8s/docs/07-reference/configure-tigergraph-cluster-cr-with-yaml-manifests.md
new file mode 100644
index 00000000..096316c0
--- /dev/null
+++ b/k8s/docs/07-reference/configure-tigergraph-cluster-cr-with-yaml-manifests.md
@@ -0,0 +1,366 @@
+# How to configure TG Cluster on K8s using TigerGraph CR
+
+This document introduces how to configure the TG cluster using TigerGraph CR. It covers the following content:
+
+- Configure resources
+
+- Configure TigerGraph deployment
+
+## Configure resources
+
+Before deploying a TG cluster, it is necessary to be familiar with the hardware and software requirements depending on your needs. For details, refer to [Hardware and Software Requirements](https://docs.tigergraph.com/tigergraph-server/current/installation/hw-and-sw-requirements).
+
+To ensure the proper scheduling and stable operation of the components of the TG cluster on Kubernetes, it is recommended to set Guaranteed-level quality of service (QoS) by making `limits` equal to `requests` when configuring resources. For details, refer to [Configure Quality of Service for Pods](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/).
+
+## Configure TG deployment
+
+To configure a TG deployment, you need to configure the TigerGraph CR. Refer to the following example.
+
+```yaml
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-cluster
+spec:
+ image: docker.io/tginternal/tigergraph-k8s:3.9.3
+ imagePullPolicy: IfNotPresent
+ initJob:
+ image: docker.io/tginternal/tigergraph-k8s-init:0.0.9
+ imagePullPolicy: IfNotPresent
+ initTGConfig:
+ ha: 2
+ license: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+ listener:
+ type: LoadBalancer
+ privateKeyName: ssh-key-secret
+ replicas: 6
+ resources:
+ requests:
+ cpu: "4"
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ resources:
+ requests:
+ storage: 100G
+ storageClassName: standard
+```
+
+### Cluster name
+
+The cluster name can be configured by changing `metadata.name` in the `TigerGraph` CR. Cluster names must be unique within a namespace.
+
+### TigerGraph cluster version
+
+The TigerGraph cluster version can be configured by changing `spec.image` in the `TigerGraph` CR. You can also specify `imagePullPolicy` and `imagePullSecrets` according to your needs.
+
+In addition, you need to specify the TG version by changing `spec.initTGConfig.version`, which is required when initializing and upgrading the cluster (before Operator version 0.0.8).
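+
+For instance, an upgrade can be expressed as a patch to these two fields; a minimal sketch, assuming a cluster named `test-cluster` in the `tigergraph` namespace and an Operator older than 0.0.8 (which still requires `initTGConfig.version`):
+
+```bash
+# Point the cluster at the new image and the matching TigerGraph version;
+# the Operator then performs the rolling upgrade
+kubectl patch tg test-cluster -n tigergraph --type merge \
+  -p '{"spec":{"image":"docker.io/tginternal/tigergraph-k8s:3.9.3","initTGConfig":{"version":"3.9.3"}}}'
+```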
+
+### TigerGraph cluster size and HA factor
+
+The TigerGraph cluster size can be configured by changing `spec.replicas` in the `TigerGraph` CR, and the HA factor can be configured by changing `spec.initTGConfig.ha`. The default HA factor is 1.
+
+### TigerGraph Cluster license
+
+The TigerGraph cluster license is required for TigerGraph deployment, and it can be configured by changing `spec.initTGConfig.license` in the `TigerGraph` CR.
+
+A free license is available through this link [ftp://ftp.graphtiger.com/lic/license3.txt](ftp://ftp.graphtiger.com/lic/license3.txt), which is valid for 14 days.
+
+### Service account name of TigerGraph pod(Optional)
+
+A service account name for the TigerGraph pod is required to acquire permissions on some special K8s distributions, such as OpenShift.
+
+You can create a service account, grant it the required permissions, and then configure it by setting `spec.serviceAccountName` in the `TigerGraph` CR.
+
+It is an optional configuration; you can omit it if there are no permission issues.
+
+### Private ssh key name of TigerGraph Cluster
+
+The field `privateKeyName` is a mandatory configuration for Operator 0.0.4 and later.
+
+A private SSH key pair is required for security when running TigerGraph on K8s. Create the key pair, then create a Secret from the key files:
+
+```bash
+# create a new private key pair
+echo -e 'y\n' | ssh-keygen -b 4096 -t rsa -f $HOME/.ssh/tigergraph_rsa -q -N ''
+
+# create a K8s Secret from the above ssh key files
+kubectl create secret generic ssh-key-secret --from-file=private-ssh-key=$HOME/.ssh/tigergraph_rsa --from-file=public-ssh-key=$HOME/.ssh/tigergraph_rsa.pub --namespace YOUR_NAME_SPACE
+```
+
+Then set `spec.privateKeyName` to the name of the Secret you created above.
+
+### Storage volumes of TigerGraph Cluster
+
+Storage volumes can be configured by changing `spec.storage`. There are two storage types: `persistent-claim` and `ephemeral`. For production, use the `persistent-claim` type to store data on persistent volumes.
+
+- persistent-claim
+
+```yaml
+spec:
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ resources:
+ requests:
+ storage: 10G
+ storageClassName: standard
+```
+
+- ephemeral
+
+```yaml
+spec:
+ storage:
+ type: ephemeral
+```
+
+### Resource requests and limits of TigerGraph pod
+
+The resource requests and limits of the TG cluster pods can be configured by changing `spec.resources.requests` and `spec.resources.limits` in the `TigerGraph` CR.
+
+```yaml
+spec:
+ resources:
+ limits:
+ cpu: 8
+ memory: 100Gi
+ requests:
+ cpu: 8
+ memory: 100Gi
+```
+
+### External access service
+
+The TigerGraph Operator provides three types of external access services: LoadBalancer, NodePort, and Ingress. The type can be configured by changing `spec.listener.type` in the `TigerGraph` CR.
+
+- LoadBalancer
+
+```yaml
+spec:
+ listener:
+ type: LoadBalancer
+```
+
+- NodePort
+
+```yaml
+spec:
+ listener:
+ type: NodePort
+ restNodePort: 30090
+ studioNodePort: 30240
+```
+
+- Ingress
+
+```yaml
+spec:
+ listener:
+ type: Ingress
+ restHost: tigergraph-api.k8s.company.com
+ studioHost: tigergraph-studio.k8s.company.com
+ secretName: k8s.company.com
+```
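+
+Once the cluster is up, the addresses exposed by the chosen listener type can be read back from the generated Services; a quick check, assuming the service names follow the `<cluster-name>-<rest|gui>-external-service` pattern seen elsewhere in these docs:
+
+```bash
+# List the externally exposed services created for the cluster
+kubectl get services -n tigergraph | grep external-service
+
+# Read back the LoadBalancer address of the GUI service (name assumed)
+kubectl get service test-cluster-gui-external-service -n tigergraph \
+  -o jsonpath='{.status.loadBalancer.ingress[0].ip}'
+```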
+
+### Customized labels and annotations for external service
+
+If you want to add customized labels and annotations to the external services, configure them by adding `spec.listener.labels` and `spec.listener.annotations` in the `TigerGraph` CR.
+
+```yaml
+spec:
+ listener:
+ type: LoadBalancer
+ labels:
+ label-key: label-value
+ annotations:
+ annotation-key: annotation-value
+```
+
+### Initialize Job configuration of TigerGraph cluster
+
+A special job is required to initialize the TigerGraph cluster when deploying TigerGraph on K8s. You need to specify the image version of the init job; usually it is the same as the Operator version you installed.
+
+It can be configured by changing `spec.initJob` in the `TigerGraph` CR. `imagePullPolicy` and `imagePullSecrets` are optional; you can omit them if you don't need them.
+
+```yaml
+spec:
+ initJob:
+ image: docker.io/tginternal/tigergraph-k8s-init:${OPERATOR_VERSION}
+ imagePullPolicy: IfNotPresent
+ imagePullSecrets:
+ - name: tigergraph-image-pull-secret
+```
+
+### Container Customization of TigerGraph pods
+
+The TigerGraph CR supports customizing the containers of TG pods, including init containers, sidecar containers, and container volumes. To learn more about this feature, refer to [InitContainers, SidecarContainers and CustomVolumes](../03-deploy/custom-containers.md).
+
+Init containers can be configured by changing `spec.initContainers`; you can add multiple init containers through this field. For the available Container fields, refer to the K8s Container API: [https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#Container](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#Container)
+
+```yaml
+spec:
+ initContainers:
+ - args:
+ - /bin/sh
+ - -c
+ - echo "this is init-container test"
+ image: alpine:3.17.3
+ name: init-container-test
+ securityContext:
+ capabilities:
+ add:
+ - NET_ADMIN
+ privileged: true
+```
+
+Sidecar containers can be configured by changing `spec.sidecarContainers`; you can add multiple sidecar containers through this field.
+
+```yaml
+spec:
+ sidecarContainers:
+ - args: # sidecar will execute this
+ - /bin/sh
+ - -c
+ - |
+ while true; do
+ echo "$(date) INFO hello from main-container" >> /var/log/myapp.log ;
+ sleep 1;
+ done
+ image: alpine:3.17.2
+ name: main-container # name of sidecar
+ readinessProbe: # check if the sidecar is ready
+ exec:
+ command:
+ - sh
+ - -c
+ - if [[ -f /var/log/myapp.log ]];then exit 0; else exit 1;fi
+ initialDelaySeconds: 10
+ periodSeconds: 5
+ resources:
+      requests: # request resources for the sidecar
+ cpu: 2
+ memory: 1Gi
+ limits: # limit resources
+ cpu: 4
+ memory: 4Gi
+ env: # inject the environment you need
+ - name: CLUSTER_NAME
+ value: test-cluster
+ volumeMounts:
+ - mountPath: /var/log
+ name: tg-log # this volume is used by TG, you can access log of tg here
+```
+
+Additional volumes can be configured by changing `spec.customVolumes`. If you need to mount extra volumes into the init container or sidecar container, you can update this configuration.
+
+The Operator creates two volumes by default: `tg-data`, which persists the data of the TG cluster, and `tg-log`, which stores the TG logs and is mounted at `/home/tigergraph/tigergraph/log`. You can mount the volume named `tg-log` at `/home/tigergraph/tigergraph/log` in a sidecar to access the TG logs.
+
+For detailed configurations of the different volume types, refer to [https://kubernetes.io/docs/concepts/storage/volumes](https://kubernetes.io/docs/concepts/storage/volumes).
+
+```yaml
+spec:
+ customVolumes:
+ - name: auth-sidecar-config
+ configMap:
+ name: auth-sidecar-configmap
+ - name: credentials
+ emptyDir:
+ medium: Memory
+ - name: fallback-config
+ configMap:
+ name: fallback
+ optional: true
+ - name: heap-dump
+ hostPath:
+ path: /var/tmp/heapdump
+ type: DirectoryOrCreate
+```
+
+### NodeSelector, Affinity, and Toleration configuration
+
+NodeSelector, Affinity, and Toleration can be configured by changing `spec.affinityConfiguration`. For specific use cases of these configurations, refer to [NodeSelector, Affinity and Toleration use cases](../03-deploy/affinity-use-cases.md).
+
+- NodeSelector
+
+```yaml
+spec:
+ affinityConfiguration:
+ nodeSelector:
+ disktype: ssd
+```
+
+- Toleration
+
+```yaml
+spec:
+ affinityConfiguration:
+ tolerations:
+ - key: "userGroup"
+ operator: "Equal"
+ value: "enterprise"
+ effect: "NoExecute"
+```
+
+- Affinity
+
+```yaml
+spec:
+ affinityConfiguration:
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: disktype
+ operator: In
+ values:
+ - ssd
+ podAntiAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ - labelSelector:
+ matchExpressions:
+ - key: tigergraph.com/cluster-pod
+ operator: In
+ values:
+ - test-cluster
+ topologyKey: topology.kubernetes.io/zone
+```
+
+## API reference of TigerGraphSpec
+
+TigerGraphSpec contains the details of TigerGraph members
+
+| Field | Description |
+|----------|----------|
+| replicas | The desired TG cluster size |
+| image | The desired TG docker image |
+| imagePullPolicy | (*Optional*)The image pull policy of the TG docker image, default is IfNotPresent |
+| imagePullSecrets | (*Optional*)The image pull secrets used to pull the TG docker image from a private registry |
+| initJob.image | The desired TG Init docker image |
+| initJob.imagePullPolicy | (*Optional*)The image pull policy of the TG Init docker image, default is IfNotPresent |
+| initJob.imagePullSecrets | (*Optional*)The image pull secrets used to pull the TG Init docker image from a private registry |
+| serviceAccountName | (*Optional*)The service account name of the pod, used to acquire special permissions |
+| privateKeyName | The secret name of private ssh key files |
+| initTGConfig.ha | The replication factor of TG cluster |
+| initTGConfig.license | The license of TG cluster |
+| initTGConfig.version | The TG cluster version to initialize or upgrade |
+| listener.type | The type of external access service, which can be set to LoadBalancer, NodePort, and Ingress |
+| listener.restNodePort | The rest service port which is required when setting listener.type to NodePort |
+| listener.studioNodePort | The gui service port which is required when setting listener.type to NodePort |
+| listener.restHost | The domain name of rest service which is required when setting listener.type to Ingress |
+| listener.studioHost| The domain name of gui service which is required when setting listener.type to Ingress |
+| listener.secretName | (*Optional*)The secretName is the name of the secret used to terminate TLS traffic on port 443 when setting listener.type to Ingress |
+| listener.labels | (*Optional*)The customized labels will be added to external service |
+| listener.annotations | (*Optional*)The customized annotations will be added to external service |
+| resources | [The compute resource requirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.22/#resourcerequirements-v1-core) |
+| initContainers | The [init containers](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#Container) run in TigerGraph pods. |
+| sidecarContainers | (*Optional*)The [sidecar containers](https://kubernetes.io/docs/reference/kubernetes-api/workload-resources/pod-v1/#Container) run in TG pods |
+| customVolumes | (*Optional*)The custom [volumes](https://kubernetes.io/docs/concepts/storage/volumes/) used in init container and sidecar container |
+| affinityConfiguration | (*Optional*)The configurations for NodeSelector, Affinity, and Tolerations |
+| affinityConfiguration.nodeSelector | (*Optional*)The configuration of assigning pods to special nodes using [NodeSelector](https://kubernetes.io/docs/tasks/configure-pod-container/assign-pods-nodes/) |
+| affinityConfiguration.tolerations | (*Optional*)The [tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) configuration of TigerGraph pod |
+| affinityConfiguration.affinity | (*Optional*)The [affinity](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#inter-pod-affinity-and-anti-affinity) configuration of TigerGraph pod |
diff --git a/k8s/docs/07-reference/expand-persistent-volume.md b/k8s/docs/07-reference/expand-persistent-volume.md
new file mode 100644
index 00000000..bae4474d
--- /dev/null
+++ b/k8s/docs/07-reference/expand-persistent-volume.md
@@ -0,0 +1,167 @@
+# How to resize persistent volumes of TigerGraph cluster on Kubernetes
+
+This document provides instructions on resizing persistent volumes for a TigerGraph cluster on Kubernetes.
+
+Currently, Kubernetes offers automatic volume resizing for persistent volumes, but not for volumes associated with StatefulSets. The TigerGraph Kubernetes Operator relies on StatefulSets for orchestrating TigerGraph pods. Therefore, when dealing with persistent volumes associated with the TigerGraph Operator, a unique set of manual procedures is required to facilitate volume resizing.
+
+> [!WARNING]
+> Resizing PVCs using this method only works if your StorageClass has `allowVolumeExpansion: true`.
+
+Follow these steps to resize persistent volumes attached to the StatefulSet of a TigerGraph cluster on Kubernetes:
+
+1. Update the storage class to allow volume expansion.
+2. Delete the TigerGraph cluster, but keep the PVC (Persistent Volume Claim).
+3. Patch the PVC to the new size.
+4. Recreate the TigerGraph cluster with the new volume size.
+
+## Update storageclass to allow volume expansion
+
+### GKE
+
+On GKE, all the preinstalled storage classes have `ALLOWVOLUMEEXPANSION` enabled, so there is no need to change it. You can check it using the following command:
+
+```bash
+kubectl get storageclass
+NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
+premium-rwo pd.csi.storage.gke.io Delete WaitForFirstConsumer true 5h27m
+standard kubernetes.io/gce-pd Delete Immediate true 5h27m
+standard-rwo (default) pd.csi.storage.gke.io Delete WaitForFirstConsumer true 5h27m
+```
+
+### EKS
+
+- Install the EBS CSI (Elastic Block Store Container Storage Interface) driver (optional):
+
+Some EKS versions do not install the aws-ebs-csi-driver plugin by default. If you encounter the following issue when creating a TG cluster with dynamic persistent volumes, check whether the driver is installed first.
+
+```bash
+# please replace the cluster name and namespace with yours.
+$ kubectl describe pvc -l tigergraph.com/cluster-name=test-cluster --namespace tigergraph
+Name: tg-data-test-cluster-0
+Namespace: tigergraph
+StorageClass: gp2
+Status: Pending
+Volume:
+Labels: tigergraph.com/cluster-name=test-cluster
+ tigergraph.com/cluster-pod=test-cluster
+Annotations: volume.beta.kubernetes.io/storage-provisioner: ebs.csi.aws.com
+ volume.kubernetes.io/selected-node: ip-172-31-20-181.us-west-1.compute.internal
+ volume.kubernetes.io/storage-provisioner: ebs.csi.aws.com
+Finalizers: [kubernetes.io/pvc-protection]
+Capacity:
+Access Modes:
+VolumeMode: Filesystem
+Used By: test-cluster-0
+Events:
+ Type Reason Age From Message
+ ---- ------ ---- ---- -------
+ Normal WaitForFirstConsumer 8m9s persistentvolume-controller waiting for first consumer to be created before binding
+ Normal ExternalProvisioning 2m35s (x25 over 8m9s) persistentvolume-controller waiting for a volume to be created, either by external provisioner "ebs.csi.aws.com" or manually created by system administrator
+```
+
+- Check and install `aws-ebs-csi-driver` with following commands:
+
+```bash
+kubectl get deployment ebs-csi-controller -n kube-system
+
+aws eks create-addon --cluster-name ${YOUR_K8S_CLUSTER_NAME} --addon-name aws-ebs-csi-driver
+```
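+
+If you want to confirm that the add-on finished installing before moving on, you can query its status; this is a sketch assuming the AWS CLI and the same cluster name variable as above:
+
+```bash
+# wait until the reported status is ACTIVE
+aws eks describe-addon --cluster-name ${YOUR_K8S_CLUSTER_NAME} \
+  --addon-name aws-ebs-csi-driver --query 'addon.status' --output text
+```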
+
+- Update storageclass to allow volume expansion
+
+```bash
+$ kubectl get sc gp2
+NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
+gp2 (default) kubernetes.io/aws-ebs Delete WaitForFirstConsumer false 87m
+
+$ kubectl patch sc gp2 -p '{"allowVolumeExpansion": true}'
+storageclass.storage.k8s.io/gp2 patched
+
+$ kubectl get sc gp2
+NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE
+gp2 (default) kubernetes.io/aws-ebs Delete WaitForFirstConsumer true 88m
+```
+
+## Delete the TG cluster and retain the PVCs
+
+The following examples assume that the name of the TG cluster is test-cluster and that it is installed in the tigergraph namespace; please replace them according to your requirements.
+
+You should also specify the correct storage class (`--storage-class`) according to your K8s environment.
+
+- Create a TG cluster (Optional)
+
+You can skip this step if there is an existing TigerGraph cluster; this step is only used for testing.
+
+```bash
+kubectl tg create --cluster-name test-cluster --license xxxxxxxxx -k ssh-key-secret --size 6 --ha 2 --version 3.9.1 --storage-class gp2 --storage-size 10G --cpu 3000m --memory 8Gi -n tigergraph
+```
+
+- Check the volume size (Optional, for verification)
+
+```bash
+$ kubectl tg connect --cluster-name test-cluster -n tigergraph
+tigergraph@test-cluster-0:~$ df -h
+Filesystem Size Used Avail Use% Mounted on
+overlay 100G 12G 89G 12% /
+tmpfs 64M 0 64M 0% /dev
+tmpfs 15G 0 15G 0% /sys/fs/cgroup
+/dev/xvda1 100G 12G 89G 12% /etc/hosts
+tmpfs 27G 8.0K 27G 1% /etc/private-key-volume
+shm 64M 0 64M 0% /dev/shm
+/dev/xvdaa 9.7G 916M 8.8G 10% /home/tigergraph/tigergraph/data
+tmpfs 27G 12K 27G 1% /run/secrets/kubernetes.io/serviceaccount
+tmpfs 15G 0 15G 0% /proc/acpi
+tmpfs 15G 0 15G 0% /proc/scsi
+tmpfs 15G 0 15G 0% /sys/firmware
+tigergraph@test-cluster-0:~$
+```
+
+- Delete the TigerGraph cluster and keep the PVC:
+
+> [!WARNING]
+> Remember to keep the PVC to recreate the cluster; otherwise, all the data in the cluster will be lost.
+
+```bash
+kubectl tg delete --cluster-name test-cluster -n tigergraph
+```
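+
+To confirm that the PVCs survived the deletion, you can list them by the cluster label; a quick check assuming the same cluster name and namespace as above:
+
+```bash
+# the PVCs should still be listed with STATUS Bound
+kubectl get pvc -l tigergraph.com/cluster-name=test-cluster --namespace tigergraph
+```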
+
+## Patch the PVC to the new size
+
+In this example, we will demonstrate how to resize the PV from 10Gi to 20Gi using the kubectl patch command to adjust the storage size of the PVC.
+
+We will perform this operation on all PVCs within the cluster by specifying the label `tigergraph.com/cluster-name=test-cluster`. Please ensure you replace `test-cluster` with the name of your cluster.
+
+```bash
+kubectl get pvc -l tigergraph.com/cluster-name=test-cluster --namespace tigergraph|sed '1d'|awk '{print $1}'|xargs -I {} kubectl patch pvc {} --namespace tigergraph --type merge --patch '{"spec":{"resources":{"requests":{"storage":"20Gi"}}}}'
+```
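+
+Optionally, verify that every claim now requests the new size before recreating the cluster; note that the filesystem itself is only expanded once the pods are running again:
+
+```bash
+# each PVC should report the new 20Gi request once the resize is processed
+kubectl get pvc -l tigergraph.com/cluster-name=test-cluster --namespace tigergraph -o custom-columns=NAME:.metadata.name,REQUESTED:.spec.resources.requests.storage
+```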
+
+## Recreate the TG cluster with the new volume size
+
+Because the PVCs were retained after deleting the TG cluster, you can quickly recreate the cluster using the same cluster name for a swift recovery. At the same time, ensure that you update the storage size of the CR to match the new desired value.
+
+```bash
+# change --storage-size 10G to --storage-size 20G
+kubectl tg create --cluster-name test-cluster --license xxxxxxxxx -k ssh-key-secret --size 6 --ha 2 --version 3.9.1 --storage-class gp2 --storage-size 20G --cpu 3000m --memory 8Gi -n tigergraph
+```
+
+- Check the volume size again (Optional, for verification)
+
+After patching the PVC to the new size and recreating the cluster with the new storage size, you can find that the storage size has been updated to 20Gi.
+
+```bash
+$ kubectl tg connect --cluster-name test-cluster -n tigergraph
+Defaulted container "tigergraph" out of: tigergraph, init-tigergraph (init)
+tigergraph@test-cluster-0:~$ df -h
+Filesystem Size Used Avail Use% Mounted on
+overlay 100G 12G 89G 12% /
+tmpfs 64M 0 64M 0% /dev
+tmpfs 15G 0 15G 0% /sys/fs/cgroup
+/dev/xvda1 100G 12G 89G 12% /etc/hosts
+tmpfs 27G 8.0K 27G 1% /etc/private-key-volume
+shm 64M 0 64M 0% /dev/shm
+/dev/xvdaa 20G 185M 20G 1% /home/tigergraph/tigergraph/data
+tmpfs 27G 12K 27G 1% /run/secrets/kubernetes.io/serviceaccount
+tmpfs 15G 0 15G 0% /proc/acpi
+tmpfs 15G 0 15G 0% /proc/scsi
+tmpfs 15G 0 15G 0% /sys/firmware
+```
diff --git a/k8s/docs/07-reference/integrate-envoy-sidecar.md b/k8s/docs/07-reference/integrate-envoy-sidecar.md
new file mode 100644
index 00000000..df6148f4
--- /dev/null
+++ b/k8s/docs/07-reference/integrate-envoy-sidecar.md
@@ -0,0 +1,161 @@
+# How to integrate the Envoy sidecar with TG Pod
+
+Starting from Operator version 0.0.6, we support adding sidecar containers to the TG Pod. This guide is dedicated to the integration process of the envoy sidecar with the TG Pod. To proceed, please ensure that you have Operator version 0.0.6 or a newer version installed. Additionally, please note that this document does not delve into the intricacies of envoy, such as TLS configuration. Instead, its primary focus is to describe the configuration of envoy sidecar containers for accessing TG services.
+
+
+
+Configuration of Envoy sidecar container
+========================================
+
+The initial step involves the creation of a ConfigMap resource and its subsequent mounting onto the pod as the Envoy's configuration.
+
+Below is an illustrative example of the ConfigMap:
+
+```yaml
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: sidecar-test-configmap
+ labels:
+ app: sidecar-test
+data:
+ envoy.yaml: |
+ static_resources:
+ listeners:
+ - name: listener_0
+ address:
+ socket_address:
+ address: 0.0.0.0
+ port_value: 12000
+ filter_chains:
+ - filters:
+ - name: envoy.filters.network.http_connection_manager
+ typed_config:
+ "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
+ stat_prefix: ingress_http
+ access_log:
+ - name: envoy.access_loggers.stdout
+ typed_config:
+ "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog
+ http_filters:
+ - name: envoy.filters.http.router
+ typed_config:
+ "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
+ route_config:
+ name: local_route
+ virtual_hosts:
+ - name: nginx_service
+ domains:
+ - "*"
+ routes:
+ - match:
+ prefix: "/"
+ route:
+ cluster: nginx_service
+ clusters:
+ - name: nginx_service
+ type: LOGICAL_DNS
+ # Comment out the following line to test on v6 networks
+ dns_lookup_family: V4_ONLY
+ load_assignment:
+ cluster_name: nginx_service
+ endpoints:
+ - lb_endpoints:
+ - endpoint:
+ address:
+ socket_address:
+ address: 127.0.0.1
+ port_value: 14240
+```
+
+* Add a listener to forward requests to the API gateway of TG
+
+  * `listener_0` listens on port 12000, which is used for routing to the Nginx service; in the `route_config` part, we use the cluster nginx\_service as the route.
+
+* Add a cluster to configure the endpoint for the above listener
+
+  * The cluster `nginx_service` specifies the `endpoint` address 127.0.0.1 and port 14240, where the NGINX service listens.
+
+
+Add `sidecarContainers` and `customVolumes` to the TigerGraph CR
+================================================================
+
+```yaml
+ sidecarContainers:
+ - image: envoyproxy/envoy:v1.26.0
+ name: envoy-sidecar-container
+ resources:
+ requests:
+ memory: "512Mi"
+ cpu: "500m"
+ limits:
+ memory: "512Mi"
+ cpu: "500m"
+ ports:
+ - name: tg-nginx
+ containerPort: 12000
+ protocol: TCP
+ volumeMounts:
+ - name: sidecar-config
+ mountPath: "/etc/envoy"
+ readOnly: true
+ customVolumes:
+ - name: sidecar-config
+ configMap:
+ name: sidecar-test-configmap
+```
+
+Validation
+==========
+
+Finally, to ensure the proper functionality of the Envoy sidecar service and its access to the RESTPP and Metric services, we will establish a Kubernetes (K8s) Service. This service will facilitate the verification process.
+
+Assuming that the TG cluster name is `test-cluster`, the `selector` of the service should include the label `tigergraph.com/cluster-pod: test-cluster`.
+
+Furthermore, for those aiming to access the web console, an additional label `tigergraph.com/gui-service: "true"` must be included within the selector of the Service.
+
+```yaml
+apiVersion: v1
+kind: Service
+metadata:
+ name: envoy-sidecar-nginx-service
+ labels:
+ app: envoy-sidecar-test
+spec:
+ type: LoadBalancer
+ selector:
+ tigergraph.com/cluster-pod: test-cluster
+    # optional, only for web console access.
+ # tigergraph.com/gui-service: "true"
+ ports:
+ - name: nginx
+ port: 12000
+ targetPort: 12000
+```
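+
+After the Service is created, you can look up the external address used as `${LBS_EXTERNAL_IP}` in the checks below; this is a sketch, and the exact field depends on your cloud provider (some report a hostname instead of an IP):
+
+```bash
+kubectl get service envoy-sidecar-nginx-service
+# export the external address for the curl checks below
+export LBS_EXTERNAL_IP=$(kubectl get service envoy-sidecar-nginx-service -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
+```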
+
+* RESTPP
+
+
+```bash
+curl http://${LBS_EXTERNAL_IP}:12000/restpp/echo
+# it will return {"error":false, "message":"Hello GSQL"} if accessing successfully.
+```
+
+* Metric
+
+
+```bash
+# it will return the latest metrics of cpu and mem.
+curl http://${LBS_EXTERNAL_IP}:12000/informant/metrics/get/cpu-memory -d '{"ServiceDescriptor":{"ServiceName":"gse","Partition": 1,"Replica":1}}'
+```
+
+* WEB API and console
+
+
+```bash
+# it will return {"error":false,"message":"pong","results":null} if accessing successfully
+curl http://${LBS_EXTERNAL_IP}:12000/api/ping
+
+# Web console
+# open the url http://${LBS_EXTERNAL_IP}:12000 in Chrome or another browser.
+```
\ No newline at end of file
diff --git a/k8s/docs/07-reference/labels-used-by-tg.md b/k8s/docs/07-reference/labels-used-by-tg.md
new file mode 100644
index 00000000..75f58e03
--- /dev/null
+++ b/k8s/docs/07-reference/labels-used-by-tg.md
@@ -0,0 +1,27 @@
+# Labels used by TigerGraph Operator
+TigerGraph utilizes specific labels for different purposes in Kubernetes:
+
+### TigerGraph Cluster Pods
+
+| Label | Usage |
+|----------------------------------------|---------------------------------------------------------------------|
+| `tigergraph.com/cluster-name=CLUSTER_NAME` | Indicates which cluster the pod belongs to. |
+| `tigergraph.com/cluster-pod=CLUSTER_NAME` | Indicates that the pod belongs to a cluster and not a Job. |
+| `tigergraph.com/gui-service=true` | Labeled on pods running the GUI service. |
+| `tigergraph.com/restpp-service=true` | Labeled on pods running the RESTPP service. |
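+
+For example, these labels can be combined with standard label selectors to locate the pods of a given cluster; the sketch below assumes a cluster named test-cluster in the tigergraph namespace:
+
+```bash
+# list all pods of the cluster
+kubectl get pods -l tigergraph.com/cluster-name=test-cluster -n tigergraph
+
+# find the pod currently running the GUI service
+kubectl get pods -l tigergraph.com/cluster-pod=test-cluster,tigergraph.com/gui-service=true -n tigergraph
+```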
+
+### TigerGraph Job Pods
+
+| Label | Usage |
+|-------------------------------------------------|------------------------------------------------------------------------------|
+| `tigergraph.com/cluster-name=CLUSTER_NAME` | Indicates which cluster the job is for. |
+| `tigergraph.com/cluster-job={CLUSTER_NAME}-{JOB_TYPE}-job` | Specifies the type of job and the cluster it's associated with (JOB_TYPE: init, upgrade, expand, shrink-pre, shrink-post). |
+
+### TigerGraph Backup/Restore Job Pods
+
+| Label | Usage |
+|--------------------------------------------------|------------------------------------------------------------------------------|
+| `tigergraph.com/backup-cluster=CLUSTER_NAME` | Labeled on pods running backup jobs for the specified cluster. |
+| `tigergraph.com/restore-cluster=CLUSTER_NAME` | Labeled on pods running restore jobs for the specified cluster. |
+
+These labels help identify the purpose and affiliation of various pods within the Kubernetes environment, making it easier to manage and monitor different components of TigerGraph clusters, jobs, backups, and restores.
\ No newline at end of file
diff --git a/k8s/docs/07-reference/static-and-dynamic-persistent-volume-storage.md b/k8s/docs/07-reference/static-and-dynamic-persistent-volume-storage.md
new file mode 100644
index 00000000..347cd830
--- /dev/null
+++ b/k8s/docs/07-reference/static-and-dynamic-persistent-volume-storage.md
@@ -0,0 +1,368 @@
+# How to use static & dynamic persistent volume storage
+
+This document describes how to deploy a TigerGraph on K8s with static or dynamic persistent volume storage.
+
+## GKE
+
+### Static persistent volume storage on GKE
+
+You can follow these steps to set up and use static persistent volume storage for GKE:
+
+1. Provision a Persistent volume using a special storage class name.
+2. Deploy TigerGraph with persistent volume.
+
+### Creating Persistent Volumes From Existing Google Compute Disks
+
+- Create disk
+
+Consider a scenario where you are creating a TigerGraph cluster comprising three nodes. To achieve this, you can create three compute disks named tg-pv-1, tg-pv-2, and tg-pv-3, each with a size of 10GB.
+
+```bash
+gcloud compute disks create tg-pv-1 --zone=us-central1-a --size=10GB
+gcloud compute disks create tg-pv-2 --zone=us-central1-a --size=10GB
+gcloud compute disks create tg-pv-3 --zone=us-central1-a --size=10GB
+
+# delete gcd
+gcloud compute disks delete tg-pv-1 --zone=us-central1-a
+gcloud compute disks delete tg-pv-2 --zone=us-central1-a
+gcloud compute disks delete tg-pv-3 --zone=us-central1-a
+```
+
+Now you have three disks available to be used as PV (Persistent Volume) in GKE.
+
+- Create static persistent pv
+
+```yaml
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: tg-pv-storage1
+spec:
+ storageClassName: "tg-pv"
+ capacity:
+ storage: 10Gi
+ accessModes:
+ - ReadWriteOnce
+ gcePersistentDisk:
+ pdName: tg-pv-1
+ fsType: ext4
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: tg-pv-storage2
+spec:
+ storageClassName: "tg-pv"
+ capacity:
+ storage: 10Gi
+ accessModes:
+ - ReadWriteOnce
+ gcePersistentDisk:
+ pdName: tg-pv-2
+ fsType: ext4
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: tg-pv-storage3
+spec:
+ storageClassName: "tg-pv"
+ capacity:
+ storage: 10Gi
+ accessModes:
+ - ReadWriteOnce
+ gcePersistentDisk:
+ pdName: tg-pv-3
+ fsType: ext4
+```
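+
+Save the manifest above (for example as `tg-pv.yaml`, a name chosen here for illustration) and create the PersistentVolumes before deploying the cluster:
+
+```bash
+kubectl apply -f tg-pv.yaml
+# the three PVs should be listed with STATUS Available until the cluster claims them
+kubectl get pv tg-pv-storage1 tg-pv-storage2 tg-pv-storage3
+```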
+
+- Create TG cluster with storage class name tg-pv
+
+```bash
+kubectl tg create --namespace tigergraph --cluster-name test-pv-tg-cluster -k ssh-key-secret --license xxxxxx --size 3 --ha 2 --version 3.9.1 --storage-class tg-pv --cpu 2000m --memory 8G --storage-size 10G
+```
+
+### Dynamic persistent volume storage on GKE
+
+To enable and utilize dynamic persistent volume storage for Google Kubernetes Engine (GKE), follow these steps:
+
+1. **Create a Storage Class:**
+ Start by creating a storage class, which serves as a straightforward way to categorize and organize storage options.
+
+2. **Deploy TigerGraph with the Created Storage Class:**
+ Once the storage class is in place, proceed to deploy TigerGraph, ensuring you specify the name of the storage class created in the previous step.
+
+A storage class essentially outlines the type of storage to be provisioned. In simpler terms, it defines how the storage behaves and what it's best suited for.
+
+For instance, you can categorize your storage classes as `gold` and `silver`, using names that make sense for your use case. The `gold` storage class might leverage the `pd-ssd` persistent disk type, ideal for high IOPS applications like databases. Meanwhile, the `silver` storage class could utilize the `pd-standard` volume type, suitable for regular disk operations and backups.
+
+These storage class categorizations are entirely tailored to your project's specific requirements, ensuring that the storage resources are optimally utilized based on your application's needs.
+
+> [!NOTE]
+> GKE comes with default storage classes that utilize `pd-standard` disks. If you omit specifying a storage class while provisioning a Persistent Volume (PV), the default storage class is automatically considered.
+
+By following these steps, you can efficiently configure and leverage dynamic persistent volume storage within your GKE environment.
+
+- Create a storage class
+
+Save the following manifest as `storage-class.yaml`
+
+```yaml
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+ name: gold
+provisioner: kubernetes.io/gce-pd
+volumeBindingMode: Immediate
+allowVolumeExpansion: true
+reclaimPolicy: Delete
+parameters:
+ type: pd-ssd
+ fstype: ext4
+ replication-type: none
+```
+
+- Create the storage class.
+
+```bash
+kubectl apply -f storage-class.yaml
+```
+
+A brief explanation of the parameters:
+
+1. **Type:** supports `pd-standard` and `pd-ssd`. If you don't specify anything, it defaults to `pd-standard`.
+2. **fstype:** supports `ext4` and `xfs`. Defaults to `ext4`.
+3. **replication-type:** This decides whether the disk is zonal or regional. If you don't specify `regional-pd`, it defaults to a zonal disk.
+4. **allowVolumeExpansion:** With this parameter, you can expand the persistent volume if required.
+5. **volumeBindingMode:** There are two modes, `Immediate` and `WaitForFirstConsumer`. In cases where the storage is not accessible from all the nodes, use `WaitForFirstConsumer` so that volume binding happens after the pod is created.
+6. [**reclaimPolicy**](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#reclaiming)
+   1. The `Retain` reclaim policy allows for manual reclamation of the resource. When the PersistentVolumeClaim is deleted, the PersistentVolume still exists and the volume is considered "released", but it is not yet available for another claim because the previous claimant's data remains on the volume.
+   2. With the `Delete` reclaim policy, deletion removes both the PersistentVolume object from Kubernetes and the associated storage asset in the external infrastructure, such as an AWS EBS, GCE PD, Azure Disk, or Cinder volume.
+
+- Deploy TG with the specific Storage class name
+
+```bash
+kubectl tg create --namespace tigergraph --cluster-name dynamic-pv-tg-cluster -k ssh-key-secret --license xxxxxx --size 1 --ha 1 --version 3.9.1 --storage-class gold --cpu 4000m --memory 8G --storage-size 10G
+```
+
+## EKS
+
+### Static persistent volume storage on EKS
+
+You can follow these steps to set up and use static persistent volume storage for EKS:
+
+1. Provision a Persistent volume using a special storage class name.
+2. Deploy TG with persistent volume.
+
+### Creating EBS Persistent Volumes
+
+- Create EBS volumes
+
+Consider a scenario where you are creating a TigerGraph cluster comprising three nodes. To achieve this, you can create three EBS volumes, each with a size of 10GB.
+
+```bash
+$ aws ec2 create-volume --volume-type gp2 --size 10 --availability-zone us-west-1b
+{
+ "AvailabilityZone": "us-west-1b",
+ "CreateTime": "2023-05-04T09:00:21+00:00",
+ "Encrypted": false,
+ "Size": 10,
+ "SnapshotId": "",
+ "State": "creating",
+ "VolumeId": "vol-01b4da831ee293eb7",
+ "Iops": 100,
+ "Tags": [],
+ "VolumeType": "gp2",
+ "MultiAttachEnabled": false
+}
+
+$ aws ec2 create-volume --volume-type gp2 --size 10 --availability-zone us-west-1b
+{
+ "AvailabilityZone": "us-west-1b",
+ "CreateTime": "2023-05-04T09:00:51+00:00",
+ "Encrypted": false,
+ "Size": 10,
+ "SnapshotId": "",
+ "State": "creating",
+ "VolumeId": "vol-0cf5cb04ce0b30eee",
+ "Iops": 100,
+ "Tags": [],
+ "VolumeType": "gp2",
+ "MultiAttachEnabled": false
+}
+
+$ aws ec2 create-volume --volume-type gp2 --size 10 --availability-zone us-west-1b
+{
+ "AvailabilityZone": "us-west-1b",
+ "CreateTime": "2023-05-04T09:01:18+00:00",
+ "Encrypted": false,
+ "Size": 10,
+ "SnapshotId": "",
+ "State": "creating",
+ "VolumeId": "vol-056ddf237f6bfe122",
+ "Iops": 100,
+ "Tags": [],
+ "VolumeType": "gp2",
+ "MultiAttachEnabled": false
+}
+
+# delete the EBS volumes
+aws ec2 delete-volume --volume-id vol-01b4da831ee293eb7
+aws ec2 delete-volume --volume-id vol-0cf5cb04ce0b30eee
+aws ec2 delete-volume --volume-id vol-056ddf237f6bfe122
+```
+
+Now there are three EBS volumes available to be used as PVs in EKS.
+
+- Create static persistent pv
+
+```yaml
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: tg-pv-storage1
+spec:
+ capacity:
+ storage: 10Gi
+ accessModes:
+ - ReadWriteOnce
+ persistentVolumeReclaimPolicy: Retain
+ storageClassName: "tg-pv"
+ awsElasticBlockStore:
+ volumeID: vol-01b4da831ee293eb7
+ fsType: ext4
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: tg-pv-storage2
+spec:
+ capacity:
+ storage: 10Gi
+ accessModes:
+ - ReadWriteOnce
+ persistentVolumeReclaimPolicy: Retain
+ storageClassName: "tg-pv"
+ awsElasticBlockStore:
+ volumeID: vol-0cf5cb04ce0b30eee
+ fsType: ext4
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: tg-pv-storage3
+spec:
+ capacity:
+ storage: 10Gi
+ accessModes:
+ - ReadWriteOnce
+ persistentVolumeReclaimPolicy: Retain
+ storageClassName: "tg-pv"
+ awsElasticBlockStore:
+ volumeID: vol-056ddf237f6bfe122
+ fsType: ext4
+```
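+
+As with GKE, apply this manifest (saved here as `tg-pv-ebs.yaml`, an illustrative name) so the EBS volumes are registered as PersistentVolumes before the cluster is created:
+
+```bash
+kubectl apply -f tg-pv-ebs.yaml
+kubectl get pv tg-pv-storage1 tg-pv-storage2 tg-pv-storage3
+```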
+
+- Create TigerGraph cluster with storage class name tg-pv
+
+The EBS volumes are located in zone us-west-1b, so configure node affinity to ensure the TG pods are scheduled to nodes in this zone.
+
+Create an affinity configuration file like this:
+
+tg-affinity.yaml
+
+```yaml
+affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: topology.kubernetes.io/zone
+ operator: In
+ values:
+ - us-west-1b
+```
+
+```bash
+kubectl tg create --cluster-name ${YOUR_CLUSTER_NAME} --private-key-secret ${YOUR_SSH_KEY_SECRET_NAME} --size 3 --ha 2 --version 3.9.1 --license ${LICENSE} --service-account-name ${SERVICE_ACCOUNT_NAME} \
+  --storage-class tg-pv --storage-size 10G --cpu 4000m --memory 8Gi --namespace ${YOUR_NAMESPACE} --affinity tg-affinity.yaml
+```
+
+### Dynamic persistent volume storage on EKS
+
+You can follow these steps to set up and use dynamic persistent volume storage for EKS:
+
+1. Create a storage class.
+2. Deploy TG with the storage class name created in step 1.
+
+Storage class is a simple way of segregating storage options.
+
+To put it simply, a storage class defines what type of storage is to be provisioned.
+
+For example, you can classify your storage classes as `gold` and `silver`. These names are arbitrary; use names that are meaningful to you.
+
+The gold storage class uses the `gp3` volume type for high IOPS applications (to be used with databases), while the silver storage class uses the `gp2` volume type for backups and normal disk operations.
+
+These storage class segregations are completely based on the project requirements.
+
+> [!NOTE]
+> There are default storage classes available in EKS which are backed by gp2 (default). If you don't specify a storage class while provisioning a PV, the default storage class is considered.
+
+- Create a storage class
+
+Save the following manifest as `storage-class.yaml`
+
+```yaml
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+ name: gold
+parameters:
+ type: gp3
+ fsType: ext4
+provisioner: ebs.csi.aws.com
+reclaimPolicy: Delete
+volumeBindingMode: WaitForFirstConsumer
+```
+
+- Create the storage class.
+
+```bash
+kubectl apply -f storage-class.yaml
+```
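+
+The deployment step mirrors the GKE example: pass the new storage class name to `kubectl tg create`. This sketch reuses the parameters shown earlier; adjust the size, license, and resources for your environment:
+
+```bash
+kubectl tg create --namespace tigergraph --cluster-name dynamic-pv-tg-cluster -k ssh-key-secret --license xxxxxx --size 1 --ha 1 --version 3.9.1 --storage-class gold --cpu 4000m --memory 8G --storage-size 10G
+```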
+
+## Creating Persistent Volumes using the local filesystem of a node
+
+> [!WARNING]
+> Using this mode for a production system is not recommended: it is only suitable for a TigerGraph cluster with a single instance on K8s backed by a single worker node. Otherwise, the data will be lost after a rolling update of the cluster, because a pod may be scheduled on a different K8s worker.
+
+In addition, if the local filesystem of the node is deleted after the node restarts, the data will be lost.
+
+### Create a persistent volume with a local filesystem
+
+The TigerGraph container mounts its data at the path "/home/tigergraph/tigergraph/data"; you shouldn't change it.
+
+You can set the storageClassName to pv-local. If you modify the name, you should use the same name when creating the TigerGraph cluster.
+
+```yaml
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: task-pv-volume-1
+ labels:
+ type: local
+spec:
+ storageClassName: pv-local
+ capacity:
+ storage: 10Gi
+ accessModes:
+ - ReadWriteOnce
+ hostPath:
+ path: "/home/tigergraph/tigergraph/data"
+```
+
+### Create TG cluster with storage class name pv-local
+
+```bash
+kubectl tg create --namespace tigergraph --cluster-name local-pv-tg-cluster -k ssh-key-secret --size 1 --ha 1 --version 3.9.1 --image-pull-policy Always --storage-class pv-local --cpu 4000m --memory 8G --storage-size 10G --license ${LICENSE}
+```
diff --git a/k8s/docs/08-release-notes/README.md b/k8s/docs/08-release-notes/README.md
new file mode 100644
index 00000000..bd446564
--- /dev/null
+++ b/k8s/docs/08-release-notes/README.md
@@ -0,0 +1,13 @@
+# Release Notes
+
+This document describes the new features, improvements, and bugfixes for each Operator version release.
+
+Please see the detailed release notes for each Operator version below:
+
+- [Operator 0.0.9](./operator-0.0.9.md)
+- [Operator 0.0.7](./operator-0.0.7.md)
+- [Operator 0.0.6](./operator-0.0.6.md)
+- [Operator 0.0.5](./operator-0.0.5.md)
+- [Operator 0.0.4](./operator-0.0.4.md)
+- [Operator 0.0.3](./operator-0.0.3.md)
+- [Operator 0.0.2](./operator-0.0.2.md)
\ No newline at end of file
diff --git a/k8s/docs/08-release-notes/operator-0.0.2.md b/k8s/docs/08-release-notes/operator-0.0.2.md
new file mode 100644
index 00000000..50c854cd
--- /dev/null
+++ b/k8s/docs/08-release-notes/operator-0.0.2.md
@@ -0,0 +1,34 @@
+# Operator 0.0.2 Release notes
+
+## Overview
+
+Operator 0.0.2 has been released in conjunction with TigerGraph 3.7.0, bringing automated TigerGraph provisioning and configuration management to your fingertips.
+
+### kubectl plugin installation
+
+To seamlessly integrate Operator 0.0.2 with your environment, please follow these steps for kubectl plugin installation:
+
+```bash
+curl https://dl.tigergraph.com/k8s/0.0.2/kubectl-tg -o kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+```
+
+## New features
+
+This release introduces the following new features:
+
+- Cluster Provisioning: Streamline the process of provisioning clusters with ease.
+
+- Cluster Deletion: Simplify cluster management by enabling cluster deletion when necessary.
+
+- Listing Clusters: Easily view a list of clusters for enhanced visibility and control.
+
+- Checking Cluster Status: Stay informed about the status of your clusters with ease.
+
+## Improvements
+
+No specific improvements have been made in this release.
+
+## Bugfixes
+
+No known bugs have been addressed in this release.
diff --git a/k8s/docs/08-release-notes/operator-0.0.3.md b/k8s/docs/08-release-notes/operator-0.0.3.md
new file mode 100644
index 00000000..21481a03
--- /dev/null
+++ b/k8s/docs/08-release-notes/operator-0.0.3.md
@@ -0,0 +1,34 @@
+# Operator 0.0.3 Release notes
+
+## Overview
+
+**Operator 0.0.3** has been released in conjunction with **TigerGraph 3.8.0**.
+
+This release of Operator 0.0.3 primarily focuses on enabling special cluster operations for TigerGraph on Kubernetes (K8s). These operations include resource updates (CPU and Memory), TigerGraph cluster upgrading, and cluster scaling.
+
+### kubectl plugin installation
+
+To install the kubectl plugin for **Operator 0.0.3**, please execute the following command:
+
+```bash
+curl https://dl.tigergraph.com/k8s/0.0.3/kubectl-tg -o kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+```
+
+## New features
+
+- Cluster Resource Update: Operator 0.0.3 introduces the capability to update cluster resources, including CPU and Memory configurations.
+
+- Cluster Upgrading: You can now upgrade your TigerGraph cluster using this release.
+
+- Cluster Expansion: Expand your TigerGraph cluster effortlessly to meet growing demands.
+
+- Cluster Shrinking: When necessary, scale down your TigerGraph cluster efficiently.
+
+## Improvements
+
+- High Availability (HA) Enabled by Default: Operator 0.0.3 now enables High Availability by default, ensuring greater reliability and fault tolerance.
+
+## Bugfixes
+
+- Addressed an issue where the expand command would become stuck when no schema and graph data existed in the TigerGraph cluster. ([CORE-1743](https://graphsql.atlassian.net/browse/CORE-1743), TigerGraph 3.8.0)
diff --git a/k8s/docs/08-release-notes/operator-0.0.4.md b/k8s/docs/08-release-notes/operator-0.0.4.md
new file mode 100644
index 00000000..8d6e7bcf
--- /dev/null
+++ b/k8s/docs/08-release-notes/operator-0.0.4.md
@@ -0,0 +1,64 @@
+# Operator 0.0.4 Release notes
+
+## Overview
+
+**Operator 0.0.4** has been released in conjunction with **TigerGraph 3.9.0**.
+
+In this release, **Operator 0.0.4** brings various enhancements, including support for operator updating, operator upgrading, cluster backup, and cluster restore.
+
+### kubectl plugin installation
+
+To install the kubectl plugin for Operator 0.0.4, please use the following command:
+
+```bash
+curl https://dl.tigergraph.com/k8s/0.0.4/kubectl-tg -o kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+```
+
+### CRD upgrading
+
+Upgrade the Custom Resource Definition for Operator 0.0.4 using the following command:
+
+```bash
+kubectl apply -f https://dl.tigergraph.com/k8s/0.0.4/tg-operator-crd.yaml
+```
+
+### Operator upgrading
+
+Upgrade the Operator to version 0.0.4 with the following command:
+
+```bash
+kubectl tg upgrade --namespace ${NAMESPACE_OF_OPERATOR} --operator-version 0.0.4
+```
+
+## New features
+
+- Operator Updating: Operator 0.0.4 now supports updating the operator, ensuring you stay up to date with the latest enhancements.
+
+- Operator Upgrading: Seamlessly upgrade your Operator to version 0.0.4 and beyond.
+
+- Support for Namespaced-Scope and Cluster-Scope Operator: Enjoy the flexibility of namespaced-scope and cluster-scope operators.
+
+- Cluster Backup: Perform both one-time and scheduled backups of your TigerGraph cluster.
+
+- Cluster Restore: Easily restore your TigerGraph cluster as needed.
+
+## Improvements
+
+- Security Fixes for TG K8s Docker Image:
+
+  - Removed the use of sudo from the TG docker image for enhanced security.
+  - Customized private SSH key files for added security.
+
+- Enhancements in Expansion and Shrinking: Improved support for expansion and shrinking, including a robust failure recovery process.
+
+- Security Vulnerabilities Fixes in K8s Operator: Addressed security vulnerabilities in the K8s operator.
+
+- kubectl tg Use AWS Secret Name: Improved security by using AWS secret names instead of directly passing strings in the options.
+
+## Bugfixes
+
+- Fixed issues causing GSE crashes when the number of pods exceeded 32.
+
+- Updated the nginx template during installation upgrades.
+
+- Configured LBS (Load Balancer Service) to forward RESTPP requests to the corresponding cluster when multiple clusters exist in the same namespace.
diff --git a/k8s/docs/08-release-notes/operator-0.0.5.md b/k8s/docs/08-release-notes/operator-0.0.5.md
new file mode 100644
index 00000000..d25807e9
--- /dev/null
+++ b/k8s/docs/08-release-notes/operator-0.0.5.md
@@ -0,0 +1,50 @@
+# Operator 0.0.5 Release notes
+
+## Overview
+
+**Operator version 0.0.5** has been released in conjunction with **TigerGraph 3.9.1**. This release introduces several enhancements, bugfixes, and the ability to configure resource limits for both Operator and the TigerGraph cluster.
+
+### kubectl plugin installation
+
+To install the kubectl plugin for Operator 0.0.5, please execute the following command:
+
+```bash
+curl https://dl.tigergraph.com/k8s/0.0.5/kubectl-tg -o kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+```
+
+### CRD upgrading
+
+To upgrade the Custom Resource Definitions (CRD) for Operator 0.0.5, use the following command:
+
+```bash
+kubectl apply -f https://dl.tigergraph.com/k8s/0.0.5/tg-operator-crd.yaml
+```
+
+### Operator upgrading
+
+Upgrade your Operator to version 0.0.5 with the following command:
+
+```bash
+kubectl tg upgrade --namespace ${NAMESPACE_OF_OPERATOR} --operator-version 0.0.5
+```
+
+## New features
+
+- Resource Limit Configuration: Operator 0.0.5 now allows you to configure resource limits for both the Operator itself and the TigerGraph cluster using the kubectl-tg plugin.
+
+## Improvements
+
+- Automated Backup Recovery: Operator now supports automatic backup recovery, simplifying data restoration processes.
+
+- Enhanced Cluster Initialization: Improvements have been made to the cluster initialization process, enhancing stability and usability.
+
+- Improved kubectl tg Commands: Various enhancements have been made to kubectl tg commands, making them more user-friendly.
+
+## Bugfixes
+
+- Upgrade Issue: Fixed an issue that caused problems when upgrading from version 3.7.0 to 3.9.0.
+
+- Job Name Length: Added a hint when the job name exceeds the RFC 1123 character limit.
+
+- Fixed an issue where expansion or shrink operations on an empty queue would skip pausing GPE. ([CORE-2440](https://graphsql.atlassian.net/browse/CORE-2440), TigerGraph 3.9.1)
diff --git a/k8s/docs/08-release-notes/operator-0.0.6.md b/k8s/docs/08-release-notes/operator-0.0.6.md
new file mode 100644
index 00000000..4fae0670
--- /dev/null
+++ b/k8s/docs/08-release-notes/operator-0.0.6.md
@@ -0,0 +1,48 @@
+# Operator 0.0.6 Release notes
+
+## Overview
+
+**Operator 0.0.6** has been released in conjunction with the **TigerGraph 3.9.1 update**.
+
+In this release, **Operator 0.0.6** introduces support for node and pod affinity configuration, along with the ability to customize init containers and sidecar containers. Notably, after upgrading to Operator version 0.0.6, creating a service account when deploying the TigerGraph cluster in a different namespace is no longer mandatory.
+
+### kubectl plugin installation
+
+To install the kubectl plugin for Operator 0.0.6, please execute the following command:
+
+```bash
+curl https://dl.tigergraph.com/k8s/0.0.6/kubectl-tg -o kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+```
+
+### CRD upgrading
+
+To upgrade the Custom Resource Definitions (CRD) for Operator 0.0.6, use the following command:
+
+```bash
+kubectl apply -f https://dl.tigergraph.com/k8s/0.0.6/tg-operator-crd.yaml
+```
+
+### Operator upgrading
+
+After upgrading the Operator, the TG cluster will undergo a rolling update due to tg-log volume mounting changes.
+
+To upgrade the Operator to version 0.0.6, please execute the following command:
+
+```bash
+kubectl tg upgrade --namespace ${NAMESPACE_OF_OPERATOR} --operator-version 0.0.6
+```
+
+## New features
+
+- Added support for Node selector, Pod Affinity, and Toleration.
+- Introduced the ability to include customized init containers and sidecar containers.
+- Optional configuration for the service account name of TG pod when managing the TG cluster using cluster-scoped Operator.
+
+## Improvements
+
+No specific improvements have been made in this release.
+
+## Bugfixes
+
+No known bugs have been addressed in this release.
diff --git a/k8s/docs/08-release-notes/operator-0.0.7.md b/k8s/docs/08-release-notes/operator-0.0.7.md
new file mode 100644
index 00000000..ceea2994
--- /dev/null
+++ b/k8s/docs/08-release-notes/operator-0.0.7.md
@@ -0,0 +1,66 @@
+# Operator 0.0.7 Release notes
+
+## Overview
+
+**Operator 0.0.7** has been released in conjunction with **TigerGraph 3.9.2**.
+
+This release of **Operator 0.0.7** brings several noteworthy features and improvements. It supports custom labels and annotations for external services, simplifies the backup and restore process for the TigerGraph cluster by eliminating the need to specify meta files, and allows for license updates through the **kubectl-tg** plugin.
+
+### kubectl plugin installation
+
+To install the kubectl plugin for **Operator 0.0.7**, please execute the following command:
+
+```bash
+curl https://dl.tigergraph.com/k8s/0.0.7/kubectl-tg -o kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+```
+
+### CRD upgrading
+
+To upgrade the Custom Resource Definition for Operator 0.0.7, please use the following command:
+
+```bash
+kubectl apply -f https://dl.tigergraph.com/k8s/0.0.7/tg-operator-crd.yaml
+```
+
+### Operator upgrading
+
+To upgrade the Operator to version 0.0.7, please use the following command:
+
+```bash
+kubectl tg upgrade --namespace ${NAMESPACE_OF_OPERATOR} --operator-version 0.0.7
+```
+
+## New features
+
+- Operator 0.0.7 now supports the addition of custom labels and annotations to external services.
+
+- Backup and restore processes for the TigerGraph cluster have been streamlined, eliminating the need to specify meta files.
+
+- You can now conveniently update your TigerGraph license using the kubectl tg plugin.
+
+## Improvements
+
+- Enhancements have been made to improve the handling of overlapping operations between expansion/shrinking and upgrading.
+
+- A retry interval for failed jobs has been added, improving job reliability.
+
+- The terminationGracePeriodSeconds of the TigerGraph container has been increased to 300 seconds from 60 seconds for smoother termination.
+
+## Bugfixes
+
+- Resolved issues related to deploying the cluster using static PV with the local filesystem.
+
+- Fixed the problem of external services updating twice, which led to errors.
+
+- Corrected unexpected config updates when executing an overlap operation between upgrade and expansion. ([TP-3646](https://graphsql.atlassian.net/browse/TP-3646))
+
+- Addressed an incorrect error exit issue in the upgrade script. ([TP-3869](https://graphsql.atlassian.net/browse/TP-3869))
+
+- Fixed the issue of cluster status checking during expansion and shrinking. It now checks the service of all nodes, not just the client node.
+
+- Graph query responses no longer encounter errors after successful execution of the expansion. ([GLE-5195](https://graphsql.atlassian.net/jira/software/c/projects/GLE/issues/GLE-5195), TigerGraph 3.9.2)
+
+- Resolved the issue of cluster size limits during cluster expansion. ([TP-3768](https://graphsql.atlassian.net/browse/TP-3768))
+
+- Fixed unnecessary rolling update problems that occurred when upgrading from 3.7.0 and below to 3.9.0 and above. ([TP-3765](https://graphsql.atlassian.net/browse/TP-3765) & [CORE-2585](https://graphsql.atlassian.net/browse/CORE-2585))
diff --git a/k8s/docs/08-release-notes/operator-0.0.9.md b/k8s/docs/08-release-notes/operator-0.0.9.md
new file mode 100644
index 00000000..fe8d0906
--- /dev/null
+++ b/k8s/docs/08-release-notes/operator-0.0.9.md
@@ -0,0 +1,71 @@
+# Operator 0.0.9 Release notes
+
+## Overview
+
+**Operator 0.0.9** has been released in conjunction with **TigerGraph version 3.9.3**.
+
+In this release, **Operator 0.0.9** introduces support for two essential features: CompressLevel and DecompressProcessNumber, newly introduced in **TigerGraph 3.9.3**. To leverage these capabilities, it is imperative to upgrade both the Custom Resource Definition (CRD) and the Operator itself.
+
+Operator 0.0.9 has disabled TG downgrades from a higher version (e.g., 3.9.3) to any lower version (e.g., 3.9.2). Therefore, the upgrade job will fail if you attempt to downgrade.
+
+A significant security enhancement has been implemented in the TigerGraph 3.9.3 Docker image. This enhancement disables access to TigerGraph pods through the use of a static password. Consequently, it is important to note that installations of TigerGraph versions 3.9.3 and higher are only supported with Operator version 0.0.9 and above.
+
+Additionally, Operator 0.0.9 introduces the Controller.ServiceManager.AutoRestart feature during cluster initialization. This enhancement ensures that services will automatically restart when using gadmin start/restart in the TigerGraph container.
+
+### kubectl plugin installation
+
+To install the kubectl plugin for Operator 0.0.9, please execute the following command:
+
+```bash
+curl https://dl.tigergraph.com/k8s/0.0.9/kubectl-tg -o kubectl-tg
+sudo install kubectl-tg /usr/local/bin/
+```
+
+### CRD upgrading
+
+To upgrade the Custom Resource Definition for Operator 0.0.9, please use the following command:
+
+```bash
+kubectl apply -f https://dl.tigergraph.com/k8s/0.0.9/tg-operator-crd.yaml
+```
+
+### Operator upgrading
+
+> [!WARNING]
+> For TigerGraph 3.9.3 and later versions, the use of passwords to log in to Pods is disabled, which enhances security. If you plan to upgrade your TigerGraph cluster to version 3.9.3, it is essential to first upgrade the Operator to version 0.0.9.
+
+To upgrade the Operator to version 0.0.9, please use the following command:
+
+```bash
+kubectl tg upgrade --namespace ${NAMESPACE_OF_OPERATOR} --operator-version 0.0.9
+```
+
+## New features
+
+- CompressLevel is now supported in `TigerGraphBackup` and `TigerGraphBackupSchedule`, with support for DecompressProcessNumber in TigerGraphRestore. These features require a cluster version of 3.9.3 or higher.([TP-4017](https://graphsql.atlassian.net/browse/TP-4017))
+
+## Improvements
+
+- The help message menu for the `kubectl-tg` plugin has been enhanced. ([TP-3915](https://graphsql.atlassian.net/browse/TP-3915))
+
+- The `.spec.initTGConfig.version` field in TigerGraph CR is now optional. You no longer need to specify this field when creating or updating the CR. ([TP-3910](https://graphsql.atlassian.net/browse/TP-3910))
+
+- Static passwords have been replaced with private keys for executing cluster operations jobs. ([TP-3792](https://graphsql.atlassian.net/browse/TP-3792))
+
+- The make command has been added to support the installation of tsar, and password usage has been disabled when building the TG docker image. ([TP-3786](https://graphsql.atlassian.net/browse/TP-3786))
+
+- Support for automatic restart of TigerGraph service under any circumstances has been introduced. ([TP-3848](https://graphsql.atlassian.net/browse/TP-3848) Database change)
+
+- Service auto-restart in the Operator can now be enabled by setting the TG configuration Controller.ServiceManager.AutoRestart. ([TP-4045](https://graphsql.atlassian.net/browse/TP-4045))
+
+## Bugfixes
+
+- A situation where the cluster was cloned again when a restore had already succeeded has been rectified. ([TP-3948](https://graphsql.atlassian.net/browse/TP-3948))
+
+- A problem with error handling in the TG container's PostStart Handler script has been resolved. ([TP-3914](https://graphsql.atlassian.net/browse/TP-3914))
+
+- A restpp status refresh issue has been addressed. ([CORE-1905](https://graphsql.atlassian.net/browse/CORE-1905))
+
+- GSQL jobs no longer get stuck when some related services are down. ([GLE-5365](https://graphsql.atlassian.net/browse/GLE-5365))
+
+- An issue where expansion was stuck at importing gsql/gui has been fixed. ([TOOLS-2306](https://graphsql.atlassian.net/browse/TOOLS-2306))
diff --git a/k8s/docs/09-samples/backup-restore/backup-schedule-local.yaml b/k8s/docs/09-samples/backup-restore/backup-schedule-local.yaml
new file mode 100644
index 00000000..f49b257e
--- /dev/null
+++ b/k8s/docs/09-samples/backup-restore/backup-schedule-local.yaml
@@ -0,0 +1,40 @@
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphBackupSchedule
+metadata:
+ name: test-cluster-schedule-daily
+spec:
+  # Cronjob schedule
+ schedule: "0 0 * * *"
+ # Strategies for managing backups
+ # We will delete oldest backups according to the strategies automatically
+ strategy:
+ # We will only retain 20 backups
+ maxBackupFiles: 20
+ # A backup can only exist for 3 days
+ maxReservedDays: 3
+ maxRetry: 10
+  # optional: if pause is true, the cronjob will be suspended
+ pause: false
+ backupTemplate:
+ # Specify which cluster to backup in the SAME NAMESPACE as the backup job
+ clusterName: test-cluster
+ # Specify where to store the backup data
+ destination:
+ storage: local
+ # Use this field if type is local
+ local:
+ path: /home/tigergraph/tigergraph/data/backup
+
+ # Configure the name of backup files and the path storing temporary files
+ backupConfig:
+ tag: daily
+ # optional
+ stagingPath: /home/tigergraph/tigergraph/data
+ # optional :if incremental is true, incremental backup will be performed
+ incremental: false
+ # optional
+ timeout: 18000
+ # optional :specify the number of process to do compress
+ compressProcessNumber: 0
+ # optional: (operator>=0.0.9 and tg>=3.9.3) specify the compress level for backup
+ compressLevel: DefaultCompression #choose from DefaultCompression/BestSpeed/BestCompression
\ No newline at end of file
diff --git a/k8s/docs/09-samples/backup-restore/backup-schedule-s3.yaml b/k8s/docs/09-samples/backup-restore/backup-schedule-s3.yaml
new file mode 100644
index 00000000..c8b3e6da
--- /dev/null
+++ b/k8s/docs/09-samples/backup-restore/backup-schedule-s3.yaml
@@ -0,0 +1,39 @@
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphBackupSchedule
+metadata:
+ name: test-cluster-schedule-daily
+spec:
+  # Cronjob schedule
+ schedule: "0 0 * * *"
+ # Strategies for managing backups
+ # We will delete oldest backups according to the strategies automatically
+ strategy:
+ # We will only retain 20 backups
+ maxBackupFiles: 20
+ # A backup can only exist for 3 days
+ maxReservedDays: 3
+ maxRetry: 10
+  # optional: if pause is true, the cronjob will be suspended
+ pause: false
+ backupTemplate:
+ clusterName: test-cluster
+ destination:
+ storage: s3Bucket
+ s3Bucket:
+ # specify the bucket you want to use
+ bucketName: operator-backup
+ secretKey:
+ name: s3-secret
+ # Configure the name of backup files and the path storing temporary files
+ backupConfig:
+ tag: s3
+ # optional
+ stagingPath: /home/tigergraph/tigergraph/data/backup-staging
+ # optional :if incremental is true, incremental backup will be performed
+ incremental: false
+ # optional
+ timeout: 18000
+ # optional :specify the number of process to do compress
+ compressProcessNumber: 0
+ # optional: (operator>=0.0.9 and tg>=3.9.3) specify the compress level for backup
+ compressLevel: DefaultCompression #choose from DefaultCompression/BestSpeed/BestCompression
\ No newline at end of file
diff --git a/k8s/docs/09-samples/backup-restore/backup-to-local.yaml b/k8s/docs/09-samples/backup-restore/backup-to-local.yaml
new file mode 100644
index 00000000..e2c1851e
--- /dev/null
+++ b/k8s/docs/09-samples/backup-restore/backup-to-local.yaml
@@ -0,0 +1,28 @@
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphBackup
+metadata:
+ name: test-cluster-backup-local
+spec:
+ # Specify which cluster to backup in the SAME NAMESPACE as the backup job
+ clusterName: test-cluster
+ # Specify where to store the backup data
+ destination:
+ storage: local
+ # Use this field if type is local
+ local:
+ path: /home/tigergraph/tigergraph/data/backup
+
+ # Configure the name of backup files and the path storing temporary files
+ backupConfig:
+ tag: local
+ # Optional: Set the path for temporary staging files
+ stagingPath: /home/tigergraph/tigergraph/data
+ # Optional: If 'incremental' is set to true, incremental backup will be performed
+ incremental: false
+ # Optional: Set the timeout value for the backup process (default is 18000 seconds)
+ timeout: 18000
+ # Optional: Specify the number of processes to use for compression (0 uses the number of CPU cores)
+ compressProcessNumber: 0
+ # Optional: (Requires operator version >= 0.0.9 and TigerGraph version >= 3.9.3)
+ # Choose the compression level for the backup: DefaultCompression/BestSpeed/BestCompression
+ compressLevel: DefaultCompression # Choose from DefaultCompression/BestSpeed/BestCompression
\ No newline at end of file
diff --git a/k8s/docs/09-samples/backup-restore/backup-to-s3.yaml b/k8s/docs/09-samples/backup-restore/backup-to-s3.yaml
new file mode 100644
index 00000000..0fd9366f
--- /dev/null
+++ b/k8s/docs/09-samples/backup-restore/backup-to-s3.yaml
@@ -0,0 +1,29 @@
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphBackup
+metadata:
+ name: test-cluster-backup-s3
+spec:
+ clusterName: test-cluster
+ destination:
+ storage: s3Bucket
+ s3Bucket:
+ # Specify the name of the S3 bucket you want to use
+ bucketName: operator-backup
+ # Specify the Secret containing the S3 access key and secret access key
+ secretKey:
+ name: aws-secret
+
+ # Configure the name of backup files and the path storing temporary files
+ backupConfig:
+ tag: s3
+ # Optional: Set the path for temporary staging files
+ stagingPath: /home/tigergraph/tigergraph/data
+ # Optional: If 'incremental' is set to true, incremental backup will be performed
+ incremental: false
+ # Optional: Set the timeout value for the backup process (default is 18000 seconds)
+ timeout: 18000
+ # Optional: Specify the number of processes to use for compression (0 uses the number of CPU cores)
+ compressProcessNumber: 0
+ # Optional: (Requires operator version >= 0.0.9 and TigerGraph version >= 3.9.3)
+ # Choose the compression level for the backup: DefaultCompression/BestSpeed/BestCompression
+ compressLevel: DefaultCompression # Choose from DefaultCompression/BestSpeed/BestCompression
\ No newline at end of file
diff --git a/k8s/docs/09-samples/backup-restore/restore-from-local.yaml b/k8s/docs/09-samples/backup-restore/restore-from-local.yaml
new file mode 100644
index 00000000..9b818488
--- /dev/null
+++ b/k8s/docs/09-samples/backup-restore/restore-from-local.yaml
@@ -0,0 +1,18 @@
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphRestore
+metadata:
+ name: restore-from-local
+spec:
+ restoreConfig:
+    # Use the tag to restore from a backup taken in the same cluster
+ tag: local-2023-08-23T061417
+    # Optional: Set the path for temporary staging files during the restore
+ stagingPath: /home/tigergraph/tigergraph/data/restore-staging
+    # Optional: (Requires operator version >= 0.0.9 and TigerGraph version >= 3.9.3) Number of decompression processes; must be >= 0
+ decompressProcessNumber: 2
+ source:
+ storage: local
+ local:
+ path: /home/tigergraph/tigergraph/data/backup
+  # Specify the name of the cluster to restore to
+ clusterName: test-cluster
\ No newline at end of file
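For a restore, restoreConfig.tag must name an existing backup package; as the sample shows, the stored tag combines the backup tag with a timestamp. A hedged sketch of finding the full tag and starting the restore, assuming the pod naming (test-cluster-0) and that gadmin backup list is available in the installed TigerGraph version:

```bash
# List backup packages on the cluster to find the full, timestamped tag
kubectl exec -n tigergraph test-cluster-0 -- \
  /home/tigergraph/tigergraph/app/cmd/gadmin backup list

# Start the restore in the cluster's namespace
kubectl apply -f restore-from-local.yaml -n tigergraph
```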
diff --git a/k8s/docs/09-samples/backup-restore/restore-from-s3.yaml b/k8s/docs/09-samples/backup-restore/restore-from-s3.yaml
new file mode 100644
index 00000000..03355024
--- /dev/null
+++ b/k8s/docs/09-samples/backup-restore/restore-from-s3.yaml
@@ -0,0 +1,20 @@
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraphRestore
+metadata:
+ name: restore-from-s3
+spec:
+ restoreConfig:
+ tag: s3-2023-08-23T060851
+    # Optional: Set the path for temporary staging files during the restore
+ stagingPath: /home/tigergraph/tigergraph/data/restore-staging
+    # Optional: (Requires operator version >= 0.0.9 and TigerGraph version >= 3.9.3) Number of decompression processes; must be >= 0
+ decompressProcessNumber: 2
+ source:
+ storage: s3Bucket
+ s3Bucket:
+      # Specify the name of the S3 bucket you want to use
+ bucketName: operator-backup
+ secretKey:
+ name: aws-secret
+  # Specify the name of the cluster to restore to
+ clusterName: test-cluster
\ No newline at end of file
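Restoring from S3 reuses the same aws-secret credentials Secret as the backup sample, and the tag must match a backup package that exists in the bucket. A minimal sketch of applying and monitoring the restore; the lowercase resource name used with kubectl get is an assumption:

```bash
kubectl apply -f restore-from-s3.yaml -n tigergraph

# Watch the restore CR; the Job created by the operator can also be inspected
kubectl get tigergraphrestore restore-from-s3 -n tigergraph -w
kubectl get jobs -n tigergraph
```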
diff --git a/k8s/docs/09-samples/deploy/tigergraph-cluster.yaml b/k8s/docs/09-samples/deploy/tigergraph-cluster.yaml
new file mode 100644
index 00000000..5fd41525
--- /dev/null
+++ b/k8s/docs/09-samples/deploy/tigergraph-cluster.yaml
@@ -0,0 +1,40 @@
+apiVersion: graphdb.tigergraph.com/v1alpha1
+kind: TigerGraph
+metadata:
+ name: test-cluster
+ namespace: tigergraph
+spec:
+ image: docker.io/tigergraph/tigergraph-k8s:3.9.2
+ imagePullPolicy: Always
+ imagePullSecrets:
+ - name: tigergraph-image-pull-secret
+ initJob:
+ image: docker.io/tigergraph/tigergraph-k8s-init:0.0.7
+ imagePullPolicy: Always
+ imagePullSecrets:
+ - name: tigergraph-image-pull-secret
+ initTGConfig:
+ ha: 2
+ license: xxxxxxxxxxxxxxxxx
+ version: 3.9.2
+ listener:
+ type: LoadBalancer
+ privateKeyName: ssh-key-secret
+ replicas: 3
+ resources:
+ limits:
+ cpu: "2"
+ memory: 8Gi
+ requests:
+ cpu: "2"
+ memory: 8Gi
+ storage:
+ type: persistent-claim
+ volumeClaimTemplate:
+ accessModes:
+ - ReadWriteOnce
+ resources:
+ requests:
+ storage: 10G
+ storageClassName: standard
+ volumeMode: Filesystem
\ No newline at end of file
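The cluster sample above expects the tigergraph namespace and two Secrets (tigergraph-image-pull-secret and ssh-key-secret) to exist before the CR is applied. The sketch below is one plausible preparation sequence; the docker-registry parameters and the SSH key names inside the Secret are assumptions and should be checked against the operator's deployment guide:

```bash
kubectl create namespace tigergraph

# Registry credentials for pulling the TigerGraph images (placeholder values)
kubectl create secret docker-registry tigergraph-image-pull-secret -n tigergraph \
  --docker-server=docker.io --docker-username=<user> --docker-password=<password>

# SSH key pair the operator uses to manage the pods
# (Secret key names are illustrative; use the names the operator expects)
ssh-keygen -t rsa -b 4096 -f ./tigergraph_rsa -N ""
kubectl create secret generic ssh-key-secret -n tigergraph \
  --from-file=private-ssh-key=./tigergraph_rsa \
  --from-file=public-ssh-key=./tigergraph_rsa.pub

kubectl apply -f tigergraph-cluster.yaml
```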
diff --git a/k8s/eks/disable-gui-ha-patch/disable-gui-ha.yaml b/k8s/eks/disable-gui-ha-patch/disable-gui-ha.yaml
deleted file mode 100644
index cd76e89a..00000000
--- a/k8s/eks/disable-gui-ha-patch/disable-gui-ha.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: disable-gui-ha
- labels:
- app: tigergraph
-spec:
- ttlSecondsAfterFinished: 300
- template:
- metadata:
- labels:
- app: tigergraph
- spec:
- serviceAccountName: tigergraph-installer
- initContainers:
- - name: init-disable-gui-ha
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- command:
- - "/bin/sh"
- - "-c"
- - >
- set -e;
- kubectl rollout status --watch --timeout=2h statefulset ${POD_PREFIX};
- sleep 5;
- kubectl wait --for=condition=complete --timeout=6h job/installer || exit 0
- containers:
- - name: disable-gui-ha
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: SERVICE_NAME
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: service.headless.name
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- - name: NAMESPACE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: namespace
- command:
- - "/bin/sh"
- - "-c"
- - |
- set -e;
- export SSHPASS='tigergraph';
- sshpass -e ssh -o StrictHostKeyChecking=no tigergraph@${POD_PREFIX}-0.${SERVICE_NAME}.${NAMESPACE} "
- export PATH=$PATH:/home/tigergraph/tigergraph/app/cmd:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin;
- gadmin config set GUI.BasicConfig.Nodes '[{\"HostID\":\"m1\",\"Partition\":0,\"Replica\":1}]'
- gadmin config apply -y;
- gadmin restart all -y;
- ";
- restartPolicy: Never
- backoffLimit: 0
diff --git a/k8s/eks/disable-gui-ha-patch/kustomization.yaml b/k8s/eks/disable-gui-ha-patch/kustomization.yaml
deleted file mode 100644
index aba60bc5..00000000
--- a/k8s/eks/disable-gui-ha-patch/kustomization.yaml
+++ /dev/null
@@ -1,5 +0,0 @@
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-
-resources:
- - disable-gui-ha.yaml
diff --git a/k8s/eks/kustomization.yaml b/k8s/eks/kustomization.yaml
deleted file mode 100644
index 0fb3deee..00000000
--- a/k8s/eks/kustomization.yaml
+++ /dev/null
@@ -1,37 +0,0 @@
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-
-generatorOptions:
- disableNameSuffixHash: true
- labels:
- app: tigergraph
-
-# source base yaml
-bases:
-- ../base
-
-# revise blow to update global namespace
-namespace: default
-
-# uncomment and revise blow to update images
-#images:
-#- name: tigergraph/tigergraph-k8s
-# newName: aws_account_id.dkr.ecr.us-west-2.amazonaws.com/tigergraph
-# newTag: 3.2.0
-
-configMapGenerator:
-- name: env-config
- literals:
- - service.headless.name=tigergraph
- - pod.prefix=tigergraph
- - namespace=default
- - cluster_size=1
- - license=
- - ha=1
- - version=3.5.0
- - cluster_size.staging=0
- - version.staging=0
-
-patchesStrategicMerge:
-- patch-statfulset.yaml
-
diff --git a/k8s/eks/patch-statfulset.yaml b/k8s/eks/patch-statfulset.yaml
deleted file mode 100644
index 5a95e1c0..00000000
--- a/k8s/eks/patch-statfulset.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
- name: tigergraph
-spec:
- volumeClaimTemplates:
- - metadata:
- name: tg-data
- labels:
- app: tigergraph
- spec:
- accessModes: [ "ReadWriteOnce" ]
- storageClassName: "gp2"
- resources:
- requests:
- storage: 50Gi
diff --git a/k8s/gke/kustomization.yaml b/k8s/gke/kustomization.yaml
deleted file mode 100644
index 76d0fc5e..00000000
--- a/k8s/gke/kustomization.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
-apiVersion: kustomize.config.k8s.io/v1beta1
-kind: Kustomization
-
-generatorOptions:
- disableNameSuffixHash: true
- labels:
- app: tigergraph
-
-# source base yaml
-bases:
-- ../base
-
-# revise blow to update global namespace
-namespace: default
-
-# uncomment and revise blow to update images
-# images:
-# - name: tigergraph/tigergraph-k8s
-# newName: gcr.io/yourregistry/tigergraph
-# newTag: 3.2.0
-
-configMapGenerator:
-- name: env-config
- literals:
- - service.headless.name=tigergraph
- - pod.prefix=tigergraph
- - namespace=default
- - cluster_size=1
- - license=
- - ha=1
- - version=3.5.0
- - cluster_size.staging=0
- - version.staging=0
-
-resources:
-- storageclass-pd.yaml
-
-patchesStrategicMerge:
-- patch-statfulset.yaml
-
diff --git a/k8s/gke/patch-statfulset.yaml b/k8s/gke/patch-statfulset.yaml
deleted file mode 100644
index 27b95672..00000000
--- a/k8s/gke/patch-statfulset.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-apiVersion: apps/v1
-kind: StatefulSet
-metadata:
- name: tigergraph
-spec:
- volumeClaimTemplates:
- - metadata:
- name: tg-data
- labels:
- app: tigergraph
- spec:
- accessModes: [ "ReadWriteOnce" ]
- storageClassName: "gcp-pd-ssd"
- resources:
- requests:
- storage: 50Gi
diff --git a/k8s/gke/storageclass-pd.yaml b/k8s/gke/storageclass-pd.yaml
deleted file mode 100644
index 32dd5cfb..00000000
--- a/k8s/gke/storageclass-pd.yaml
+++ /dev/null
@@ -1,11 +0,0 @@
-apiVersion: storage.k8s.io/v1
-kind: StorageClass
-metadata:
- name: gcp-pd-ssd
- namespace: default
- labels:
- app: tigergraph
-provisioner: kubernetes.io/gce-pd
-parameters:
- type: pd-ssd
-volumeBindingMode: WaitForFirstConsumer
diff --git a/k8s/jobs/cluster-expand.yaml b/k8s/jobs/cluster-expand.yaml
deleted file mode 100644
index 2d73e8c7..00000000
--- a/k8s/jobs/cluster-expand.yaml
+++ /dev/null
@@ -1,90 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: cluster-expand
- labels:
- app: tigergraph
-spec:
- template:
- metadata:
- labels:
- app: tigergraph
- spec:
- serviceAccountName: tigergraph-installer
- initContainers:
- - name: init-cluster-expand
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- command:
- - "/bin/sh"
- - "-c"
- - >
- set -e;
- kubectl rollout status --watch --timeout=2h statefulset ${POD_PREFIX};
- sleep 5;
- containers:
- - name: cluster-expand
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: CLUSTER_SIZE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: cluster_size
- - name: CLUSTER_SIZE_STAGING
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: cluster_size.staging
- - name: HA
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: ha
- - name: SERVICE_NAME
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: service.headless.name
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- - name: NAMESPACE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: namespace
- command:
- - "/bin/sh"
- - "-c"
- - |
- export SSHPASS='tigergraph';
- hostlist=""
- for i in `seq $CLUSTER_SIZE_STAGING $(($CLUSTER_SIZE-1))`
- do
- tg_cfg=$(kubectl exec -n ${NAMESPACE} -i ${POD_PREFIX}-${i} -- /bin/bash -c "/usr/bin/find /home/tigergraph/tigergraph/app -name .tg.cfg|head -n 1");
- sshpass -e ssh -o StrictHostKeyChecking=no tigergraph@${POD_PREFIX}-${i}.${SERVICE_NAME}.${NAMESPACE} "
- export PATH=$PATH:/home/tigergraph/tigergraph/app/cmd:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin;
- ln -sf \"$tg_cfg\" /home/tigergraph/.tg.cfg;
- gadmin init cluster -y --skip-stop;
- gadmin config set System.TempRoot '/home/tigergraph/tigergraph/tmp';
- gadmin config apply -y;
- gadmin restart all -y;
- ";
- host="m$(($i+1)):${POD_PREFIX}-$i.tigergraph";
- if [[ -z "$hostlist" ]]; then
- hostlist="$host";
- else
- hostlist="${hostlist},${host}";
- fi
- done;
- kubectl exec -n ${NAMESPACE} -i ${POD_PREFIX}-0 -- /home/tigergraph/tigergraph/app/cmd/gadmin cluster expand ${hostlist} --ha=${HA} -y;
- restartPolicy: Never
- backoffLimit: 0
diff --git a/k8s/jobs/cluster-post-shrink.yaml b/k8s/jobs/cluster-post-shrink.yaml
deleted file mode 100644
index c576b5e2..00000000
--- a/k8s/jobs/cluster-post-shrink.yaml
+++ /dev/null
@@ -1,68 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: cluster-post-shrink
- labels:
- app: tigergraph
-spec:
- template:
- metadata:
- labels:
- app: tigergraph
- spec:
- serviceAccountName: tigergraph-installer
- initContainers:
- - name: init-cluster-post-shrink
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- command:
- - "/bin/sh"
- - "-c"
- - >
- set -e;
- kubectl rollout status --watch --timeout=2h statefulset ${POD_PREFIX};
- sleep 5;
- containers:
- - name: cluster-post-shrink
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: CLUSTER_SIZE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: cluster_size
- - name: SERVICE_NAME
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: service.headless.name
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- - name: NAMESPACE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: namespace
- command:
- - "/bin/sh"
- - "-c"
- - |
- export SSHPASS='tigergraph';
- for i in `seq 0 $(($CLUSTER_SIZE-1))`
- do
- sshpass -e ssh -o StrictHostKeyChecking=no tigergraph@${POD_PREFIX}-${i}.${SERVICE_NAME}.${NAMESPACE} "
- export PATH=$PATH:/home/tigergraph/tigergraph/app/cmd:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin;
- find /home/tigergraph/tigergraph/data -regex \"/home/tigergraph/tigergraph/data/expansion-export-.*\" -delete;
- find /home/tigergraph/tigergraph/data -regex \"/home/tigergraph/tigergraph/data/gstore/.*-pre-expand-backup-.*\" -delete
- "
- done
- restartPolicy: Never
- backoffLimit: 0
diff --git a/k8s/jobs/cluster-shrink.yaml b/k8s/jobs/cluster-shrink.yaml
deleted file mode 100644
index 2c391e55..00000000
--- a/k8s/jobs/cluster-shrink.yaml
+++ /dev/null
@@ -1,80 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: cluster-shrink
- labels:
- app: tigergraph
-spec:
- template:
- metadata:
- labels:
- app: tigergraph
- spec:
- serviceAccountName: tigergraph-installer
- initContainers:
- - name: init-cluster-shrink
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- command:
- - "/bin/sh"
- - "-c"
- - >
- set -e;
- kubectl rollout status --watch --timeout=2h statefulset ${POD_PREFIX};
- sleep 5;
- containers:
- - name: cluster-shrink
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: CLUSTER_SIZE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: cluster_size
- - name: CLUSTER_SIZE_STAGING
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: cluster_size.staging
- - name: HA_STAGING
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: ha.staging
- - name: SERVICE_NAME
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: service.headless.name
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- - name: NAMESPACE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: namespace
- command:
- - "/bin/sh"
- - "-c"
- - |
- hostlist="";
- for i in `seq $CLUSTER_SIZE_STAGING $(($CLUSTER_SIZE-1))`
- do
- host="m$(($i+1)):${POD_PREFIX}-$i.tigergraph";
- if [[ -z "$hostlist" ]]; then
- hostlist="$host";
- else
- hostlist="${hostlist},${host}";
- fi
- done;
- kubectl exec -n ${NAMESPACE} -i ${POD_PREFIX}-0 -- /home/tigergraph/tigergraph/app/cmd/gadmin cluster shrink ${hostlist} --ha=${HA_STAGING} -y;
- restartPolicy: Never
- backoffLimit: 0
diff --git a/k8s/jobs/cluster-upgrade.yaml b/k8s/jobs/cluster-upgrade.yaml
deleted file mode 100644
index 16a2112c..00000000
--- a/k8s/jobs/cluster-upgrade.yaml
+++ /dev/null
@@ -1,94 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: cluster-upgrade
- labels:
- app: tigergraph
-spec:
- template:
- metadata:
- labels:
- app: tigergraph
- spec:
- serviceAccountName: tigergraph-installer
- initContainers:
- - name: init-cluster-upgrade
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- command:
- - "/bin/sh"
- - "-c"
- - >
- set -e;
- kubectl rollout status --watch --timeout=2h statefulset ${POD_PREFIX};
- sleep 5;
- containers:
- - name: cluster-upgrade
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: CLUSTER_SIZE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: cluster_size
- - name: SERVICE_NAME
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: service.headless.name
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- - name: NAMESPACE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: namespace
- - name: VERSION
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: version
- - name: VERSION_STAGING
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: version.staging
- command:
- - "/bin/sh"
- - "-c"
- - |
- set -x;
- for i in `seq 0 $(($CLUSTER_SIZE-1))`
- do
- kubectl exec -n ${NAMESPACE} ${POD_PREFIX}-${i} -- /bin/bash -c "
- sed -i -E \"s|/home/tigergraph/tigergraph/app/[0-9\.]+|/home/tigergraph/tigergraph/app/${VERSION}|g\" /home/tigergraph/tigergraph/data/configs/tg.cfg &&
- grep -rn /home/tigergraph/.tg.cfg -e \"/app/\"
- "
- done;
- kubectl exec -n ${NAMESPACE} ${POD_PREFIX}-0 -- /bin/bash -c "
- export PATH=$PATH:/home/tigergraph/tigergraph/app/cmd:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin;
- gadmin start infra &&
- gadmin config set System.AppRoot /home/tigergraph/tigergraph/app/${VERSION} &&
- gadmin config apply -y &&
- gadmin stop all -y &&
- gadmin start exe ctrl &&
- gadmin config apply --initial &&
- gadmin start infra &&
- gadmin init kafka -y &&
- gadmin config apply -y && gadmin restart all -y &&
- cp -r /home/tigergraph/tigergraph/data/upgrade-backup/QueryUdf/* /home/tigergraph/tigergraph/app/${VERSION}/dev/gdk/gsql/src/QueryUdf &&
- cp -r /home/tigergraph/tigergraph/data/upgrade-backup/TokenBank/* /home/tigergraph/tigergraph/app/${VERSION}/dev/gdk/gsql/src/TokenBank &&
- rm -rf /home/tigergraph/tigergraph/data/upgrade-backup &&
- gsql recompile loading job &&
- gsql install query -force all
- ";
- restartPolicy: Never
- backoffLimit: 0
diff --git a/k8s/jobs/disable-gui-ha.yaml b/k8s/jobs/disable-gui-ha.yaml
deleted file mode 100644
index cd58b3b0..00000000
--- a/k8s/jobs/disable-gui-ha.yaml
+++ /dev/null
@@ -1,64 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
- name: disable-gui-ha
- namespace: default
- labels:
- app: tigergraph
-spec:
- template:
- metadata:
- labels:
- app: tigergraph
- spec:
- serviceAccountName: tigergraph-installer
- initContainers:
- - name: init-disable-gui-ha
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- command:
- - "/bin/sh"
- - "-c"
- - >
- set -e;
- kubectl rollout status --watch --timeout=2h statefulset ${POD_PREFIX};
- kubectl wait --for=condition=complete --timeout=6h job/installer || exit 0
- sleep 5;
- containers:
- - name: disable-gui-ha
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- env:
- - name: SERVICE_NAME
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: service.headless.name
- - name: POD_PREFIX
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: pod.prefix
- - name: NAMESPACE
- valueFrom:
- configMapKeyRef:
- name: env-config
- key: namespace
- command:
- - "/bin/sh"
- - "-c"
- - |
- set -e;
- export SSHPASS='tigergraph';
- sshpass -e ssh -o StrictHostKeyChecking=no tigergraph@${POD_PREFIX}-0.${SERVICE_NAME}.${NAMESPACE} "
- export PATH=$PATH:/home/tigergraph/tigergraph/app/cmd:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin;
- gadmin config set GUI.BasicConfig.Nodes '[{\"HostID\":\"m1\",\"Partition\":0,\"Replica\":1}]'
- gadmin config apply -y;
- gadmin restart all -y;
- ";
- restartPolicy: Never
- backoffLimit: 0
diff --git a/k8s/jobs/guiserver-labeler.yaml b/k8s/jobs/guiserver-labeler.yaml
deleted file mode 100644
index 0e96e4be..00000000
--- a/k8s/jobs/guiserver-labeler.yaml
+++ /dev/null
@@ -1,61 +0,0 @@
-apiVersion: batch/v1
-kind: CronJob
-metadata:
- name: guiserver-labeler
- namespace: default
-spec:
- schedule: "*/1 * * * *"
- concurrencyPolicy: Forbid
- successfulJobsHistoryLimit: 1
- jobTemplate:
- spec:
- template:
- spec:
- serviceAccountName: tigergraph-installer
- containers:
- - name: guiserver-labeler
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- imagePullPolicy: Always
- command:
- - "/bin/bash"
- - "-c"
- - |
- # Get all hosts
- allhosts_str=$(kubectl get pods -l app=tigergraph -ojson | jq -c '.items | map(. | .metadata) | map(. | .name)');
- IFS=',' read -ra allhosts_raw <<< "$allhosts_str";
- allhosts=()
- for i in "${allhosts_raw[@]}"; do
- if [[ $i =~ (tigergraph-[0-9]+) ]]
- then
- hostname=${BASH_REMATCH[1]};
- allhosts+=($hostname)
- fi
- done;
- # Get hosts running GUI server
- guihosts_str=$(kubectl exec -it tigergraph-0 -- /bin/sh -c "/home/tigergraph/tigergraph/app/cmd/gadmin config get GUI.BasicConfig.Nodes --file ~/.tg.cfg | jq -c 'map(. | .HostID)'");
- IFS=',' read -ra guihosts_raw <<< "$guihosts_str";
- guihosts=()
- for i in "${guihosts_raw[@]}"; do
- if [[ $i =~ m([0-9]+) ]]
- then
- hostid=${BASH_REMATCH[1]};
- hostname="tigergraph-"$((hostid-1));
- kubectl label pods $hostname guiserver=running --overwrite;
- guihosts+=($hostname)
- fi
- done;
- # Calculate diff and remove flag on these nodes
- diff=(`echo ${allhosts[@]} ${guihosts[@]} | tr ' ' '\n' | sort | uniq -u `)
- for i in "${diff[@]}"; do
- kubectl label pods $i guiserver-;
- done;
- initContainers:
- - name: init-guiserver-labeler
- image: tigergraph/tigergraph-k8s-installer:3.5.0
- imagePullPolicy: IfNotPresent
- command:
- - "/bin/sh"
- - "-c"
- - |
- kubectl wait --for=condition=complete --timeout=6h job/installer || exit 0
- restartPolicy: OnFailure
diff --git a/k8s/jobs/service-loadbalancer-gui.yaml b/k8s/jobs/service-loadbalancer-gui.yaml
deleted file mode 100644
index 18d29049..00000000
--- a/k8s/jobs/service-loadbalancer-gui.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-apiVersion: v1
-kind: Service
-metadata:
- labels:
- app: tigergraph
- name: tg-gui-service
- namespace: default
-spec:
- type: LoadBalancer
- selector:
- app: tigergraph
- guiserver: running
- ports:
- - port: 14240
- name: graphstudio
- targetPort: 14240
- sessionAffinity: ClientIP
diff --git a/k8s/tg b/k8s/tg
deleted file mode 100755
index 8343449d..00000000
--- a/k8s/tg
+++ /dev/null
@@ -1,671 +0,0 @@
-#!/bin/bash
-set -e
-# this script is used to simplify the manifest customization and delpoyment of tigergraph in K8S
-
-cwd=$(cd $(dirname $0) && pwd)
-sedi=(-i)
-case "$(uname)" in
- # For mac os
- Darwin*) sedi=(-i "")
-esac
-
-# help documents
-function usage() {
- case $MODE in
- kustomize|create)
-cat << EOF
-Cluster setup usage:
- $0 K8S_PROVIDER [kustomize|create] [OPTIONS]
- -v|--version : (Required) set TG cluster version
- -n|--namespace : set namespace to deploy TG cluster, default as default
- -s|--size : set TG cluster size, default as 1
- -l|--license : set TG cluster license, default as free tier license from docker image
- --ha : set TG cluster ha setting, default as 1
- --pv : set Persistent volume size, default as 50
- --cpu: set Request vCPU number, default as 8
- --mem : set Request Memory size, default as 16
- --prefix : set Pod name prefix-${index}
- --guiserver-freq: set frequency for updating status of gui servers, in (escaped) cronjob format
-
-Examples when working in eks:
- Generate the manifest for deployment
- ./tg eks kustomize -v 3.7.0 -n tigergraph --size 3
- Create TG cluster:
- ./tg eks create -v 3.7.0 -n tigergraph -s 2
-EOF
- ;;
- delete)
-cat << EOF
-Cluster delete usage:
- $0 K8S_PROVIDER delete [OPTIONS]
- -n|--namespace : set namespace used to deploy TG cluster, default as default
- --prefix : set Pod name prefix-${index} used to create TGcluster
-
-Example:
- ./tg eks delete -n tigergraph --prefix tg
-EOF
- ;;
- list)
-cat << EOF
-Cluster list usage:
- $0 K8S_PROVIDER list [OPTIONS]
- -n|--namespace : set namespace used to deploy TG cluster, default as default
-EOF
- ;;
- expand)
-cat << EOF
-Cluster expansion usage:
- $0 K8S_PROVIDER expand [OPTIONS]
- -s|--size : (Required) cluster size to expand to
- --ha : update cluster ha setting, default as 1
- -n|--namespace : set namespace of TG cluster to modify, default as default
- --prefix : set Pod name prefix of TG cluster to modify
-
-Example:
- ./tg eks expand -s 6 --ha 2 -n tigergraph
-EOF
- ;;
- shrink|post-shrink)
-cat << EOF
-Cluster shrink:
- $0 K8S_PROVIDER shrink [OPTIONS]
- -s|--size : (Required) cluster size to shrink to
- --ha : update cluster ha setting, default as 1
- -n|--namespace : set namespace of TG cluster to modify, default as default
- --prefix : set Pod name prefix of TG cluster to modify
-
-Example:
- ./tg eks shrink -s 4 --ha 2 -n tigergraph --prefix tg
-
-After performing the cluster shrink operation, the script will output a command to clean up unused
-pods and resources. Please check that the shrink is successful and all necessary data is moved off
-of the unused pods before executing the command. Example of the command:
- ./tg eks post-shrink -n tigergraph --prefix tg
-EOF
- ;;
- upgrade)
-cat << EOF
-Cluster upgrade:
- $0 K8S_PROVIDER upgrade [OPTIONS]
- -v|--version : (Required) cluster version to upgrade to
- -n|--namespace : set namespace of TG cluster to modify, default as default
- --prefix : set Pod name prefix of TG cluster to modify
-
-Example:
- ./tg eks upgrade -v 3.7.0 -n tigergraph --prefix tg
-EOF
- ;;
- *)
-cat << EOF
-TG script usage:
- $0 K8S_PROVIDER [kustomize|create|delete|list|expand|shrink|post-shrink|upgrade] [OPTIONS]
-
-Commands:
- kustomize|create
- Generates a manifest for the tigergraph cluster. The create command additionally starts up the cluster
- using the generated manifest.
- delete
- Deletes a cluster that uses a manifest generated from this script.
- list
- Lists the pods in the given namespace.
- expand
- Expands an existing cluster.
- shrink|post-shrink
- Shrinks an existing cluster. The post-shrink command cleans up unused resources after the cluster shrink
- is complete; usage of the command will be given when shrink is executed.
- upgrade
- Upgrades an existing cluster to a higher version.
-
-Use
- $0 K8S_PROVIDER [kustomize|create|delete|list|expand|shrink|post-shrink|upgrade] -h
-for information on each command
-EOF
- esac
-exit 0
-}
-
-# parse all arguments
-function set_opts () {
- MODE=""
- while [ -n "$1" ];
- do
- case "$1" in
- kustomize|create|delete|list|expand|shrink|post-shrink|upgrade|help)
- MODE=$1
- ;;
- -n|--namespace)
- shift
- NAMESPACE=$1
- ;;
- -s|--size)
- shift
- CLUSTER_SIZE=$1
- ;;
- --ha)
- shift
- TG_HA=$1
- ;;
- --prefix)
- shift
- TG_PREFIX=$1
- ;;
- --pv)
- shift
- TG_PV=$1
- ;;
- --cpu)
- shift
- TG_CPU=$1
- ;;
- --mem)
- shift
- TG_MEM=$1
- ;;
- --guiserver-freq)
- shift
- GUISERVER_FREQ=$1
- ;;
- -v|--version)
- shift
- DOCKERFILE_VERSION=$1
- # Version number only, i.e. 3.6.0 instead of 3.6.0-image-tag
- if [[ $DOCKERFILE_VERSION =~ ([0-9\.]+) ]]; then
- TG_VERSION=${BASH_REMATCH[1]}
- else
- echo "Unrecognized version format"
- exit 1
- fi
- ;;
- -l|--license)
- shift
- TG_LICENSE=$1
- ;;
- -h|--help)
- HELP_FLAG="true"
- shift
- usage
- ;;
- *)
- echo "Unrecognized flag \"$1\""
- usage
- ;;
- esac
- shift
- done
-}
-# update namespace
-function update_namespace () {
- if [[ -z $NAMESPACE ]]; then
- sed "${sedi[@]}" -e "s/namespace: default/namespace: $cur_namespace/g" $MANIFEST
- elif [ $NAMESPACE != 'default' ]; then
- sed "${sedi[@]}" -e "s/namespace: default/namespace: $NAMESPACE/g" $MANIFEST
- fi
-}
-# change Tigergraph cluster size if not 1
-function update_cluster_size () {
- sed "${sedi[@]}" -e "s/cluster_size: \"[1-9][0-9]*\"/cluster_size: \"${CLUSTER_SIZE}\"/g" $MANIFEST
- sed "${sedi[@]}" -e "s/replicas: [1-9][0-9]*/replicas: ${CLUSTER_SIZE}/g" $MANIFEST
- HOSTS=$(printf '{"ID":"m1","Hostname":"%s-0.tigergraph","Region":""}' "$TG_PREFIX")
- if [ $CLUSTER_SIZE -gt 1 2>/dev/null ]; then
- for i in $(seq 2 ${CLUSTER_SIZE});
- do
- HOSTS+=',{"ID":"m'
- HOSTS+="${i}"
- HOSTS+=$(printf '","Hostname":"%s-' "$TG_PREFIX")
- HOSTS+="$((i-1))"
- HOSTS+='.tigergraph","Region":""}'
- done
- fi
- sed "${sedi[@]}" -e "s/System.HostList=\[.*\]/System.HostList=\[${HOSTS}\]/1" $MANIFEST
-}
-# update Tigergraph Version if not default
-function update_version () {
- # Suppress errors
- set +e
- exec 3>&2
- exec 2> /dev/null
-
- if [[ -z "${DOCKERFILE_VERSION}" || -z "${TG_VERSION}" ]]; then
- echo "Tigergraph version is required for generating manifests"
- exit 1
- elif [[ -z $(curl -I https://hub.docker.com/v2/repositories/tigergraph/tigergraph-k8s/tags/${DOCKERFILE_VERSION} | grep "200") ]]; then
- echo "Tigergraph docker image version ${DOCKERFILE_VERSION} does not exist"
- exit 1
- else
- sed "${sedi[@]}" -E "s/tigergraph-k8s:[0-9\.]+/tigergraph-k8s:${DOCKERFILE_VERSION}/g" $MANIFEST
- sed "${sedi[@]}" -e "s/tigergraph-k8s:VERSION/tigergraph-k8s:${DOCKERFILE_VERSION}/g" $MANIFEST
- sed "${sedi[@]}" -E "s/version: [0-9\.]+/version: ${TG_VERSION}/g" $MANIFEST
- fi
-
- exec 2>&3
-}
-# update Tigergraph license
-function update_license () {
- if [ ! -z "${TG_LICENSE}" ]; then
- sed "${sedi[@]}" -e "s/license: \"\"/license: \"${TG_LICENSE}\"/1" $MANIFEST
- fi
-}
-# update Tigergraph HA
-function update_ha () {
- # check ha <= cluster size
- if [[ ${CLUSTER_SIZE} -lt ${TG_HA} || (${CLUSTER_SIZE} -lt 3 && ${TG_HA} -eq 2) ]]; then
- echo "Entered cluster size ${CLUSTER_SIZE} is not large enough to support HA value of ${TG_HA}"
- exit 1
- fi
-
- if [ ! -z "${TG_HA}" ]; then
- sed "${sedi[@]}" -e "s/ha: \"[0-9]*\"/ha: \"${TG_HA}\"/1" $MANIFEST
- fi
-}
-
-# update Tigergraph PV size
-function update_pv () {
- if [ ! -z "${TG_PV}" ]; then
- sed "${sedi[@]}" -e "s/storage: 50Gi/storage: ${TG_PV}Gi/1" $MANIFEST
- fi
-}
-
-# update Tigergraph Request CPU number
-function update_cpu () {
- if [ ! -z "${TG_CPU}" ]; then
- sed "${sedi[@]}" -e "s/cpu: 8000m/cpu: $((TG_CPU*1000))m/1" $MANIFEST
- fi
-}
-
-# update Tigergraph Request MEM size
-function update_mem () {
- if [ ! -z "${TG_MEM}" ]; then
- sed "${sedi[@]}" -e "s/memory: 16Gi/memory: ${TG_MEM}Gi/1" $MANIFEST
- fi
-}
-
-# update Tigergraph prefix
-function update_prefix () {
- if [ "${TG_PREFIX}" != "tigergraph" ]; then
- sed "${sedi[@]}" -e "s/pod.prefix: tigergraph/pod.prefix: ${TG_PREFIX}/g" $MANIFEST
- sed "${sedi[@]}" -e "s/tigergraph-\[0-9\]/${TG_PREFIX}-\[0-9\]/g" $MANIFEST
- sed "${sedi[@]}" -e "s/tigergraph-0/${TG_PREFIX}-0/g" $MANIFEST
- sed "${sedi[@]}" -e "s/\"tigergraph-\"/\"${TG_PREFIX}-\"/g" $MANIFEST
- sed "${sedi[@]}" -e "/StatefulSet/{n;n;n;n;s/tigergraph/${TG_PREFIX}/;}" $MANIFEST
- fi
-}
-
-# update guiserver update frequency
-function update_guiserver_freq () {
- if [ ! -z "${GUISERVER_FREQ}" ]; then
- sed "${sedi[@]}" -e "s/\*\/1 \* \* \* \*/${GUISERVER_FREQ}/g" $MANIFEST
- fi
-}
-
-# extract previous cluster size from current manifest
-function extract_prev_cluster_size () {
- CLUSTER_SIZE_PREV=$(kubectl get configmap env-config -o yaml -n ${NAMESPACE} | sed -n -E "s/ *cluster_size: \"([1-9][0-9]*)\"/\1/p")
-}
-
-# extract previous version from current manifest
-function extract_prev_version () {
- VERSION_PREV=$(kubectl get configmap env-config -o yaml -n ${NAMESPACE} | sed -n -E "s/ *version: ([0-9\.]+)/\1/p")
- DOCKERFILE_PREV=$(kubectl get statefulsets.apps ${TG_PREFIX} -o json | jq -r ".spec.template.spec.containers[0].image" | sed -n -E "s/docker\.tigergraph\.com\/tigergraph-k8s:(.*)/\1/p")
- if [[ "$VERSION_PREV" = "$TG_VERSION" && "DOCKERFILE_PREV" = "$DOCKERFILE_VERSION" ]]; then
- echo "Previous version matches input version"
- sed "${sedi[@]}" -e "s/tigergraph-\[0-9\]/${TG_PREFIX}-\[0-9\]/g" $MANIFEST
- exit 1
- fi
-}
-
-# create the combined mainfest and customize the settings
-function init_manifest () {
- # initialize default arguments if not provided
- if [[ -z ${CLUSTER_SIZE} ]]; then
- CLUSTER_SIZE="1"
- fi
- if [[ -z ${TG_HA} ]]; then
- TG_HA="1"
- fi
- if [[ -z ${TG_PV} ]]; then
- TG_PV="50"
- fi
- if [[ -z ${TG_CPU} ]]; then
- TG_CPU="8"
- fi
- if [[ -z ${TG_MEM} ]]; then
- TG_MEM="16"
- fi
- if [[ -z ${GUISERVER_FREQ} ]]; then
- GUISERVER_FREQ="\*\/1 \* \* \* \*"
- fi
-
- kubectl -n ${NAMESPACE} kustomize ${cwd}/${K8S_PROVIDER} > $MANIFEST
-
- # loadbalancer patch for tg versions < 3.7.0
- loadbalancer_patch
-
- update_namespace
- update_cluster_size
- update_version
- update_license
- update_ha
- update_pv
- update_cpu
- update_mem
- update_prefix
- update_guiserver_freq
-}
-
-# start Tigergraph cluster
-function tg_start () {
- if [ -f "$MANIFEST" ]; then
- kubectl -n ${NAMESPACE} apply -f $MANIFEST
- else
- echo "Deployment manifest not found at $MANIFEST"
- exit 1
- fi
-}
-
-# clean up all resouces for Tigergraph cluster.
-# It will also clean up all data, be careful to use!!
-function tg_cleanup () {
- if [ -f "$MANIFEST" ]; then
- kubectl delete -f $MANIFEST || :
- kubectl -n ${NAMESPACE} delete pvc -l app=${TG_PREFIX} || :
- else
- echo "Deployment manifest not found at $MANIFEST"
- exit 1
- fi
-}
-
-# check for cluster existence with the current namespace and prefix
-function check_cluster_existence () {
- # suppress errors and redirect stderr to prevent kubectl errors
- set +e
- exec 3>&2
- exec 2> /dev/null
-
- if [[ ! $(kubectl get namespace ${NAMESPACE} | grep "NAME") ]]; then
- echo "No namespace ${NAMESPACE} has been found"
- exit 1
- fi
-
- if [[ ! $(kubectl get statefulsets.apps -n ${NAMESPACE} | grep ${TG_PREFIX}) ]]; then
- echo "No cluster with prefix ${TG_PREFIX} has been found in the namespace ${NAMESPACE}"
- exit 1
- fi
-
- set -e
- exec 2>&3
-}
-
-# start cluster expansion
-function expand_start () {
- if [ -f "$EXPAND_MANIFEST" ]; then
- # check job already exists
- if [[ ! -z $(kubectl get jobs -n ${NAMESPACE} | grep "cluster-expand") ]]; then
- echo "An expand job already exists in the given namespace, make sure it is completed and delete it before rerunning the command"
- exit 1
- fi
-
- if [[ -z "${CLUSTER_SIZE}" ]]; then
- echo "Cluster size is required for expand job"
- exit 1
- fi
-
- if [[ -z ${CLUSTER_SIZE_PREV} || ${CLUSTER_SIZE} -le ${CLUSTER_SIZE_PREV} ]]; then
- echo "Entered cluster size (${CLUSTER_SIZE}) is not greater than current cluster size (${CLUSTER_SIZE_PREV})"
- exit 1
- fi
-
- # if ha is not changed we need to reuse the current ha
- if [[ -z ${TG_HA} ]]; then
- TG_HA=$(kubectl get configmaps env-config -n ${NAMESPACE} -o json | jq -r ".data.\"ha\"")
- fi
-
- # update the previous and current cluster size in manifest
- if [ ! -z ${CLUSTER_SIZE_PREV} ]; then
- sed "${sedi[@]}" -e "s/cluster_size\.staging: \"[0-9]\"/cluster_size\.staging: \"${CLUSTER_SIZE_PREV}\"/g" $MANIFEST
- fi
-
- update_cluster_size
- update_ha
- tg_start
-
- kubectl -n ${NAMESPACE} apply -f $EXPAND_MANIFEST
- else
- echo "Expansion job manifest not found at $EXPAND_MANIFEST"
- exit 1
- fi
-}
-
-# start cluster shrink
-function shrink_start () {
- if [ -f "$SHRINK_MANIFEST" ]; then
- # check job already exists
- if [[ ! -z $(kubectl get jobs -n ${NAMESPACE} | grep "cluster-shrink") ]]; then
- echo "A shrink job already exists in the given namespace, make sure it is completed and delete it before rerunning the command"
- exit 1
- fi
-
- if [[ -z "${CLUSTER_SIZE}" ]]; then
- echo "Cluster size is required for shrink job"
- exit 1
- fi
-
- if [[ -z ${CLUSTER_SIZE_PREV} || ${CLUSTER_SIZE} -ge ${CLUSTER_SIZE_PREV} ]]; then
- echo "Entered cluster size (${CLUSTER_SIZE}) is not less than current cluster size (${CLUSTER_SIZE_PREV})"
- exit 1
- fi
-
- # if ha is not changed we need to reuse the current ha
- if [[ -z ${TG_HA} ]]; then
- TG_HA=$(kubectl get configmaps env-config -n ${NAMESPACE} -o json | jq -r ".data.\"ha\"")
- fi
-
- # check ha <= cluster size
- if [[ ${CLUSTER_SIZE} -lt ${TG_HA} || (${CLUSTER_SIZE} -lt 3 && ${TG_HA} -eq 2) ]]; then
- echo "Entered cluster size ${CLUSTER_SIZE} is not large enough to support HA value of ${TG_HA}"
- exit 1
- fi
-
- # need to patch cluster_size & ha staging in configmap, since we are not updating the deployment
- kubectl patch configmap/env-config -n ${NAMESPACE} --type merge -p {\"data\":{\"cluster_size.staging\":\"${CLUSTER_SIZE}\"}}
- kubectl patch configmap/env-config -n ${NAMESPACE} --type merge -p {\"data\":{\"ha.staging\":\"${TG_HA}\"}}
-
- kubectl -n ${NAMESPACE} apply -f $SHRINK_MANIFEST
-
- echo "The cluster shrink job has been started. After checking that the cluster and data is correct, use the following command to delete the unused pods:"
- echo " ./tg ${K8S_PROVIDER} post-shrink -n ${NAMESPACE} --prefix ${TG_PREFIX}"
- else
- echo "Shrink job manifest not found at $SHRINK_MANIFEST"
- exit 1
- fi
-}
-
-# start post cluster shrink pod cleanup
-function post_shrink_start () {
- # check job already exists
- if [[ ! -z $(kubectl get jobs -n ${NAMESPACE} | grep "cluster-post-shrink") ]]; then
- echo "A post-shrink job already exists in the given namespace, make sure it is completed and delete it before rerunning the command"
- exit 1
- fi
-
- # get shrink cluster size & ha from configmap
- CLUSTER_SIZE=$(kubectl get configmaps env-config -n ${NAMESPACE} -o json | jq -r ".data.\"cluster_size.staging\"")
- OLD_CLUSTER_SIZE=$(kubectl get configmaps env-config -n ${NAMESPACE} -o json | jq -r ".data.\"cluster_size\"")
- TG_HA=$(kubectl get configmaps env-config -n ${NAMESPACE} -o json | jq -r ".data.\"ha.staging\"")
-
- # check current hostlist size from gadmin
- cur_cluster_size=$(kubectl exec -n ${NAMESPACE} ${TG_PREFIX}-0 -- /home/tigergraph/tigergraph/app/cmd/gadmin config get System.HostList | grep -o "${TG_PREFIX}-" | wc -l | sed 's/ //g')
-
- if [ $CLUSTER_SIZE -ne $cur_cluster_size ]; then
- echo "Entered cluster size does not match the current cluster size of ${cur_cluster_size}"
- exit 1
- fi
-
- update_cluster_size
- update_ha
- tg_start
-
- kubectl -n ${NAMESPACE} apply -f $POST_SHRINK_MANIFEST
-
- # delete pvc
- for i in `seq $CLUSTER_SIZE $(($OLD_CLUSTER_SIZE-1))`
- do
- kubectl -n ${NAMESPACE} delete pvc tg-data-${TG_PREFIX}-${i}
- done
-}
-
-# start cluster upgrade
-function upgrade_start () {
- if [ -f "$UPGRADE_MANIFEST" ]; then
- # check job already exists
- if [[ ! -z $(kubectl get jobs -n ${NAMESPACE} | grep "cluster-upgrade") ]]; then
- echo "An upgrade job already exists in the given namespace, make sure it is completed and delete it before rerunning the command"
- exit 1
- fi
-
- if [[ ! ${TG_VERSION} > ${VERSION_PREV} ]]; then
- if [[ ${DOCKERFILE_PREV} = ${DOCKERFILE_VERSION} ]]; then
- echo "Entered version (${TG_VERSION}) is not greater than current version (${VERSION_PREV})"
- exit 1
- fi
- fi
-
- update_version
-
- # copy APPROOT/dev/gdk/gsql/src/QueryUdf and TokenBank to transfer between version
- # these folders only exist for m0 nodes
- approot_cur="/home/tigergraph/tigergraph/app/${VERSION_PREV}"
- kubectl exec -n ${NAMESPACE} ${TG_PREFIX}-0 -- /bin/bash -c "
- mkdir -p /home/tigergraph/tigergraph/data/upgrade-backup &&
- cp -r ${approot_cur}/dev/gdk/gsql/src/QueryUdf /home/tigergraph/tigergraph/data/upgrade-backup &&
- cp -r ${approot_cur}/dev/gdk/gsql/src/TokenBank /home/tigergraph/tigergraph/data/upgrade-backup &&
- /home/tigergraph/tigergraph/app/cmd/gadmin stop all -y
- "
-
- tg_start
-
- kubectl -n ${NAMESPACE} apply -f $UPGRADE_MANIFEST
- else
- echo "Upgrade job manifest not found at $UPGRADE_MANIFEST"
- exit 1
- fi
-}
-
-# loadbalancer patch for versions < 3.7.0
-function loadbalancer_patch () {
- if [[ ${TG_VERSION} < "3.7.0" ]]; then
- echo "---" >> $MANIFEST
- cat "${CWD}/jobs/service-loadbalancer-gui.yaml" >> $MANIFEST
- echo "---" >> $MANIFEST
- cat "${CWD}/jobs/guiserver-labeler.yaml" >> $MANIFEST
-
- if [ "$K8S_PROVIDER" = "eks" ]; then
- sed "${sedi[@]}" -e "/sessionAffinity: ClientIP/d" $MANIFEST
- echo "---" >> $MANIFEST
- cat "${CWD}/jobs/disable-gui-ha.yaml" >> $MANIFEST
- fi
- else
- sed "${sedi[@]}" -e "/sessionAffinity: ClientIP/d" $MANIFEST
- fi
-}
-
-# main entry
-function main () {
- CWD=$(cd $(dirname $0) && pwd)
- [ -d ${CWD}/deploy ] || mkdir -p ${CWD}/deploy
-
- shopt -s nocasematch
- if [[ $1 =~ gke|eks|aks ]]; then
- K8S_PROVIDER=$1
- shift
- else
- MISSING_PROVIDER="true"
- fi
- shopt -u nocasematch
-
- set_opts "$@"
-
- if [[ ! -z ${MISSING_PROVIDER} ]]; then
- if [[ ! -z ${HELP_FLAG} ]]; then
- usage
- else
- echo "Missing or unrecognized K8s provider, got \"$1\""
- exit 1
- fi
- fi
-
- # set argument defaults (arguments required for cluster initialization are set in init_manifest
- if [[ -z ${TG_PREFIX} ]]; then
- TG_PREFIX="tigergraph"
- fi
-
- # use namespace specified in context if unset
- if [[ -z ${NAMESPACE} ]]; then
- NAMESPACE=$(kubectl config view --minify -o jsonpath='{..namespace}')
- if [[ -z ${NAMESPACE} ]]; then
- echo "Could not automatically retrieve namespace from current context, please use the --namespace argument"
- exit 1
- fi
- fi
-
- MANIFEST="${CWD}/deploy/tigergraph-${K8S_PROVIDER}-${NAMESPACE}.yaml"
- case $MODE in
- kustomize)
- init_manifest
- ;;
- create)
- init_manifest
-
- # check cluster does not already exist
- set +e
- exec 3>&2
- exec 2> /dev/null
-
- if [[ $(kubectl get statefulsets.apps -n ${NAMESPACE} | grep ${TG_PREFIX}) ]]; then
- echo "A cluster with prefix ${TG_PREFIX} already exists in the namespace ${NAMESPACE}"
- exit 1
- fi
-
- exec 2>&3
-
- tg_start
- ;;
- delete)
- tg_cleanup
- ;;
- list)
- kubectl -n ${NAMESPACE} get pods -o wide
- ;;
- expand)
- check_cluster_existence
-
- EXPAND_MANIFEST="${CWD}/jobs/cluster-expand.yaml"
- extract_prev_cluster_size
- expand_start
- ;;
- shrink)
- check_cluster_existence
-
- SHRINK_MANIFEST="${CWD}/jobs/cluster-shrink.yaml"
- extract_prev_cluster_size
- shrink_start
- ;;
- post-shrink)
- check_cluster_existence
-
- POST_SHRINK_MANIFEST="${CWD}/jobs/cluster-post-shrink.yaml"
- post_shrink_start
- ;;
- upgrade)
- check_cluster_existence
-
- UPGRADE_MANIFEST="${CWD}/jobs/cluster-upgrade.yaml"
- extract_prev_cluster_size
- extract_prev_version
- upgrade_start
- ;;
- *)
- usage
- esac
-}
-
-main "$@"