From 4787a6d736cdca30af7aacad989e0e5b87cfe28c Mon Sep 17 00:00:00 2001
From: Ye Cao <caoye.cao@alibaba-inc.com>
Date: Tue, 14 Nov 2023 17:55:26 +0800
Subject: [PATCH 1/4] Change the API of kubeflow pipeline from
 vineyard.csi.read/writer to client.get/put

Signed-off-by: Ye Cao <caoye.cao@alibaba-inc.com>
---
 k8s/examples/vineyard-kubeflow/Dockerfile     |  10 ++
 k8s/examples/vineyard-kubeflow/Makefile       |  23 ++++
 .../pipeline-with-vineyard.py                 |  69 +++++++++++
 .../pipeline-with-vineyard.yaml               | 116 ++++++++++++++++++
 k8s/examples/vineyard-kubeflow/pipeline.py    |  45 +++++++
 k8s/examples/vineyard-kubeflow/pipeline.yaml  | 105 ++++++++++++++++
 .../vineyard-kubeflow/prepare-data.yaml       |  57 +++++++++
 .../prepare-data/prepare-data.py              |  76 ++++++++++++
 .../preprocess/preprocess.py                  |  89 ++++++++++++++
 k8s/examples/vineyard-kubeflow/rbac.yaml      |  31 +++++
 k8s/examples/vineyard-kubeflow/test/test.py   |  46 +++++++
 k8s/examples/vineyard-kubeflow/train/train.py |  44 +++++++
 12 files changed, 711 insertions(+)
 create mode 100644 k8s/examples/vineyard-kubeflow/Dockerfile
 create mode 100644 k8s/examples/vineyard-kubeflow/Makefile
 create mode 100644 k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.py
 create mode 100644 k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.yaml
 create mode 100644 k8s/examples/vineyard-kubeflow/pipeline.py
 create mode 100644 k8s/examples/vineyard-kubeflow/pipeline.yaml
 create mode 100644 k8s/examples/vineyard-kubeflow/prepare-data.yaml
 create mode 100644 k8s/examples/vineyard-kubeflow/prepare-data/prepare-data.py
 create mode 100644 k8s/examples/vineyard-kubeflow/preprocess/preprocess.py
 create mode 100644 k8s/examples/vineyard-kubeflow/rbac.yaml
 create mode 100644 k8s/examples/vineyard-kubeflow/test/test.py
 create mode 100644 k8s/examples/vineyard-kubeflow/train/train.py

diff --git a/k8s/examples/vineyard-kubeflow/Dockerfile b/k8s/examples/vineyard-kubeflow/Dockerfile
new file mode 100644
index 000000000..612e2d22e
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/Dockerfile
@@ -0,0 +1,10 @@
+FROM python:3.10
+
+RUN pip3 install --no-cache-dir pandas requests scikit-learn numpy vineyard
+
+WORKDIR /
+
+ARG APP
+ENV APP ${APP}
+
+COPY ${APP} /${APP}
diff --git a/k8s/examples/vineyard-kubeflow/Makefile b/k8s/examples/vineyard-kubeflow/Makefile
new file mode 100644
index 000000000..3ce332c36
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/Makefile
@@ -0,0 +1,23 @@
+REGISTRY := "ghcr.io/v6d-io/v6d/kubeflow-example"
+docker-build:
+	docker build prepare-data/ -f Dockerfile \
+		--build-arg APP=prepare-data.py \
+		-t $(REGISTRY)/prepare-data
+
+	docker build preprocess/ -f Dockerfile \
+		--build-arg APP=preprocess.py \
+		-t $(REGISTRY)/preprocess-data
+
+	docker build train/ -f Dockerfile \
+		--build-arg APP=train.py \
+		-t $(REGISTRY)/train-data
+	
+	docker build test/ -f Dockerfile \
+		--build-arg APP=test.py \
+		-t $(REGISTRY)/test-data
+
+push-images:
+	docker push $(REGISTRY)/prepare-data
+	docker push $(REGISTRY)/preprocess-data
+	docker push $(REGISTRY)/train-data
+	docker push $(REGISTRY)/test-data
diff --git a/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.py b/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.py
new file mode 100644
index 000000000..7fe7d71a8
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.py
@@ -0,0 +1,69 @@
+from kfp import dsl
+from kubernetes.client.models import V1EnvVar
+import kubernetes as k8s
+
+def PreProcess(data_multiplier: int, registry: str):
+    #############################################################
+    vineyard_volume = dsl.PipelineVolume(volume=k8s.client.V1Volume(
+        name="vineyard-socket",
+        host_path=k8s.client.V1HostPathVolumeSource(path="/var/run/vineyard-kubernetes/vineyard-system/vineyardd-sample")))
+
+    return dsl.ContainerOp(
+        name='Preprocess Data',
+        image = f'{registry}/preprocess-data',
+        container_kwargs={
+            'image_pull_policy': "Always",
+            'env': [V1EnvVar('VINEYARD_IPC_SOCKET', '/var/run/vineyard.sock')]
+        },
+        pvolumes={
+            "/data": dsl.PipelineVolume(pvc="benchmark-data"),
+            "/var/run": vineyard_volume,
+        },
+        command = ['python3', 'preprocess.py'],
+        arguments=[f'--data_multiplier={data_multiplier}', '--with_vineyard=True'],
+    )
+
+def Train(comp1, registry: str):
+    return dsl.ContainerOp(
+        name='Train Data',
+        image=f'{registry}/train-data',
+        container_kwargs={
+            'image_pull_policy': "Always",
+            'env': [V1EnvVar('VINEYARD_IPC_SOCKET', '/var/run/vineyard.sock')]
+        },
+        pvolumes={
+            "/data": comp1.pvolumes['/data'],
+            "/var/run": comp1.pvolumes['/var/run'],
+        },
+        command = ['python3', 'train.py'],
+        arguments=['--with_vineyard=True'],
+    )
+
+def Test(comp1, comp2, registry: str):
+    return dsl.ContainerOp(
+        name='Test Data',
+        image=f'{registry}/test-data',
+        container_kwargs={
+            'image_pull_policy': "Always",
+            'env': [V1EnvVar('VINEYARD_IPC_SOCKET', '/var/run/vineyard.sock')]
+        },
+        pvolumes={
+            "/data": comp2.pvolumes['/data'],
+            "/var/run": comp1.pvolumes['/var/run']
+        },
+        command = ['python3', 'test.py'],
+        arguments=['--with_vineyard=True'],
+    )
+
+@dsl.pipeline(
+   name='Machine learning Pipeline',
+   description='An example pipeline that trains and logs a regression model.'
+)
+def pipeline(data_multiplier: int, registry: str):
+    comp1 = PreProcess(data_multiplier=data_multiplier, registry=registry)
+    comp2 = Train(comp1, registry=registry)
+    comp3 = Test(comp1, comp2, registry=registry)
+
+if __name__ == '__main__':
+    from kfp import compiler
+    compiler.Compiler().compile(pipeline, __file__[:-3]+ '.yaml')
diff --git a/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.yaml b/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.yaml
new file mode 100644
index 000000000..7ffb8e42b
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.yaml
@@ -0,0 +1,116 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Workflow
+metadata:
+  generateName: machine-learning-pipeline-
+  annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.21, pipelines.kubeflow.org/pipeline_compilation_time: '2023-11-14T17:50:23.371870',
+    pipelines.kubeflow.org/pipeline_spec: '{"description": "An example pipeline that
+      trains and logs a regression model.", "inputs": [{"name": "data_multiplier",
+      "type": "Integer"}, {"name": "registry", "type": "String"}], "name": "Machine
+      learning Pipeline"}'}
+  labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.21}
+spec:
+  entrypoint: machine-learning-pipeline
+  templates:
+  - name: machine-learning-pipeline
+    inputs:
+      parameters:
+      - {name: data_multiplier}
+      - {name: registry}
+    dag:
+      tasks:
+      - name: preprocess-data
+        template: preprocess-data
+        arguments:
+          parameters:
+          - {name: data_multiplier, value: '{{inputs.parameters.data_multiplier}}'}
+          - {name: registry, value: '{{inputs.parameters.registry}}'}
+      - name: test-data
+        template: test-data
+        dependencies: [preprocess-data, train-data]
+        arguments:
+          parameters:
+          - {name: registry, value: '{{inputs.parameters.registry}}'}
+      - name: train-data
+        template: train-data
+        dependencies: [preprocess-data]
+        arguments:
+          parameters:
+          - {name: registry, value: '{{inputs.parameters.registry}}'}
+  - name: preprocess-data
+    container:
+      args: ['--data_multiplier={{inputs.parameters.data_multiplier}}', --with_vineyard=True]
+      command: [python3, preprocess.py]
+      env:
+      - {name: VINEYARD_IPC_SOCKET, value: /var/run/vineyard.sock}
+      image: '{{inputs.parameters.registry}}/preprocess-data'
+      imagePullPolicy: Always
+      volumeMounts:
+      - {mountPath: /data, name: pvolume-d9c6725a1237b14c08a2567cb12c489bec539873deeddba7d87f5b4}
+      - {mountPath: /var/run, name: vineyard-socket}
+    inputs:
+      parameters:
+      - {name: data_multiplier}
+      - {name: registry}
+    metadata:
+      labels:
+        pipelines.kubeflow.org/kfp_sdk_version: 1.8.21
+        pipelines.kubeflow.org/pipeline-sdk-type: kfp
+        pipelines.kubeflow.org/enable_caching: "true"
+    volumes:
+    - name: pvolume-d9c6725a1237b14c08a2567cb12c489bec539873deeddba7d87f5b4
+      persistentVolumeClaim: {claimName: benchmark-data}
+    - hostPath: {path: /var/run/vineyard-kubernetes/vineyard-system/vineyardd-sample}
+      name: vineyard-socket
+  - name: test-data
+    container:
+      args: [--with_vineyard=True]
+      command: [python3, test.py]
+      env:
+      - {name: VINEYARD_IPC_SOCKET, value: /var/run/vineyard.sock}
+      image: '{{inputs.parameters.registry}}/test-data'
+      imagePullPolicy: Always
+      volumeMounts:
+      - {mountPath: /data, name: pvolume-d9c6725a1237b14c08a2567cb12c489bec539873deeddba7d87f5b4}
+      - {mountPath: /var/run, name: vineyard-socket}
+    inputs:
+      parameters:
+      - {name: registry}
+    metadata:
+      labels:
+        pipelines.kubeflow.org/kfp_sdk_version: 1.8.21
+        pipelines.kubeflow.org/pipeline-sdk-type: kfp
+        pipelines.kubeflow.org/enable_caching: "true"
+    volumes:
+    - name: pvolume-d9c6725a1237b14c08a2567cb12c489bec539873deeddba7d87f5b4
+      persistentVolumeClaim: {claimName: benchmark-data}
+    - hostPath: {path: /var/run/vineyard-kubernetes/vineyard-system/vineyardd-sample}
+      name: vineyard-socket
+  - name: train-data
+    container:
+      args: [--with_vineyard=True]
+      command: [python3, train.py]
+      env:
+      - {name: VINEYARD_IPC_SOCKET, value: /var/run/vineyard.sock}
+      image: '{{inputs.parameters.registry}}/train-data'
+      imagePullPolicy: Always
+      volumeMounts:
+      - {mountPath: /data, name: pvolume-d9c6725a1237b14c08a2567cb12c489bec539873deeddba7d87f5b4}
+      - {mountPath: /var/run, name: vineyard-socket}
+    inputs:
+      parameters:
+      - {name: registry}
+    metadata:
+      labels:
+        pipelines.kubeflow.org/kfp_sdk_version: 1.8.21
+        pipelines.kubeflow.org/pipeline-sdk-type: kfp
+        pipelines.kubeflow.org/enable_caching: "true"
+    volumes:
+    - name: pvolume-d9c6725a1237b14c08a2567cb12c489bec539873deeddba7d87f5b4
+      persistentVolumeClaim: {claimName: benchmark-data}
+    - hostPath: {path: /var/run/vineyard-kubernetes/vineyard-system/vineyardd-sample}
+      name: vineyard-socket
+  arguments:
+    parameters:
+    - {name: data_multiplier}
+    - {name: registry}
+  serviceAccountName: pipeline-runner
diff --git a/k8s/examples/vineyard-kubeflow/pipeline.py b/k8s/examples/vineyard-kubeflow/pipeline.py
new file mode 100644
index 000000000..c58f7ce61
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/pipeline.py
@@ -0,0 +1,45 @@
+from kfp import dsl
+
+def PreProcess(data_multiplier: int, registry: str):
+    return dsl.ContainerOp(
+        name='Preprocess Data',
+        image = f'{registry}/preprocess-data',
+        container_kwargs={'image_pull_policy':"Always"},
+        command = ['python3', 'preprocess.py'],
+        arguments = [f'--data_multiplier={data_multiplier}'],
+        # add the existing volume to the pipeline
+        pvolumes={"/data": dsl.PipelineVolume(pvc="benchmark-data")},
+    )
+
+def Train(comp1, registry: str):
+    return dsl.ContainerOp(
+        name='Train Data',
+        image=f'{registry}/train-data',
+        container_kwargs={'image_pull_policy':"Always"},
+        command = ['python3', 'train.py'],
+        
+        pvolumes={"/data": comp1.pvolumes['/data']},
+    )
+
+def Test(comp2, registry: str):
+    return dsl.ContainerOp(
+        name='Test Data',
+        image=f'{registry}/test-data',
+        container_kwargs={'image_pull_policy':"Always"},
+        command = ['python3', 'test.py'],
+
+        pvolumes={"/data": comp2.pvolumes['/data']},
+    )
+
+@dsl.pipeline(
+   name='Machine Learning Pipeline',
+   description='An example pipeline that trains and logs a regression model.'
+)
+def pipeline(data_multiplier: int, registry: str):
+    comp1 = PreProcess(data_multiplier=data_multiplier, registry=registry)
+    comp2 = Train(comp1, registry=registry)
+    comp3 = Test(comp2, registry=registry)
+
+if __name__ == '__main__':
+    from kfp import compiler
+    compiler.Compiler().compile(pipeline, __file__[:-3]+ '.yaml')
diff --git a/k8s/examples/vineyard-kubeflow/pipeline.yaml b/k8s/examples/vineyard-kubeflow/pipeline.yaml
new file mode 100644
index 000000000..fe94f9485
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/pipeline.yaml
@@ -0,0 +1,105 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Workflow
+metadata:
+  generateName: machine-learning-pipeline-
+  annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.0, pipelines.kubeflow.org/pipeline_compilation_time: '2023-10-10T15:14:12.195049',
+    pipelines.kubeflow.org/pipeline_spec: '{"description": "An example pipeline that
+      trains and logs a regression model.", "inputs": [{"name": "data_multiplier",
+      "type": "Integer"}, {"name": "registry", "type": "String"}], "name": "Machine
+      Learning Pipeline"}'}
+  labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.0}
+spec:
+  entrypoint: machine-learning-pipeline
+  templates:
+  - name: machine-learning-pipeline
+    inputs:
+      parameters:
+      - {name: data_multiplier}
+      - {name: registry}
+    dag:
+      tasks:
+      - name: preprocess-data
+        template: preprocess-data
+        arguments:
+          parameters:
+          - {name: data_multiplier, value: '{{inputs.parameters.data_multiplier}}'}
+          - {name: registry, value: '{{inputs.parameters.registry}}'}
+      - name: test-data
+        template: test-data
+        dependencies: [train-data]
+        arguments:
+          parameters:
+          - {name: registry, value: '{{inputs.parameters.registry}}'}
+      - name: train-data
+        template: train-data
+        dependencies: [preprocess-data]
+        arguments:
+          parameters:
+          - {name: registry, value: '{{inputs.parameters.registry}}'}
+  - name: preprocess-data
+    container:
+      args: ['--data_multiplier={{inputs.parameters.data_multiplier}}']
+      command: [python3, preprocess.py]
+      image: '{{inputs.parameters.registry}}/preprocess-data'
+      imagePullPolicy: Always
+      securityContext:
+        privileged: true
+      volumeMounts:
+      - {mountPath: /data, name: pvolume-d9c6725a1237b14c08a2567cb12c489bec539873deeddba7d87f5b4}
+    inputs:
+      parameters:
+      - {name: data_multiplier}
+      - {name: registry}
+    metadata:
+      labels:
+        pipelines.kubeflow.org/kfp_sdk_version: 1.8.0
+        pipelines.kubeflow.org/pipeline-sdk-type: kfp
+        pipelines.kubeflow.org/enable_caching: "true"
+    volumes:
+    - name: pvolume-d9c6725a1237b14c08a2567cb12c489bec539873deeddba7d87f5b4
+      persistentVolumeClaim: {claimName: benchmark-data}
+  - name: test-data
+    container:
+      command: [python3, test.py]
+      image: '{{inputs.parameters.registry}}/test-data'
+      imagePullPolicy: Always
+      securityContext:
+        privileged: true
+      volumeMounts:
+      - {mountPath: /data, name: pvolume-d9c6725a1237b14c08a2567cb12c489bec539873deeddba7d87f5b4}
+    inputs:
+      parameters:
+      - {name: registry}
+    metadata:
+      labels:
+        pipelines.kubeflow.org/kfp_sdk_version: 1.8.0
+        pipelines.kubeflow.org/pipeline-sdk-type: kfp
+        pipelines.kubeflow.org/enable_caching: "true"
+    volumes:
+    - name: pvolume-d9c6725a1237b14c08a2567cb12c489bec539873deeddba7d87f5b4
+      persistentVolumeClaim: {claimName: benchmark-data}
+  - name: train-data
+    container:
+      command: [python3, train.py]
+      image: '{{inputs.parameters.registry}}/train-data'
+      imagePullPolicy: Always
+      securityContext:
+        privileged: true
+      volumeMounts:
+      - {mountPath: /data, name: pvolume-d9c6725a1237b14c08a2567cb12c489bec539873deeddba7d87f5b4}
+    inputs:
+      parameters:
+      - {name: registry}
+    metadata:
+      labels:
+        pipelines.kubeflow.org/kfp_sdk_version: 1.8.0
+        pipelines.kubeflow.org/pipeline-sdk-type: kfp
+        pipelines.kubeflow.org/enable_caching: "true"
+    volumes:
+    - name: pvolume-d9c6725a1237b14c08a2567cb12c489bec539873deeddba7d87f5b4
+      persistentVolumeClaim: {claimName: benchmark-data}
+  arguments:
+    parameters:
+    - {name: data_multiplier}
+    - {name: registry}
+  serviceAccountName: pipeline-runner
diff --git a/k8s/examples/vineyard-kubeflow/prepare-data.yaml b/k8s/examples/vineyard-kubeflow/prepare-data.yaml
new file mode 100644
index 000000000..9475ff9fd
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/prepare-data.yaml
@@ -0,0 +1,57 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: prepare-data
+  namespace: kubeflow
+spec:
+  selector:
+    matchLabels:
+      app: prepare-data
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app: prepare-data
+    spec:
+      containers:
+      - name: prepare-data
+        image: ghcr.io/v6d-io/v6d/kubeflow-example/prepare-data
+        imagePullPolicy: Always
+        command: ["python3", "/prepare-data.py"]
+        volumeMounts:
+        - mountPath: /data
+          name: data
+      volumes:
+      - name: data
+        persistentVolumeClaim:
+          claimName: benchmark-data
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: benchmark-data
+  namespace: kubeflow
+spec:
+  storageClassName: manual
+  accessModes:
+  - ReadWriteMany
+  resources:
+    requests:
+      storage: 30Gi
+---
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: benchmark-data
+  namespace: kubeflow
+  labels:
+    type: local
+spec:
+  storageClassName: manual
+  capacity:
+    storage: 30Gi
+  accessModes:
+    - ReadWriteMany
+  hostPath:
+    # mount a nfs volume to the kind nodes
+    path: "/mnt/csi-benchmark"
\ No newline at end of file
diff --git a/k8s/examples/vineyard-kubeflow/prepare-data/prepare-data.py b/k8s/examples/vineyard-kubeflow/prepare-data/prepare-data.py
new file mode 100644
index 000000000..9c26d19d3
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/prepare-data/prepare-data.py
@@ -0,0 +1,76 @@
+import time
+
+import numpy as np
+import pandas as pd
+
+
+def generate_random_dataframe(num_rows):
+    return pd.DataFrame({
+            'Id': np.random.randint(1, 100000, num_rows),
+            'MSSubClass': np.random.randint(20, 201, size=num_rows),
+            'LotFrontage': np.random.randint(50, 151, size=num_rows),
+            'LotArea': np.random.randint(5000, 20001, size=num_rows),
+            'OverallQual': np.random.randint(1, 11, size=num_rows),
+            'OverallCond': np.random.randint(1, 11, size=num_rows),
+            'YearBuilt': np.random.randint(1900, 2022, size=num_rows),
+            'YearRemodAdd': np.random.randint(1900, 2022, size=num_rows),
+            'MasVnrArea': np.random.randint(0, 1001, size=num_rows),
+            'BsmtFinSF1': np.random.randint(0, 2001, size=num_rows),
+            'BsmtFinSF2': np.random.randint(0, 1001, size=num_rows),
+            'BsmtUnfSF': np.random.randint(0, 2001, size=num_rows),
+            'TotalBsmtSF': np.random.randint(0, 3001, size=num_rows),
+            '1stFlrSF': np.random.randint(500, 4001, size=num_rows),
+            '2ndFlrSF': np.random.randint(0, 2001, size=num_rows),
+            'LowQualFinSF': np.random.randint(0, 201, size=num_rows),
+            'GrLivArea': np.random.randint(600, 5001, size=num_rows),
+            'BsmtFullBath': np.random.randint(0, 4, size=num_rows),
+            'BsmtHalfBath': np.random.randint(0, 3, size=num_rows),
+            'FullBath': np.random.randint(0, 5, size=num_rows),
+            'HalfBath': np.random.randint(0, 3, size=num_rows),
+            'BedroomAbvGr': np.random.randint(0, 11, size=num_rows),
+            'KitchenAbvGr': np.random.randint(0, 4, size=num_rows),
+            'TotRmsAbvGrd': np.random.randint(0, 16, size=num_rows),
+            'Fireplaces': np.random.randint(0, 4, size=num_rows),
+            'GarageYrBlt': np.random.randint(1900, 2022, size=num_rows),
+            'GarageCars': np.random.randint(0, 5, num_rows),
+            'GarageArea': np.random.randint(0, 1001, num_rows),
+            'WoodDeckSF': np.random.randint(0, 501, num_rows),
+            'OpenPorchSF': np.random.randint(0, 301, num_rows),
+            'EnclosedPorch': np.random.randint(0, 201, num_rows),
+            '3SsnPorch': np.random.randint(0, 101, num_rows),
+            'ScreenPorch': np.random.randint(0, 201, num_rows),
+            'PoolArea': np.random.randint(0, 301, num_rows),
+            'MiscVal': np.random.randint(0, 5001, num_rows),
+            'TotalRooms': np.random.randint(2, 11, num_rows),
+            "GarageAge": np.random.randint(1, 31, num_rows),
+            "RemodAge": np.random.randint(1, 31, num_rows),
+            "HouseAge": np.random.randint(1, 31, num_rows),
+            "TotalBath": np.random.randint(1, 5, num_rows),
+            "TotalPorchSF": np.random.randint(1, 1001, num_rows),
+            "TotalSF": np.random.randint(1000, 6001, num_rows),
+            "TotalArea": np.random.randint(1000, 6001, num_rows),
+            'MoSold': np.random.randint(1, 13, num_rows),
+            'YrSold': np.random.randint(2006, 2022, num_rows),
+            'SalePrice': np.random.randint(50000, 800001, num_rows),
+        })
+
+def prepare_data():
+    print('Start preparing data....', flush=True)
+    st = time.time()
+    for multiplier in 4000, 5000, 6000:
+        df = generate_random_dataframe(10000*(multiplier))
+        df.to_pickle('/data/df_{}.pkl'.format(multiplier))
+        del df
+    ed = time.time()
+    print('##################################', flush=True)
+    print('dataframe to_pickle time: ', ed - st, flush=True)
+
+
+if __name__ == '__main__':
+    st = time.time()
+    print('Preparing data....', flush=True)
+    prepare_data()
+    ed = time.time()
+    print('##################################')
+    print('preparing data time: ', ed - st, flush=True)
+    time.sleep(10000000)
diff --git a/k8s/examples/vineyard-kubeflow/preprocess/preprocess.py b/k8s/examples/vineyard-kubeflow/preprocess/preprocess.py
new file mode 100644
index 000000000..93ff2f4b4
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/preprocess/preprocess.py
@@ -0,0 +1,89 @@
+import argparse
+import os
+import time
+
+#from sklearn.compose import ColumnTransformer
+from sklearn.model_selection import train_test_split
+#from sklearn.preprocessing import OneHotEncoder
+
+import pandas as pd
+import vineyard
+
+
+def preprocess_data(data_multiplier, with_vineyard):
+    os.system('sync; echo 3 > /proc/sys/vm/drop_caches')
+    st = time.time()
+    df = pd.read_pickle('/data/df_{0}.pkl'.format(data_multiplier))
+
+    ed = time.time()
+    print('##################################')
+    print('read dataframe pickle time: ', ed - st)
+
+    df = df.drop(df[(df['GrLivArea']>4800)].index)
+
+    """ The following part will need large memory usage, disable for benchmark
+    del df
+
+    # Define the categorical feature columns
+    categorical_features = df_preocessed.select_dtypes(include='object').columns
+
+    # Create the column transformer for one-hot encoding
+    preprocessor = ColumnTransformer(
+        transformers=[('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_features)],
+        remainder='passthrough'
+    )
+
+    # Preprocess the features using the column transformer
+    one_hot_df = preprocessor.fit_transform(df_preocessed)
+
+    # Get the column names for the encoded features
+    encoded_feature_names = preprocessor.named_transformers_['encoder'].get_feature_names_out(categorical_features)
+
+    columns = list(encoded_feature_names) + list(df_preocessed.select_dtypes(exclude='object').columns)
+
+    del df_preocessed
+
+    # Concatenate the encoded features with the original numerical features
+    df = pd.DataFrame(one_hot_df, columns=columns)
+
+    del one_hot_df
+    """
+
+    X = df.drop('SalePrice', axis=1)  # Features
+    y = df['SalePrice']  # Target variable
+
+    del df
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+
+    del X, y
+
+    st = time.time()
+    if with_vineyard:
+        client = vineyard.connect()
+        client.put(X_train, name="/data/x_train.pkl", persist=True)
+        client.put(X_test, name="/data/x_test.pkl", persist=True)
+        client.put(y_train, name="/data/y_train.pkl", persist=True)
+        client.put(y_test, name="/data/y_test.pkl", persist=True)
+    else:
+        X_train.to_pickle('/data/x_train.pkl')
+        X_test.to_pickle('/data/x_test.pkl')
+        y_train.to_pickle('/data/y_train.pkl')
+        y_test.to_pickle('/data/y_test.pkl')
+
+    ed = time.time()
+    print('##################################')
+    print('write training and testing data time: ', ed - st)
+
+
+if __name__ == '__main__':  # CLI entry point: parse flags, run the preprocessing step, print its wall time
+    parser = argparse.ArgumentParser()  # NOTE: type=bool would read any non-empty string, even "False", as True
+    parser.add_argument('--data_multiplier', type=int, default=1, help='Multiplier for data')
+    parser.add_argument('--with_vineyard', type=lambda v: v.lower() not in ('', '0', 'false', 'no'), default=False, help='Whether to use vineyard')
+    args = parser.parse_args()
+    st = time.time()
+    print('Preprocessing data...')
+    preprocess_data(args.data_multiplier, args.with_vineyard)
+    ed = time.time()
+    print('##################################')
+    print('Preprocessing data time: ', ed - st)
diff --git a/k8s/examples/vineyard-kubeflow/rbac.yaml b/k8s/examples/vineyard-kubeflow/rbac.yaml
new file mode 100644
index 000000000..cbd71ef53
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/rbac.yaml
@@ -0,0 +1,31 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: pipeline-runner
+  namespace: kubeflow
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: pipeline-runner-role
+rules:
+  - apiGroups: [""]
+    resources: ["persistentvolumeclaims"]
+    verbs: ["get", "create", "update", "list", "delete"]
+  - apiGroups: [""]
+    resources: ["pods"]
+    verbs: ["get", "patch", "create", "update", "list", "delete"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: pipeline-runner-rolebinding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: pipeline-runner-role
+subjects:
+  - kind: ServiceAccount
+    name: pipeline-runner
+    namespace: kubeflow
\ No newline at end of file
diff --git a/k8s/examples/vineyard-kubeflow/test/test.py b/k8s/examples/vineyard-kubeflow/test/test.py
new file mode 100644
index 000000000..eacef6152
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/test/test.py
@@ -0,0 +1,46 @@
+import argparse
+import os
+import time
+
+from sklearn.metrics import mean_squared_error
+
+import joblib
+import pandas as pd
+import vineyard
+
+def test_model(with_vineyard):
+    os.system('sync; echo 3 > /proc/sys/vm/drop_caches')
+    st = time.time()
+    if with_vineyard:
+        client = vineyard.connect()
+        x_test_data = client.get(name="/data/x_test.pkl", fetch=True)
+        y_test_data = client.get(name="/data/y_test.pkl", fetch=True)
+    else:
+        x_test_data = pd.read_pickle("/data/x_test.pkl")
+        y_test_data = pd.read_pickle("/data/y_test.pkl")
+        #delete the x_test.pkl and y_test.pkl
+        os.remove("/data/x_test.pkl")
+        os.remove("/data/y_test.pkl")
+    ed = time.time()
+    print('##################################')
+    print('read x_test and y_test execution time: ', ed - st)
+
+    model = joblib.load("/data/model.pkl")
+    y_pred = model.predict(x_test_data)
+
+    err = mean_squared_error(y_test_data, y_pred)
+
+    with open('/data/output.txt', 'a') as f:
+        f.write(str(err))
+
+
+if __name__ == '__main__':  # CLI entry point: parse --with_vineyard, run the test step, print its wall time
+    parser = argparse.ArgumentParser()  # NOTE: type=bool would read any non-empty string, even "False", as True
+    parser.add_argument('--with_vineyard', type=lambda v: v.lower() not in ('', '0', 'false', 'no'), default=False, help='Whether to use vineyard')
+    args = parser.parse_args()
+    st = time.time()
+    print('Testing model...')
+    test_model(args.with_vineyard)
+    ed = time.time()
+    print('##################################')
+    print('Testing model data time: ', ed - st)
diff --git a/k8s/examples/vineyard-kubeflow/train/train.py b/k8s/examples/vineyard-kubeflow/train/train.py
new file mode 100644
index 000000000..5a534ab09
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/train/train.py
@@ -0,0 +1,44 @@
+import argparse
+import os
+import time
+
+from sklearn.linear_model import LinearRegression
+
+import joblib
+import pandas as pd
+import vineyard
+
+
+def train_model(with_vineyard):
+    os.system('sync; echo 3 > /proc/sys/vm/drop_caches')
+    st = time.time()
+    if with_vineyard:
+        client = vineyard.connect()
+        x_train_data = client.get(name="/data/x_train.pkl", fetch=True)
+        y_train_data = client.get(name="/data/y_train.pkl", fetch=True)
+    else:
+        x_train_data = pd.read_pickle("/data/x_train.pkl")
+        y_train_data = pd.read_pickle("/data/y_train.pkl")
+        # delete the x_train.pkl and y_train.pkl
+        os.remove("/data/x_train.pkl")
+        os.remove("/data/y_train.pkl")
+    ed = time.time()
+    print('##################################')
+    print('read x_train and y_train data time: ', ed - st)
+
+    model = LinearRegression()
+    model.fit(x_train_data, y_train_data)
+
+    joblib.dump(model, '/data/model.pkl')
+
+
+if __name__ == '__main__':  # CLI entry point: parse --with_vineyard, run the training step, print its wall time
+    parser = argparse.ArgumentParser()  # NOTE: type=bool would read any non-empty string, even "False", as True
+    parser.add_argument('--with_vineyard', type=lambda v: v.lower() not in ('', '0', 'false', 'no'), default=False, help='Whether to use vineyard')
+    args = parser.parse_args()
+    st = time.time()
+    print('Training model...')
+    train_model(args.with_vineyard)
+    ed = time.time()
+    print('##################################')
+    print('Training model data time: ', ed - st)

From 90e1f083b58de31f614c8f0adea00302b9bf7c1f Mon Sep 17 00:00:00 2001
From: Ye Cao <caoye.cao@alibaba-inc.com>
Date: Fri, 17 Nov 2023 16:50:42 +0800
Subject: [PATCH 2/4] * Add a quick installation section for vineyard cluster
 in the deploy on kubernetes doc. * Add a readme for kubeflow examples.

Signed-off-by: Ye Cao <caoye.cao@alibaba-inc.com>
---
 docs/notes/cloud-native/deploy-kubernetes.rst | 27 +++++++++
 docs/notes/developers/build-from-source.rst   | 14 -----
 k8s/examples/vineyard-csidriver/Makefile      |  2 +-
 .../pipeline-kfp-v2-with-vineyard.py          |  6 +-
 .../pipeline-kfp-v2-with-vineyard.yaml        |  6 +-
 .../vineyard-csidriver/pipeline-kfp-v2.py     |  6 +-
 .../vineyard-csidriver/pipeline-kfp-v2.yaml   |  6 +-
 .../vineyard-csidriver/prepare-data.yaml      |  2 +-
 .../pipeline-with-vineyard.py                 | 41 +++++++++----
 .../pipeline-with-vineyard.yaml               | 16 ++++-
 k8s/examples/vineyard-kubeflow/pipeline.py    | 36 +++++++----
 .../vineyard-kubeflow/prepare-data.yaml       |  2 +-
 k8s/examples/vineyard-kubeflow/readme.md      | 60 +++++++++++++++++++
 13 files changed, 170 insertions(+), 54 deletions(-)
 create mode 100644 k8s/examples/vineyard-kubeflow/readme.md

diff --git a/docs/notes/cloud-native/deploy-kubernetes.rst b/docs/notes/cloud-native/deploy-kubernetes.rst
index 1464343b1..9858050bd 100644
--- a/docs/notes/cloud-native/deploy-kubernetes.rst
+++ b/docs/notes/cloud-native/deploy-kubernetes.rst
@@ -5,6 +5,32 @@ Deploy on Kubernetes
 
 Vineyard is managed by the :ref:`vineyard-operator` on Kubernetes.
 
+Quick start
+-----------
+
+If you want to install a vineyard cluster quickly, you can
+use the following commands.
+
+Install `vineyardctl`_ as follows.
+
+.. code:: bash
+
+    export LATEST_TAG=$(curl -s "https://api.github.com/repos/v6d-io/v6d/tags" | jq -r '.[0].name')
+    export OS=$(uname -s | tr '[:upper:]' '[:lower:]')
+    export ARCH=${$(uname -m)/x86_64/amd64}
+    curl -Lo vineyardctl https://github.com/v6d-io/v6d/releases/download/$LATEST_TAG/vineyardctl-$LATEST_TAG-$OS-$ARCH
+    chmod +x vineyardctl
+    sudo mv vineyardctl /usr/local/bin/
+
+Use the vineyardctl to install vineyard cluster.
+
+.. code:: bash
+
+    vineyardctl install vineyard-cluster --create-namespace
+
+Also, you could follow the next guide to install vineyard cluster steps
+by steps.
+
 Install vineyard-operator
 -------------------------
 
@@ -196,5 +222,6 @@ automates much of the boilerplate configuration required when deploying workflow
    ^^^^^^^^^^^^
    :code:`vineyardctl` is the command-line tool for working with the Vineyard Operator.
 
+.. _vineyardctl: https://github.com/v6d-io/v6d/blob/main/k8s/cmd/README.md
 .. _kind: https://kind.sigs.k8s.io
 .. _CRD: https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions
diff --git a/docs/notes/developers/build-from-source.rst b/docs/notes/developers/build-from-source.rst
index c9fd0f3f3..030515f9b 100644
--- a/docs/notes/developers/build-from-source.rst
+++ b/docs/notes/developers/build-from-source.rst
@@ -139,20 +139,6 @@ After building the vineyard library successfully, you can package an install whe
 
     python3 setup.py bdist_wheel
 
-Install vineyardctl
--------------------
-
-Vineyardctl is available on the Github release page, you can download the binary as follows:
-
-.. code:: shell
-
-    export LATEST_TAG=$(curl -s "https://api.github.com/repos/v6d-io/v6d/tags" | jq -r '.[0].name')
-    export OS=$(uname -s | tr '[:upper:]' '[:lower:]')
-    export ARCH=${$(uname -m)/x86_64/amd64}
-    curl -Lo vineyardctl https://github.com/v6d-io/v6d/releases/download/$LATEST_TAG/vineyardctl-$LATEST_TAG-$OS-$ARCH
-    chmod +x vineyardctl
-    sudo mv vineyardctl /usr/local/bin/
-
 
 Building the documentation
 --------------------------
diff --git a/k8s/examples/vineyard-csidriver/Makefile b/k8s/examples/vineyard-csidriver/Makefile
index 3ce332c36..ff1d7d193 100644
--- a/k8s/examples/vineyard-csidriver/Makefile
+++ b/k8s/examples/vineyard-csidriver/Makefile
@@ -1,4 +1,4 @@
-REGISTRY := "ghcr.io/v6d-io/v6d/kubeflow-example"
+REGISTRY := "ghcr.io/v6d-io/v6d/csidriver-example"
 docker-build:
 	docker build prepare-data/ -f Dockerfile \
 		--build-arg APP=prepare-data.py \
diff --git a/k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.py b/k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.py
index da961cfca..e636252ac 100644
--- a/k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.py
+++ b/k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.py
@@ -4,7 +4,7 @@
 @dsl.container_component
 def PreProcess(data_multiplier: int):
     return dsl.ContainerSpec(
-        image = 'ghcr.io/v6d-io/v6d/kubeflow-example/preprocess-data',
+        image = 'ghcr.io/v6d-io/v6d/csidriver-example/preprocess-data',
         command = ['python3', 'preprocess.py'],
         args = [f'--data_multiplier={data_multiplier}', '--with_vineyard=True'],
     )
@@ -12,7 +12,7 @@ def PreProcess(data_multiplier: int):
 @dsl.container_component
 def Train():
     return dsl.ContainerSpec(
-        image = 'ghcr.io/v6d-io/v6d/kubeflow-example/train-data',
+        image = 'ghcr.io/v6d-io/v6d/csidriver-example/train-data',
         command = ['python3', 'train.py'],
         args = ['--with_vineyard=True'],
     )
@@ -20,7 +20,7 @@ def Train():
 @dsl.container_component
 def Test():
     return dsl.ContainerSpec(
-        image = 'ghcr.io/v6d-io/v6d/kubeflow-example/test-data',
+        image = 'ghcr.io/v6d-io/v6d/csidriver-example/test-data',
         command = ['python3', 'test.py'],
         args = ['--with_vineyard=True'],
     )
diff --git a/k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.yaml b/k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.yaml
index 8d0f979b2..c6aab3905 100644
--- a/k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.yaml
+++ b/k8s/examples/vineyard-csidriver/pipeline-kfp-v2-with-vineyard.yaml
@@ -99,7 +99,7 @@ deploymentSpec:
         command:
         - python3
         - preprocess.py
-        image: ghcr.io/v6d-io/v6d/kubeflow-example/preprocess-data
+        image: ghcr.io/v6d-io/v6d/csidriver-example/preprocess-data
     exec-test:
       container:
         args:
@@ -107,7 +107,7 @@ deploymentSpec:
         command:
         - python3
         - test.py
-        image: ghcr.io/v6d-io/v6d/kubeflow-example/test-data
+        image: ghcr.io/v6d-io/v6d/csidriver-example/test-data
     exec-train:
       container:
         args:
@@ -115,7 +115,7 @@ deploymentSpec:
         command:
         - python3
         - train.py
-        image: ghcr.io/v6d-io/v6d/kubeflow-example/train-data
+        image: ghcr.io/v6d-io/v6d/csidriver-example/train-data
 pipelineInfo:
   description: An example pipeline that trains and logs a regression model.
   name: machine-learning-pipeline-with-vineyard
diff --git a/k8s/examples/vineyard-csidriver/pipeline-kfp-v2.py b/k8s/examples/vineyard-csidriver/pipeline-kfp-v2.py
index 993ef4be5..88e548395 100644
--- a/k8s/examples/vineyard-csidriver/pipeline-kfp-v2.py
+++ b/k8s/examples/vineyard-csidriver/pipeline-kfp-v2.py
@@ -4,7 +4,7 @@
 @dsl.container_component
 def PreProcess(data_multiplier: int):
     return dsl.ContainerSpec(
-        image = 'ghcr.io/v6d-io/v6d/kubeflow-example/preprocess-data',
+        image = 'ghcr.io/v6d-io/v6d/csidriver-example/preprocess-data',
         command = ['python3', 'preprocess.py'],
         args=[f'--data_multiplier={data_multiplier}'],
     )
@@ -12,14 +12,14 @@ def PreProcess(data_multiplier: int):
 @dsl.container_component
 def Train():
     return dsl.ContainerSpec(
-        image='ghcr.io/v6d-io/v6d/kubeflow-example/train-data',
+        image='ghcr.io/v6d-io/v6d/csidriver-example/train-data',
         command = ['python3', 'train.py'],
     )
 
 @dsl.container_component
 def Test():
     return dsl.ContainerSpec(
-        image='ghcr.io/v6d-io/v6d/kubeflow-example/test-data',
+        image='ghcr.io/v6d-io/v6d/csidriver-example/test-data',
         command = ['python3', 'test.py'],
     )
 
diff --git a/k8s/examples/vineyard-csidriver/pipeline-kfp-v2.yaml b/k8s/examples/vineyard-csidriver/pipeline-kfp-v2.yaml
index ed6944036..c9f13e230 100644
--- a/k8s/examples/vineyard-csidriver/pipeline-kfp-v2.yaml
+++ b/k8s/examples/vineyard-csidriver/pipeline-kfp-v2.yaml
@@ -23,19 +23,19 @@ deploymentSpec:
         command:
         - python3
         - preprocess.py
-        image: ghcr.io/v6d-io/v6d/kubeflow-example/preprocess-data
+        image: ghcr.io/v6d-io/v6d/csidriver-example/preprocess-data
     exec-test:
       container:
         command:
         - python3
         - test.py
-        image: ghcr.io/v6d-io/v6d/kubeflow-example/test-data
+        image: ghcr.io/v6d-io/v6d/csidriver-example/test-data
     exec-train:
       container:
         command:
         - python3
         - train.py
-        image: ghcr.io/v6d-io/v6d/kubeflow-example/train-data
+        image: ghcr.io/v6d-io/v6d/csidriver-example/train-data
 pipelineInfo:
   description: An example pipeline that trains and logs a regression model.
   name: machine-learning-pipeline
diff --git a/k8s/examples/vineyard-csidriver/prepare-data.yaml b/k8s/examples/vineyard-csidriver/prepare-data.yaml
index 9475ff9fd..c8b051d98 100644
--- a/k8s/examples/vineyard-csidriver/prepare-data.yaml
+++ b/k8s/examples/vineyard-csidriver/prepare-data.yaml
@@ -15,7 +15,7 @@ spec:
     spec:
       containers:
       - name: prepare-data
-        image: ghcr.io/v6d-io/v6d/kubeflow-example/prepare-data
+        image: ghcr.io/v6d-io/v6d/csidriver-example/prepare-data
         imagePullPolicy: Always
         command: ["python3", "/prepare-data.py"]
         volumeMounts:
diff --git a/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.py b/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.py
index 7fe7d71a8..610af0343 100644
--- a/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.py
+++ b/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.py
@@ -3,12 +3,16 @@
 import kubernetes as k8s
 
 def PreProcess(data_multiplier: int, registry: str):
-    #############################################################
-    vineyard_volume = dsl.PipelineVolume(volume=k8s.client.V1Volume(
-        name="vineyard-socket",
-        host_path=k8s.client.V1HostPathVolumeSource(path="/var/run/vineyard-kubernetes/vineyard-system/vineyardd-sample")))
+    vineyard_volume = dsl.PipelineVolume(
+        volume=k8s.client.V1Volume(
+            name="vineyard-socket",
+            host_path=k8s.client.V1HostPathVolumeSource(
+                path="/var/run/vineyard-kubernetes/vineyard-system/vineyardd-sample"
+            )
+        )
+    )
 
-    return dsl.ContainerOp(
+    op = dsl.ContainerOp(
         name='Preprocess Data',
         image = f'{registry}/preprocess-data',
         container_kwargs={
@@ -22,9 +26,14 @@ def PreProcess(data_multiplier: int, registry: str):
         command = ['python3', 'preprocess.py'],
         arguments=[f'--data_multiplier={data_multiplier}', '--with_vineyard=True'],
     )
+    op.add_pod_label('scheduling.k8s.v6d.io/vineyardd-namespace', 'vineyard-system')
+    op.add_pod_label('scheduling.k8s.v6d.io/vineyardd', 'vineyardd-sample')
+    op.add_pod_label('scheduling.k8s.v6d.io/job', 'preprocess-data')
+    op.add_pod_annotation('scheduling.k8s.v6d.io/required', '')
+    return op
 
 def Train(comp1, registry: str):
-    return dsl.ContainerOp(
+    op = dsl.ContainerOp(
         name='Train Data',
         image=f'{registry}/train-data',
         container_kwargs={
@@ -38,9 +47,14 @@ def Train(comp1, registry: str):
         command = ['python3', 'train.py'],
         arguments=['--with_vineyard=True'],
     )
+    op.add_pod_label('scheduling.k8s.v6d.io/vineyardd-namespace', 'vineyard-system')
+    op.add_pod_label('scheduling.k8s.v6d.io/vineyardd', 'vineyardd-sample')
+    op.add_pod_label('scheduling.k8s.v6d.io/job', 'train-data')
+    op.add_pod_annotation('scheduling.k8s.v6d.io/required', 'preprocess-data')
+    return op
 
-def Test(comp1, comp2, registry: str):
-    return dsl.ContainerOp(
+def Test(comp2, registry: str):
+    op = dsl.ContainerOp(
         name='Test Data',
         image=f'{registry}/test-data',
         container_kwargs={
@@ -49,20 +63,25 @@ def Test(comp1, comp2, registry: str):
         },
         pvolumes={
             "/data": comp2.pvolumes['/data'],
-            "/var/run": comp1.pvolumes['/var/run']
+            "/var/run": comp2.pvolumes['/var/run']
         },
         command = ['python3', 'test.py'],
         arguments=['--with_vineyard=True'],
     )
+    op.add_pod_label('scheduling.k8s.v6d.io/vineyardd-namespace', 'vineyard-system')
+    op.add_pod_label('scheduling.k8s.v6d.io/vineyardd', 'vineyardd-sample')
+    op.add_pod_label('scheduling.k8s.v6d.io/job', 'test-data')
+    op.add_pod_annotation('scheduling.k8s.v6d.io/required', 'train-data')
+    return op
 
 @dsl.pipeline(
-   name='Machine learning Pipeline',
+   name='Machine Learning Pipeline',
    description='An example pipeline that trains and logs a regression model.'
 )
 def pipeline(data_multiplier: int, registry: str):
     comp1 = PreProcess(data_multiplier=data_multiplier, registry=registry)
     comp2 = Train(comp1, registry=registry)
-    comp3 = Test(comp1, comp2, registry=registry)
+    comp3 = Test(comp2, registry=registry)
 
 if __name__ == '__main__':
     from kfp import compiler
diff --git a/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.yaml b/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.yaml
index 7ffb8e42b..acff2db51 100644
--- a/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.yaml
+++ b/k8s/examples/vineyard-kubeflow/pipeline-with-vineyard.yaml
@@ -2,7 +2,7 @@ apiVersion: argoproj.io/v1alpha1
 kind: Workflow
 metadata:
   generateName: machine-learning-pipeline-
-  annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.21, pipelines.kubeflow.org/pipeline_compilation_time: '2023-11-14T17:50:23.371870',
+  annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.21, pipelines.kubeflow.org/pipeline_compilation_time: '2023-11-17T16:10:29.221000',
     pipelines.kubeflow.org/pipeline_spec: '{"description": "An example pipeline that
       trains and logs a regression model.", "inputs": [{"name": "data_multiplier",
       "type": "Integer"}, {"name": "registry", "type": "String"}], "name": "Machine
@@ -26,7 +26,7 @@ spec:
           - {name: registry, value: '{{inputs.parameters.registry}}'}
       - name: test-data
         template: test-data
-        dependencies: [preprocess-data, train-data]
+        dependencies: [train-data]
         arguments:
           parameters:
           - {name: registry, value: '{{inputs.parameters.registry}}'}
@@ -52,7 +52,11 @@ spec:
       - {name: data_multiplier}
       - {name: registry}
     metadata:
+      annotations: {scheduling.k8s.v6d.io/required: ''}
       labels:
+        scheduling.k8s.v6d.io/vineyardd-namespace: vineyard-system
+        scheduling.k8s.v6d.io/vineyardd: vineyardd-sample
+        scheduling.k8s.v6d.io/job: preprocess-data
         pipelines.kubeflow.org/kfp_sdk_version: 1.8.21
         pipelines.kubeflow.org/pipeline-sdk-type: kfp
         pipelines.kubeflow.org/enable_caching: "true"
@@ -76,7 +80,11 @@ spec:
       parameters:
       - {name: registry}
     metadata:
+      annotations: {scheduling.k8s.v6d.io/required: train-data}
       labels:
+        scheduling.k8s.v6d.io/vineyardd-namespace: vineyard-system
+        scheduling.k8s.v6d.io/vineyardd: vineyardd-sample
+        scheduling.k8s.v6d.io/job: test-data
         pipelines.kubeflow.org/kfp_sdk_version: 1.8.21
         pipelines.kubeflow.org/pipeline-sdk-type: kfp
         pipelines.kubeflow.org/enable_caching: "true"
@@ -100,7 +108,11 @@ spec:
       parameters:
       - {name: registry}
     metadata:
+      annotations: {scheduling.k8s.v6d.io/required: preprocess-data}
       labels:
+        scheduling.k8s.v6d.io/vineyardd-namespace: vineyard-system
+        scheduling.k8s.v6d.io/vineyardd: vineyardd-sample
+        scheduling.k8s.v6d.io/job: train-data
         pipelines.kubeflow.org/kfp_sdk_version: 1.8.21
         pipelines.kubeflow.org/pipeline-sdk-type: kfp
         pipelines.kubeflow.org/enable_caching: "true"
diff --git a/k8s/examples/vineyard-kubeflow/pipeline.py b/k8s/examples/vineyard-kubeflow/pipeline.py
index c58f7ce61..db93de6aa 100644
--- a/k8s/examples/vineyard-kubeflow/pipeline.py
+++ b/k8s/examples/vineyard-kubeflow/pipeline.py
@@ -1,35 +1,47 @@
 from kfp import dsl
 
 def PreProcess(data_multiplier: int, registry: str):
-    return dsl.ContainerOp(
+    op =  dsl.ContainerOp(
         name='Preprocess Data',
         image = f'{registry}/preprocess-data',
-        container_kwargs={'image_pull_policy':"Always"},
+        container_kwargs={
+            'image_pull_policy': "Always",
+        },
+        pvolumes={
+            "/data": dsl.PipelineVolume(pvc="benchmark-data"),
+        },
         command = ['python3', 'preprocess.py'],
         arguments = [f'--data_multiplier={data_multiplier}'],
-        # add the existing volume to the pipeline
-        pvolumes={"/data": dsl.PipelineVolume(pvc="benchmark-data")},
     )
+    return op
 
 def Train(comp1, registry: str):
-    return dsl.ContainerOp(
+    op = dsl.ContainerOp(
         name='Train Data',
         image=f'{registry}/train-data',
-        container_kwargs={'image_pull_policy':"Always"},
+        container_kwargs={
+            'image_pull_policy': "Always",
+        },
+        pvolumes={
+            "/data": comp1.pvolumes['/data'],
+        },
         command = ['python3', 'train.py'],
-        
-        pvolumes={"/data": comp1.pvolumes['/data']},
     )
+    return op
 
 def Test(comp2, registry: str):
-    return dsl.ContainerOp(
+    op = dsl.ContainerOp(
         name='Test Data',
         image=f'{registry}/test-data',
-        container_kwargs={'image_pull_policy':"Always"},
+        container_kwargs={
+            'image_pull_policy': "Always",
+        },
+        pvolumes={
+            "/data": comp2.pvolumes['/data'],
+        },
         command = ['python3', 'test.py'],
-
-        pvolumes={"/data": comp2.pvolumes['/data']},
     )
+    return op
 
 @dsl.pipeline(
    name='Machine Learning Pipeline',
diff --git a/k8s/examples/vineyard-kubeflow/prepare-data.yaml b/k8s/examples/vineyard-kubeflow/prepare-data.yaml
index 9475ff9fd..2c3abffac 100644
--- a/k8s/examples/vineyard-kubeflow/prepare-data.yaml
+++ b/k8s/examples/vineyard-kubeflow/prepare-data.yaml
@@ -53,5 +53,5 @@ spec:
   accessModes:
     - ReadWriteMany
   hostPath:
-    # mount a nfs volume to the kind nodes
+    # mount an NFS volume to the kubernetes nodes
     path: "/mnt/csi-benchmark"
\ No newline at end of file
diff --git a/k8s/examples/vineyard-kubeflow/readme.md b/k8s/examples/vineyard-kubeflow/readme.md
new file mode 100644
index 000000000..a0bc5a584
--- /dev/null
+++ b/k8s/examples/vineyard-kubeflow/readme.md
@@ -0,0 +1,60 @@
+## Use vineyard to accelerate kubeflow pipelines
+
+Compared to existing methods such as local files or S3 services, vineyard can accelerate data sharing by utilizing shared memory. In this doc, we will show you how to use vineyard to accelerate an existing kubeflow pipeline.
+
+
+### Prerequisites
+
+- Install the argo CLI tool via the [official guide](https://github.com/argoproj/argo-workflows/releases/).
+
+
+### Overview of the pipeline
+
+The pipeline we use is a simple pipeline that trains a linear regression model on the dummy Boston Housing Dataset. It contains three steps: preprocess, train, and test.
+
+
+### Run the pipeline
+
+Assume we have installed [kubeflow](https://www.kubeflow.org/docs/components/pipelines/v1/installation/standalone-deployment/#deploying-kubeflow-pipelines) and [vineyard](https://v6d.io/notes/cloud-native/deploy-kubernetes.html#quick-start) in the kubernetes cluster. We can use the following steps to run the pipeline:
+
+First, we need to prepare the dataset by running the following command:
+
+```bash
+$ kubectl apply -f prepare-data.yaml
+```
+
+The dataset will be stored in the host path. Generating it may take a while; you can use the following command to check whether it is ready:
+
+```bash
+$ kubectl logs -l app=prepare-data -n kubeflow | grep "preparing data time" >/dev/null && echo "dataset ready" || echo "dataset unready"
+```
+
+After that, you can run the pipeline via the following command:
+
+```bash
+$ argo submit --watch pipeline-with-vineyard.yaml -p data_multiplier=4000 -p registry="ghcr.io/v6d-io/v6d/kubeflow-example" -n kubeflow
+```
+
+
+### Modifications to use vineyard
+
+To see how the vineyard-enabled pipeline differs from the original kubeflow pipeline, use the following command:
+
+```bash
+$ git diff --no-index --unified=40 pipeline.py pipeline-with-vineyard.py
+```
+
+The main modifications are:
+- Add a new volume to the pipeline. This volume is used to connect to the vineyard cluster via the IPC socket file in
+the host path.
+- Add the scheduler annotations and labels to the pipeline. This is used to schedule the pipeline to the node that has vineyardd running.
+
+You can also check the corresponding modifications of the source code
+as follows.
+
+- [Save data in the preprocessing step](https://github.com/v6d-io/v6d/blob/main/k8s/examples/vineyard-kubeflow/preprocess/preprocess.py#L62-L72).
+- [Load data in the training step](https://github.com/v6d-io/v6d/blob/main/k8s/examples/vineyard-kubeflow/train/train.py#L15-L24).
+- [Load data in the testing step](https://github.com/v6d-io/v6d/blob/main/k8s/examples/vineyard-kubeflow/test/test.py#L14-L20).
+
+The main modification is to use vineyard to load and save data
+rather than using local files.

From f64b70a7ac08976816187bb7761789f0459f11ff Mon Sep 17 00:00:00 2001
From: Ye Cao <caoye.cao@alibaba-inc.com>
Date: Mon, 27 Nov 2023 20:22:03 +0800
Subject: [PATCH 3/4] Use the pypi installation of vineyardctl as recommend
 way.

Signed-off-by: Ye Cao <caoye.cao@alibaba-inc.com>
---
 docs/notes/cloud-native/deploy-kubernetes.rst | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/docs/notes/cloud-native/deploy-kubernetes.rst b/docs/notes/cloud-native/deploy-kubernetes.rst
index 9858050bd..8b02ca434 100644
--- a/docs/notes/cloud-native/deploy-kubernetes.rst
+++ b/docs/notes/cloud-native/deploy-kubernetes.rst
@@ -15,18 +15,13 @@ Install `vineyardctl`_ as follows.
 
 .. code:: bash
 
-    export LATEST_TAG=$(curl -s "https://api.github.com/repos/v6d-io/v6d/tags" | jq -r '.[0].name')
-    export OS=$(uname -s | tr '[:upper:]' '[:lower:]')
-    export ARCH=${$(uname -m)/x86_64/amd64}
-    curl -Lo vineyardctl https://github.com/v6d-io/v6d/releases/download/$LATEST_TAG/vineyardctl-$LATEST_TAG-$OS-$ARCH
-    chmod +x vineyardctl
-    sudo mv vineyardctl /usr/local/bin/
+    pip3 install vineyard
 
 Use the vineyardctl to install vineyard cluster.
 
 .. code:: bash
 
-    vineyardctl install vineyard-cluster --create-namespace
+    python3 -m vineyard.ctl install vineyard-cluster --create-namespace
 
 Also, you could follow the next guide to install vineyard cluster steps
 by steps.

From a9336b9895fb593621d5e6fbcbf8935559702422 Mon Sep 17 00:00:00 2001
From: Ye Cao <caoye.cao@alibaba-inc.com>
Date: Mon, 27 Nov 2023 20:29:50 +0800
Subject: [PATCH 4/4] Fix the wrong descrition of using vineyard operator doc.

Signed-off-by: Ye Cao <caoye.cao@alibaba-inc.com>
---
 docs/tutorials/kubernetes/using-vineyard-operator.rst | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/docs/tutorials/kubernetes/using-vineyard-operator.rst b/docs/tutorials/kubernetes/using-vineyard-operator.rst
index fb7d0e4a8..11f04a46a 100644
--- a/docs/tutorials/kubernetes/using-vineyard-operator.rst
+++ b/docs/tutorials/kubernetes/using-vineyard-operator.rst
@@ -252,7 +252,7 @@ Check the status of all relevant resources managed by the ``vineyardd-sample`` c
 
 .. code:: bash
 
-    $ kubectl get all -l app.kubernetes.io/instance=vineyardd -n vineyard-system
+    $ kubectl get all -l app.kubernetes.io/instance=vineyard-system-vineyardd-sample -n vineyard-system
 
 .. admonition:: Expected output
    :class: admonition-details
@@ -307,11 +307,11 @@ First, let's deploy the Python client on two Vineyard nodes as follows.
           containers:
           - name: vineyard-python
             imagePullPolicy: IfNotPresent
-            image: vineyardcloudnative/vineyard-python:v0.11.4
+            image: python:3.10
             command:
             - /bin/bash
             - -c
-            - sleep infinity
+            - pip3 install vineyard && sleep infinity
             volumeMounts:
             - mountPath: /var/run
               name: vineyard-sock
@@ -341,7 +341,8 @@ Wait for the vineyard python client pod ready.
     .. code:: bash
 
         NAME                                      READY   STATUS    RESTARTS   AGE
-        vineyard-python-client-6fd8c47c98-7btkv   1/1     Running   0          93s
+        vineyard-python-client-6fd84bc897-27glp   1/1     Running   0          93s
+        vineyard-python-client-6fd84bc897-tlb22   1/1     Running   0          93s
 
 Use the kubectl exec command to enter the first vineyard python client pod.