prepare Dockerfile for reproduction package and refactor

lfd · Jul 30, 2024 · 8fd0b3a · 8fd0b3a
1 parent abd5203
commit 8fd0b3a
Show file tree

Hide file tree

Showing 91,891 changed files with 5,068 additions and 7,495,341 deletions.
diff --git a/.github/workflows/build-docker.yml b/.github/workflows/build-docker.yml
@@ -0,0 +1,34 @@
+name: Docker Image
+# Builds the reproduction image and pushes it to the container registry configured below
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: qce24_repro
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Build Dockerimage
+      run: docker build -t ${{ env.IMAGE_NAME }}:latest .
+
+    - name: Log in to the Container registry
+      uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
+      with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+    # Docker repository names must be all lowercase, hence "rl_for_jo" in the image path
+    - name: Tag Docker image
+      run: docker image tag ${{ env.IMAGE_NAME }}:latest ${{ env.REGISTRY }}/lfd/rl_for_jo/${{ env.IMAGE_NAME }}:latest
+
+    - name: Push Docker image
+      run: docker image push ${{ env.REGISTRY }}/lfd/rl_for_jo/${{ env.IMAGE_NAME }}:latest
diff --git a/.gitignore b/.gitignore
@@ -5,3 +5,4 @@ queries/
 JOB/
 .python-version
 postgres
+paper/build/
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,115 @@
+FROM ubuntu:22.04
+
+LABEL author="Maja Franz <[email protected]>"
+# Non-interactive apt frontend and a UTF-8 locale for all following instructions
+ENV DEBIAN_FRONTEND=noninteractive \
+    LANG="C.UTF-8" \
+    LC_ALL="C.UTF-8"
+
+# Install required packages. The index refresh and the installs share one layer so
+# that a cached, stale package index is never reused; deadsnakes provides Python 3.9.
+RUN apt-get update && apt-get install -y software-properties-common && \
+    add-apt-repository -y ppa:deadsnakes/ppa && apt-get update && \
+    apt-get install -y \
+    python3.9 \
+    python3-pip \
+    python3.9-distutils \
+    python3.9-dev \
+    wget \
+    git \
+    r-base \
+    libv8-dev \
+    libreadline-dev \
+    zlib1g-dev \
+    texlive-latex-base \
+    texlive-science \
+    texlive-fonts-recommended \
+    texlive-fonts-extra \
+    texlive-publishers \
+    texlive-bibtex-extra \
+    texlive-luatex \
+    biber
+
+# Install R packages for plotting (downloaded from the CRAN mirror over HTTPS)
+RUN R -e "install.packages('tidyverse',dependencies=TRUE, repos='https://cran.rstudio.com/')"
+RUN R -e "install.packages('ggh4x',dependencies=TRUE, repos='https://cran.rstudio.com/')"
+RUN R -e "install.packages('patchwork',dependencies=TRUE, repos='https://cran.rstudio.com/')"
+RUN R -e "install.packages('tikzDevice',dependencies=TRUE, repos='https://cran.rstudio.com/')"
+RUN R -e "install.packages('scales',dependencies=TRUE, repos='https://cran.rstudio.com/')"
+
+# Let Python 3.9 be global python version
+RUN ln -s /usr/bin/python3.9 /usr/bin/python
+
+# Add user; all following instructions run as this unprivileged user
+RUN useradd -m -G sudo -s /bin/bash repro && echo "repro:repro" | chpasswd
+USER repro
+
+# Copy artifacts (from host) to home directory; plain COPY, since nothing has to be fetched or unpacked
+COPY --chown=repro:repro . /home/repro/qce24_repro
+
+WORKDIR /home/repro/postgres
+
+# setup PostgreSQL: version 16.0 is built from source below RLJO_PSQL_BASE
+ENV RLJO_PSQL_BASE=/home/repro/postgres
+
+## PostgreSQL paths: data directory, source tree, install prefix and pg_hint_plan locations
+ENV RLJO_PSQL_DATA_DIRECTORY=$RLJO_PSQL_BASE/database
+ENV RLJO_PSQL_SRC_DIRECTORY="$RLJO_PSQL_BASE/postgresql-16.0"
+ENV RLJO_PSQL_INSTALL_DIRECTORY="$RLJO_PSQL_BASE/install"
+ENV RLJO_PG_HINT_PLAN_BASE="$RLJO_PSQL_BASE/pg_hint_plan"
+ENV RLJO_PG_HINT_PLAN_SRC_DIRECTORY="$RLJO_PG_HINT_PLAN_BASE/pg_hint_plan-REL16_1_6_0"
+
+WORKDIR $RLJO_PSQL_BASE
+RUN wget https://ftp.postgresql.org/pub/source/v16.0/postgresql-16.0.tar.gz
+RUN tar xvfz postgresql-16.0.tar.gz
+RUN mkdir $RLJO_PSQL_INSTALL_DIRECTORY
+RUN mkdir $RLJO_PSQL_DATA_DIRECTORY
+WORKDIR $RLJO_PSQL_SRC_DIRECTORY
+RUN ./configure --prefix=$RLJO_PSQL_INSTALL_DIRECTORY --enable-debug
+RUN make -j $(nproc)
+RUN make install
+WORKDIR $RLJO_PSQL_BASE
+
+ENV PATH=$RLJO_PSQL_BASE/install/bin:$PATH
+ENV LD_LIBRARY_PATH=$RLJO_PSQL_BASE/install/lib/
+# Create the cluster, start it once as a smoke test and stop it again in the same step, so no stale postmaster.pid is left in the image
+RUN initdb -D $RLJO_PSQL_DATA_DIRECTORY
+RUN pg_ctl -D $RLJO_PSQL_DATA_DIRECTORY -l $RLJO_PSQL_BASE/logfile start && pg_ctl -D $RLJO_PSQL_DATA_DIRECTORY stop
+
+## PG hint plugin: download, build and install pg_hint_plan release REL16_1_6_0
+RUN mkdir $RLJO_PG_HINT_PLAN_BASE
+WORKDIR $RLJO_PG_HINT_PLAN_BASE
+RUN wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL16_1_6_0.tar.gz
+RUN tar xvfz REL16_1_6_0.tar.gz
+WORKDIR $RLJO_PG_HINT_PLAN_SRC_DIRECTORY
+RUN make -j $(nproc)
+RUN make install
+# NOTE(review): this directory is already prepended to PATH in the PostgreSQL section; the append looks redundant — confirm
+ENV PATH=$PATH:/home/repro/postgres/install/bin
+
+
+# setup join order benchmark: clone the repository and pin it to commit a2019f9
+WORKDIR /home/repro
+
+ENV RLJO_JOB_BASE=/home/repro/JOB
+
+RUN mkdir $RLJO_JOB_BASE
+RUN git clone -n https://github.com/danolivo/jo-bench $RLJO_JOB_BASE/jo-bench
+
+WORKDIR $RLJO_JOB_BASE/jo-bench
+RUN git checkout a2019f9
+
+WORKDIR /home/repro/qce24_repro
+
+# install python packages; user-level console scripts end up in ~/.local/bin
+ENV PATH=$PATH:/home/repro/.local/bin
+# set default python version to 3.9 for shells that read .bashrc
+RUN echo 'alias python="python3.9"' >> /home/repro/.bashrc
+RUN echo 'alias pip="pip3.9"' >> /home/repro/.bashrc
+# --no-cache-dir keeps pip's download cache out of the image layer
+RUN python3.9 -m pip install --no-cache-dir -r experimental_analysis/requirements.txt
+
+# Experiments can be run, plots can be generated or paper can be built when
+# container is started, see options in README or run script
+ENTRYPOINT ["./scripts/run.sh"]
+CMD ["bash"]
diff --git a/README.md b/README.md
@@ -2,20 +2,85 @@
 
 This repository provides the implementation and artifacts accompanying the following article:
 ```
-@article{franz:23:qrl_jo,
-    author = {Maja Franz AND
-              Tobias Winker AND
-              Sven Groppe AND
-              Wolfgang Mauerer},
-    title  = {Hype or Heuristic? Quantum Reinforcement Learning for Join Order Optimisation},
-    note   = {under review},
+@inproceedings{franz:24:qce24,
+ title     = {Hype or Heuristic? Quantum Reinforcement Learning for Join Order Optimisation},
+ author    = {Maja Franz and Tobias Winker and Sven Groppe and Wolfgang Mauerer},
+ booktitle = {IEEE International Conference on Quantum Computing and Engineering (QCE)},
+ year      = {2024},
+ month     = {09},
+ userd     = {IEEE QCE '24},
 }
 ```
 A preprint is available on [arXiv](https://arxiv.org/abs/2405.07770).
 
+## Content of this Repository
+
+- [`experimental_analysis/`](experimental_analysis/):
+This directory contains all necessary scripts to run a training and evaluation of the classical and multi-step QRL approaches presented in the publication.
+For running experiments with the single-step QML approach, we refer to the [source code](https://github.com/TobiasWinker/QC4DB_QO) accompanying the [related publication](https://doi.org/10.1145/3579142.3594299).
+All result sets from the paper are stored as CSV files in [`experimental_analysis/logs/paper_results`](experimental_analysis/logs/paper_results).
+The tikz plots from the paper can be reproduced with the script [`scripts/generate_plots.sh`](scripts/generate_plots.sh).
+
+- [`info/`](info/):
+This directory contains supplementary material on the classical baseline replication and on specific hyperparameters.
+
+- [`paper/`](paper/):
+This directory contains the article in PDF and its source code in LaTeX.
+
+- [`plots/`](plots/):
+This directory contains the plots from the paper and the source code to generate them in R.
+
+- [`scalability/`](scalability/):
+This directory contains the CSV files used to generate the scalability plots (Figs. 9 and 10), together with the Python scripts that were used to produce these CSV files.
+
+- [`scripts/`](scripts/):
+This directory contains bash scripts, which can either be used as endpoints for the Docker-image, or executed locally.
+Furthermore the setup scripts for PostgreSQL-V16 and the JOB can be found there.
+
+
 ## Setup
 
-### Python packages
+### Docker
+
+#### Get docker image
+Build image:
+
+```docker build -t qce24_repro .```
+
+or pull image:
+
+```docker pull ghcr.io/lfd/rl_for_jo/qce24_repro:latest```
+
+The image does contain an instance of PostgreSQL-V16.
+However, the dataset for the Join Order Benchmark is only installed once a container is run with the corresponding options for the endpoint.
+
+#### Create Container
+
+```docker run --name qce24_repro -it qce24_repro [<-flags>] [<option>]```
+
+The `<option>` specifies which operations are performed on container start.
+
+Available options are:
+* `experiments_classic`: performs the trainings with a classical NN\*
+* `experiments_quantum`: performs the trainings involving a VQC\*
+* `experiments_noise`: performs an evaluation in simulated noisy environments\*
+* `plot`: generates the plots for the paper using R
+* `paper`: generates the full paper from LaTeX
+* `plot_paper`: generates both, plots and the paper
+* `all`: performs all of the above\*
+* `bash` (default): does not perform any operation, but launches an interactive shell
+
+Feel free to define additional `<-flags>`, e.g.:
+* Volume, to keep track of generated files on the host system: `-v $PWD:/home/repro/qce24_repro`
+* Port forwarding to launch TensorBoard on the container to track the training process for RL on the host: `-p 6006:6006`. TensorBoard can be started in the Container with: `tensorboard --logdir experimental_analysis/logs --host 0.0.0.0`
+
+\*Please note the long runtimes for RL trainings (hours to days).
+In addition, the Join Order Benchmark is set up for these options, which can take a further 1-2 hours.
+For quickly inspecting our reproduction package, we recommend to use the option `bash`.
+
+### Local Setup
+
+#### Python packages
 The code can be executed with Python version 3.9.
 For managing multiple python versions it is recommended to use [pyenv](https://github.com/pyenv/pyenv).
 After installing Python 3.9 using pyenv with
@@ -36,15 +101,15 @@ all required python packages can be installed with:
 pip install -r requirements.txt
 ```
 
-### PostgreSQL and the Join Order Benchmark
+#### PostgreSQL and the Join Order Benchmark
 
 For the setup of PostgreSQL-V16.0 and the Join Order Benchmark, we provide two setup scripts:
 ```
 source install_postgres.sh
 source setup_JOB.sh
 ```
 
-## Training
+## Custom Trainings
 
 ``python main.py <configuration_name>``
 
@@ -56,14 +121,3 @@ A training process can be tracked with TensorBoard:
 
 ``tensorboard --logdir logs``
 
-## Additional Material and Hyperparameter Search
-
-Information on the classical baseline replication and on specific hyperparameters can be found in [`info/supplementary.pdf`](info/supplementary.pdf)
-
-## Result Sets
-
-All result sets from the paper are stored as CSV files in logs.
-The tikz plots from the paper can be reproduced with the [`plots/plot.r`](plots/plot.r) script using
-```
-Rscript plots/plot.r
-```
diff --git a/configs/best_params/quantum.py → ...l_analysis/configs/best_params/quantum.py b/configs/best_params/quantum.py → ...l_analysis/configs/best_params/quantum.py
@@ -1,11 +1,9 @@
-from datetime import datetime
-from tensorflow.keras.layers import Dense, Activation, Softmax, concatenate
-from tensorflow.keras.optimizers import Adam, schedules
-from tensorflow.keras import Input, Model
-from tensorflow.keras.utils import plot_model
+from tensorflow.keras.layers import concatenate
+from tensorflow.keras.optimizers import schedules
+from tensorflow.keras import Input
 
 from src.configuration import PPOConfiguration
-from src.envs.join_order.actions import action_regular, action_possible_idx
+from src.envs.join_order.actions import action_regular
 
 ### Configuration
 conf = PPOConfiguration()
@@ -71,4 +69,3 @@
 conf.multistep = multistep
 conf.gather_selectivity_info = selpreds
 conf.take_best_frequency = take_best_frequency
-
diff --git a/configs/database.py → experimental_analysis/configs/database.py b/configs/database.py → experimental_analysis/configs/database.py
@@ -2,7 +2,7 @@
 PGDATABASE = "imdbload" # tpch & imdbload
 PGUSER = "postgres"
 PGPASSWORD = "postgres"
-PGPORT = "5434"
+PGPORT = "5432"
 
 QC4DB_PLUGIN_PORT = 17342
 QC4DB_PLUGIN_HOST = "127.0.0.1"
diff --git a/configs/example.py → experimental_analysis/configs/example.py b/configs/example.py → experimental_analysis/configs/example.py
@@ -77,10 +77,10 @@
 # Critic
 num_qubits = sum(conf.num_inputs) // conf.num_relations
 quantum_critic_layer = VQC_Layer(num_qubits,
-                    num_layers,
+                    4,
                     encoding_ops_rx,
                     layer_ry_rz_cz,
-                    data_reuploading=data_reuploading,
+                    data_reuploading=False,
                     num_actions=1,
                     incremental_data_uploading=True,
                     num_inputs=sum(conf.num_inputs),

diff --git a/...er/classical/COUTCM/basic_rejoin_shift.py → ...er/classical/COUTCM/basic_rejoin_shift.py b/...er/classical/COUTCM/basic_rejoin_shift.py → ...er/classical/COUTCM/basic_rejoin_shift.py
diff --git a/...er/classical/COUTCM/mod_reduced_rejoin.py → ...er/classical/COUTCM/mod_reduced_rejoin.py b/...er/classical/COUTCM/mod_reduced_rejoin.py → ...er/classical/COUTCM/mod_reduced_rejoin.py
diff --git a/...lassical/COUTCM/mod_reduced_rejoin_384.py → ...lassical/COUTCM/mod_reduced_rejoin_384.py b/...lassical/COUTCM/mod_reduced_rejoin_384.py → ...lassical/COUTCM/mod_reduced_rejoin_384.py
diff --git a/...er/classical/PGCM16/basic_rejoin_shift.py → ...er/classical/PGCM16/basic_rejoin_shift.py b/...er/classical/PGCM16/basic_rejoin_shift.py → ...er/classical/PGCM16/basic_rejoin_shift.py
diff --git a/...er/classical/PGCM16/mod_reduced_rejoin.py → ...er/classical/PGCM16/mod_reduced_rejoin.py b/...er/classical/PGCM16/mod_reduced_rejoin.py → ...er/classical/PGCM16/mod_reduced_rejoin.py
diff --git a/...lassical/PGCM16/mod_reduced_rejoin_384.py → ...lassical/PGCM16/mod_reduced_rejoin_384.py b/...lassical/PGCM16/mod_reduced_rejoin_384.py → ...lassical/PGCM16/mod_reduced_rejoin_384.py
diff --git a/...per/classical/PGCM8/basic_rejoin_shift.py → ...per/classical/PGCM8/basic_rejoin_shift.py b/...per/classical/PGCM8/basic_rejoin_shift.py → ...per/classical/PGCM8/basic_rejoin_shift.py
diff --git a/configs/hyper/quantum/COUTCM/base.py → ...ysis/configs/hyper/quantum/COUTCM/base.py b/configs/hyper/quantum/COUTCM/base.py → ...ysis/configs/hyper/quantum/COUTCM/base.py
diff --git a/configs/hyper/quantum/EXTIME/base.py → ...ysis/configs/hyper/quantum/EXTIME/base.py b/configs/hyper/quantum/EXTIME/base.py → ...ysis/configs/hyper/quantum/EXTIME/base.py
diff --git a/configs/hyper/quantum/PGCM16/base.py → ...ysis/configs/hyper/quantum/PGCM16/base.py b/configs/hyper/quantum/PGCM16/base.py → ...ysis/configs/hyper/quantum/PGCM16/base.py
diff --git a/experimental_analysis/configs/noisy_base.py b/experimental_analysis/configs/noisy_base.py
@@ -0,0 +1,50 @@
+from datetime import datetime
+from tensorflow.keras.layers import Dense, Activation, Softmax, concatenate
+from tensorflow.keras.optimizers import Adam, schedules
+from tensorflow.keras import Input, Model
+from tensorflow.keras.utils import plot_model
+
+from src.configuration import PPOConfiguration
+from src.envs.join_order.actions import action_regular, action_possible_idx
+
+### Configuration
+conf = PPOConfiguration()
+conf.pg_cost_model = False
+
+# Environment
+conf.num_relations = 4
+conf.target_num_relations = 4
+conf.num_inputs = [conf.num_relations, conf.num_relations, conf.num_relations**2, conf.num_relations**2]
+conf.num_actions = conf.num_relations * (conf.num_relations -1)
+conf.action_calc = action_regular
+conf.mask = True
+conf.take_best_threshold = .1
+conf.take_best_frequency = 1
+conf.update_dataset_reward_threshold = 0.5
+
+conf.gather_selectivity_info = False
+conf.multistep = True
+
+# Model
+input1 = Input(shape=(conf.num_relations,), name='join_indices')
+input2 = Input(shape=(conf.num_relations,), name='cardinalities')
+input3 = Input(shape=(conf.num_relations**2), name='tree_structure')
+input4 = Input(shape=(conf.num_relations**2), name='join_predicates')
+conf.all_inputs = [input1, input2, input3, input4]
+conf.concat_ins = concatenate(conf.all_inputs)
+
+# Hyperparameter
+conf.total_timesteps = 500000
+conf.num_episodes = 100
+conf.max_num_steps_per_episode = 17
+conf.num_updates = 200
+conf.mini_batchsize = 32
+conf.v_coeff = 1.0
+conf.ent_coeff = 0.01
+conf.clip_ratio = 0.2
+conf.train_iterations = 10
+conf.validate_every = 10
+conf.save_every = 500
+
+conf.lr_start = 3e-4
+conf.lr_duration = 0.9
diff --git a/execute.py → experimental_analysis/execute.py b/execute.py → experimental_analysis/execute.py