Skip to content

Commit

Permalink
prepare Dockerfile for reproduction package and refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
majafranz committed Jul 30, 2024
1 parent abd5203 commit 8fd0b3a
Show file tree
Hide file tree
Showing 91,891 changed files with 5,068 additions and 7,495,341 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
34 changes: 34 additions & 0 deletions .github/workflows/build-docker.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Builds the reproduction image for every push and pull request targeting
# main. The image is published to the GitHub Container Registry on pushes only.
name: Docker Image

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: qce24_repro

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

# Least privilege for GITHUB_TOKEN: read the sources, write the package.
permissions:
  contents: read
  packages: write

jobs:

  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Build Dockerimage
        run: docker build -t ${{ env.IMAGE_NAME }}:latest .

      # Publishing is limited to pushes: pull-request runs should only verify
      # that the image builds, and their token may not be allowed to write
      # packages (e.g. for pull requests from forks).
      - name: Log in to the Container registry
        if: github.event_name == 'push'
        uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Docker rejects repository names containing uppercase characters, so
      # the repository path is spelled in lowercase here.
      - name: Tag Docker image
        if: github.event_name == 'push'
        run: docker image tag ${{ env.IMAGE_NAME }}:latest ${{ env.REGISTRY }}/lfd/rl_for_jo/${{ env.IMAGE_NAME }}:latest

      - name: Push Docker image
        if: github.event_name == 'push'
        run: docker image push ${{ env.REGISTRY }}/lfd/rl_for_jo/${{ env.IMAGE_NAME }}:latest
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@ queries/
JOB/
.python-version
postgres
paper/build/
115 changes: 115 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Reproduction image: Ubuntu 22.04 with Python 3.9, R, TeX Live, a PostgreSQL
# 16.0 built from source (including pg_hint_plan), a pinned checkout of the
# jo-bench repository and the artifacts of this repository.
FROM ubuntu:22.04

LABEL author="Maja Franz <[email protected]>"

# Build-time only: keeps apt from prompting without leaking into the
# environment of running containers.
ARG DEBIAN_FRONTEND=noninteractive
ENV LANG="C.UTF-8" \
    LC_ALL="C.UTF-8"

# Install required packages.
# "update" and "install" share one layer so a cached, stale package index is
# never reused; the index is removed again in the same layer.
# The deadsnakes PPA provides Python 3.9.
# NOTE(review): recommended packages are deliberately still installed. The
# PostgreSQL build below needs a compiler toolchain that is not listed
# explicitly and is presumably pulled in as a recommendation of r-base;
# confirm before adding --no-install-recommends.
RUN apt-get update \
    && apt-get install -y software-properties-common \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get install -y \
        biber \
        git \
        libreadline-dev \
        libv8-dev \
        python3-pip \
        python3.9 \
        python3.9-dev \
        python3.9-distutils \
        r-base \
        texlive-bibtex-extra \
        texlive-fonts-extra \
        texlive-fonts-recommended \
        texlive-latex-base \
        texlive-luatex \
        texlive-publishers \
        texlive-science \
        wget \
        zlib1g-dev \
    && rm -rf /var/lib/apt/lists/*

# Install R packages for plotting.
# NOTE(review): install.packages() only warns when a package fails to build,
# so this step can succeed with packages missing; verify that all five load.
RUN R -e "install.packages(c('tidyverse', 'ggh4x', 'patchwork', 'tikzDevice', 'scales'), dependencies=TRUE, repos='https://cran.rstudio.com/')"

# Let Python 3.9 be global python version
RUN ln -s /usr/bin/python3.9 /usr/bin/python

# Add user
RUN useradd -m -G sudo -s /bin/bash repro && echo "repro:repro" | chpasswd
USER repro

# Set default python version to 3.9 for interactive shells. pip is invoked
# through the interpreter, the same way the install step below does it.
RUN echo 'alias python="python3.9"' >> /home/repro/.bashrc \
    && echo 'alias pip="python3.9 -m pip"' >> /home/repro/.bashrc

# setup PostgreSQL
ENV RLJO_PSQL_BASE=/home/repro/postgres

## PostgreSQL
# Variables are only substituted with values from earlier instructions, hence
# the separate ENV for the path derived from RLJO_PG_HINT_PLAN_BASE.
ENV RLJO_PSQL_DATA_DIRECTORY="$RLJO_PSQL_BASE/database" \
    RLJO_PSQL_SRC_DIRECTORY="$RLJO_PSQL_BASE/postgresql-16.0" \
    RLJO_PSQL_INSTALL_DIRECTORY="$RLJO_PSQL_BASE/install" \
    RLJO_PG_HINT_PLAN_BASE="$RLJO_PSQL_BASE/pg_hint_plan"
ENV RLJO_PG_HINT_PLAN_SRC_DIRECTORY="$RLJO_PG_HINT_PLAN_BASE/pg_hint_plan-REL16_1_6_0"

# PostgreSQL binaries/libraries and the per-user pip scripts.
ENV PATH="$RLJO_PSQL_INSTALL_DIRECTORY/bin:$PATH:/home/repro/.local/bin" \
    LD_LIBRARY_PATH="$RLJO_PSQL_INSTALL_DIRECTORY/lib/"

# Create all directories explicitly as user repro. Depending on the builder,
# directories created implicitly by WORKDIR may end up owned by root and would
# then not be writable for the following steps.
RUN mkdir -p "$RLJO_PSQL_INSTALL_DIRECTORY" \
        "$RLJO_PSQL_DATA_DIRECTORY" \
        "$RLJO_PG_HINT_PLAN_BASE"

WORKDIR $RLJO_PSQL_BASE
# Download and unpack in one layer so the tarball does not stay in the image.
RUN wget https://ftp.postgresql.org/pub/source/v16.0/postgresql-16.0.tar.gz \
    && tar xzf postgresql-16.0.tar.gz \
    && rm postgresql-16.0.tar.gz

WORKDIR $RLJO_PSQL_SRC_DIRECTORY
RUN ./configure --prefix="$RLJO_PSQL_INSTALL_DIRECTORY" --enable-debug \
    && make -j "$(nproc)" \
    && make install

WORKDIR $RLJO_PSQL_BASE
# Only initialise the cluster here. A server started inside a RUN step does
# not survive the end of that step and would merely leave a stale
# postmaster.pid in the image, so PostgreSQL has to be started when the
# container runs.
RUN initdb -D "$RLJO_PSQL_DATA_DIRECTORY"

## PG hint plugin
WORKDIR $RLJO_PG_HINT_PLAN_BASE
RUN wget https://github.com/ossc-db/pg_hint_plan/archive/refs/tags/REL16_1_6_0.tar.gz \
    && tar xzf REL16_1_6_0.tar.gz \
    && rm REL16_1_6_0.tar.gz

WORKDIR $RLJO_PG_HINT_PLAN_SRC_DIRECTORY
RUN make -j "$(nproc)" \
    && make install

# setup join order benchmark
ENV RLJO_JOB_BASE=/home/repro/JOB

# Clone without checking out files, then check out the pinned commit.
RUN mkdir -p "$RLJO_JOB_BASE" \
    && git clone -n https://github.com/danolivo/jo-bench "$RLJO_JOB_BASE/jo-bench" \
    && git -C "$RLJO_JOB_BASE/jo-bench" checkout a2019f9

# Add artifacts (from host) to home directory. This happens after the
# expensive PostgreSQL and benchmark steps so that editing the sources does
# not invalidate their cached layers.
COPY --chown=repro:repro . /home/repro/qce24_repro

WORKDIR /home/repro/qce24_repro

# install python packages
RUN python3.9 -m pip install --no-cache-dir -r experimental_analysis/requirements.txt

# Experiments can be run, plots can be generated or paper can be built when
# container is started, see options in README or run script
ENTRYPOINT ["./scripts/run.sh"]
CMD ["bash"]
96 changes: 75 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,85 @@

This repository provides the implementation and artifacts accompanying the following article:
```
@article{franz:23:qrl_jo,
author = {Maja Franz AND
Tobias Winker AND
Sven Groppe AND
Wolfgang Mauerer},
title = {Hype or Heuristic? Quantum Reinforcement Learning for Join Order Optimisation},
note = {under review},
@inproceedings{franz:24:qce24,
title = {Hype or Heuristic? Quantum Reinforcement Learning for Join Order Optimisation},
author = {Maja Franz and Tobias Winker and Sven Groppe and Wolfgang Mauerer},
booktitle = {IEEE International Conference on Quantum Computing and Engineering (QCE)},
year = {2024},
month = {09},
userd = {IEEE QCE '24},
}
```
A preprint is available on [arXiv](https://arxiv.org/abs/2405.07770).

## Content of this Repository

- [`experimental_analysis/`](experimental_analysis/):
This directory contains all necessary scripts to run a training and evaluation of the classical and multi-step QRL approaches presented in the publication.
For running experiments with the single-step QML approach, we refer to the [source code](https://github.com/TobiasWinker/QC4DB_QO) accompanying the [related publication](https://doi.org/10.1145/3579142.3594299).
All result sets from the paper are stored as CSV files in [`experimental_analysis/logs/paper_results`](experimental_analysis/logs/paper_results).
The tikz plots from the paper can be reproduced with the script [`scripts/generate_plots.sh`](scripts/generate_plots.sh).

- [`info/`](info/):
This directory contains supplementary material on the classical baseline replication and on specific hyperparameters.

- [`paper/`](paper/):
This directory contains the article in PDF and its source code in LaTeX.

- [`plots/`](plots/):
This directory contains the plots from the paper and the source code to generate them in R.

- [`scalability/`](scalability/):
This directory contains the CSV files used to generate the scalability plots (Figs. 9 and 10), together with the Python scripts that were used to generate them.

- [`scripts/`](scripts/):
This directory contains bash scripts, which can either be used as entry points for the Docker image or be executed locally.
Furthermore the setup scripts for PostgreSQL-V16 and the JOB can be found there.


## Setup

### Python packages
### Docker

#### Get docker image
Build image:

```docker build -t qce24_repro .```

or pull image:

```docker pull ghcr.io/lfd/rl_for_jo/qce24_repro:latest```

The image contains an instance of PostgreSQL-V16.
However, the dataset for the Join Order Benchmark is only installed once a container is run with one of the corresponding options for the entry point.

#### Create Container

```docker run --name qce24_repro -it qce24_repro [<-flags>] [<option>]```

The `<option>` specifies which operations are performed on container start.

Available options are:
* `experiments_classic`: performs the trainings with a classical NN\*
* `experiments_quantum`: performs the trainings involving a VQC\*
* `experiments_noise`: performs an evaluation in simulated noisy environments\*
* `plot`: generates the plots for the paper using R
* `paper`: generates the full paper from LaTeX
* `plot_paper`: generates both the plots and the paper
* `all`: performs all of the above\*
* `bash` (default): does not perform any operation, but launches an interactive shell

Feel free to define additional `<-flags>`, e.g.:
* Volume, to keep track of generated files on the host system: `-v $PWD:/home/repro/qce24_repro`
* Port forwarding to launch TensorBoard on the container to track the training process for RL on the host: `-p 6006:6006`. TensorBoard can be started in the Container with: `tensorboard --logdir experimental_analysis/logs --host 0.0.0.0`

\*Please note the long runtimes for RL trainings (hours to days).
In addition, the Join Order Benchmark is set up for these options, which takes a further 1-2 hours.
For quickly inspecting our reproduction package, we recommend using the option `bash`.

### Local Setup

#### Python packages
The code can be executed with Python version 3.9.
For managing multiple python versions it is recommended to use [pyenv](https://github.com/pyenv/pyenv).
After installing Python 3.9 using pyenv with
Expand All @@ -36,15 +101,15 @@ all required python packages can be installed with:
pip install -r requirements.txt
```

### PostgreSQL and the Join Order Benchmark
#### PostgreSQL and the Join Order Benchmark

For the setup of PostgreSQL-V16.0 and the Join Order Benchmark, we provide two setup scripts:
```
source install_postgres.sh
source setup_JOB.sh
```

## Training
## Custom Trainings

``python main.py <configuration_name>``

Expand All @@ -56,14 +121,3 @@ A training process can be tracked with TensorBoard:

``tensorboard --logdir logs``

## Additional Material and Hyperparameter Search

Information on the classical baseline replication and on specific hyperparameters can be found in [`info/supplementary.pdf`](info/supplementary.pdf)

## Result Sets

All result sets from the paper are stored as CSV files in logs.
The tikz plots from the paper can be reproduced with the [`plots/plot.r`](plots/plot.r) script using
```
Rscript plots/plot.r
```
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
from datetime import datetime
from tensorflow.keras.layers import Dense, Activation, Softmax, concatenate
from tensorflow.keras.optimizers import Adam, schedules
from tensorflow.keras import Input, Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import concatenate
from tensorflow.keras.optimizers import schedules
from tensorflow.keras import Input

from src.configuration import PPOConfiguration
from src.envs.join_order.actions import action_regular, action_possible_idx
from src.envs.join_order.actions import action_regular

### Configuration
conf = PPOConfiguration()
Expand Down Expand Up @@ -71,4 +69,3 @@
conf.multistep = multistep
conf.gather_selectivity_info = selpreds
conf.take_best_frequency = take_best_frequency

Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
PGDATABASE = "imdbload" # tpch & imdbload
PGUSER = "postgres"
PGPASSWORD = "postgres"
PGPORT = "5434"
PGPORT = "5432"

QC4DB_PLUGIN_PORT = 17342
QC4DB_PLUGIN_HOST = "127.0.0.1"
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,10 @@
# Critic
num_qubits = sum(conf.num_inputs) // conf.num_relations
quantum_critic_layer = VQC_Layer(num_qubits,
num_layers,
4,
encoding_ops_rx,
layer_ry_rz_cz,
data_reuploading=data_reuploading,
data_reuploading=False,
num_actions=1,
incremental_data_uploading=True,
num_inputs=sum(conf.num_inputs),
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
50 changes: 50 additions & 0 deletions experimental_analysis/configs/noisy_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from datetime import datetime
from tensorflow.keras.layers import Dense, Activation, Softmax, concatenate
from tensorflow.keras.optimizers import Adam, schedules
from tensorflow.keras import Input, Model
from tensorflow.keras.utils import plot_model

from src.configuration import PPOConfiguration
from src.envs.join_order.actions import action_regular, action_possible_idx

### Configuration
# Module-level PPO configuration: everything below sets attributes on the
# single `conf` object.
conf = PPOConfiguration()
conf.pg_cost_model = False

# Environment
conf.num_relations = 4
conf.target_num_relations = 4
# Sizes of the four model inputs, in the order of the Input layers below:
# two vectors of length n and two flattened n x n blocks (n = num_relations).
conf.num_inputs = [conf.num_relations, conf.num_relations, conf.num_relations**2, conf.num_relations**2]
# n * (n - 1) is the number of ordered pairs of distinct relations.
conf.num_actions = conf.num_relations * (conf.num_relations -1)
conf.action_calc = action_regular
conf.mask = True
conf.take_best_threshold = .1
conf.take_best_frequency = 1
conf.update_dataset_reward_threshold = 0.5

conf.gather_selectivity_info = False
conf.multistep = True

# Model
input1 = Input(shape=(conf.num_relations,), name='join_indices')
input2 = Input(shape=(conf.num_relations,), name='cardinalities')
# The trailing commas make these real one-element tuples, like the two inputs
# above; `(x)` without the comma is just the integer x, not a shape tuple.
input3 = Input(shape=(conf.num_relations**2,), name='tree_structure')
input4 = Input(shape=(conf.num_relations**2,), name='join_predicates')
conf.all_inputs = [input1, input2, input3, input4]
conf.concat_ins = concatenate(conf.all_inputs)

# Hyperparameter
conf.total_timesteps = 500000
conf.num_episodes = 100
conf.max_num_steps_per_episode = 17
conf.num_updates = 200
conf.mini_batchsize = 32
conf.v_coeff = 1.0
conf.ent_coeff = 0.01
conf.clip_ratio = 0.2
conf.train_iterations = 10
conf.validate_every = 10
conf.save_every = 500

# NOTE(review): presumably the initial learning rate and the fraction of the
# training over which it is scheduled; confirm against the code that reads them.
conf.lr_start = 3e-4
conf.lr_duration = 0.9
File renamed without changes.
Loading

0 comments on commit 8fd0b3a

Please sign in to comment.