
Commit

initial commit
mats-claassen committed Sep 13, 2023
1 parent 2583a2a commit a7a19d6
Showing 23 changed files with 8,062 additions and 1 deletion.
7 changes: 7 additions & 0 deletions .flake8
@@ -0,0 +1,7 @@
[flake8]
max-line-length = 130
ignore = W503
per-file-ignores =
__init__.py: F401
exclude =
.git,__pycache__,.ipynb_checkpoints,models/,dataset/
150 changes: 150 additions & 0 deletions .gitignore
@@ -0,0 +1,150 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# Custom
/datasets/
/models/
clearml.conf
*.parquet

# Intellij
.idea/

# .DS_Store
.DS_Store
8 changes: 8 additions & 0 deletions Dockerfile
@@ -0,0 +1,8 @@
FROM tensorflow/tensorflow:2.13.0-gpu-jupyter

WORKDIR /app
COPY requirements.txt /app

RUN pip install -r requirements.txt

ENTRYPOINT cd src && jupyter notebook --ip=0.0.0.0
9 changes: 9 additions & 0 deletions LICENSE
@@ -0,0 +1,9 @@
The MIT License (MIT)

Copyright (c) 2023 XMARTLABS

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 changes: 30 additions & 1 deletion README.md
@@ -1 +1,30 @@
# time-series-playground
# Time Series Playground

Welcome to Xmartlabs' time series playground. This repository contains scripts and code to train time series models on weather datasets.

## Instructions

* Download the Jena Climate dataset by running:

```bash
./download_jena_dataset.sh
```

* Build the Docker image:

```bash
./build.sh
```

* Start the Docker container running the Jupyter Notebook server:

```bash
./start.sh
```

* Follow the instructions printed in the terminal to access the notebook in your browser.


## ClearML experiment tracking

If you use the ClearML tracker, make sure your $HOME/clearml.conf file is configured correctly and create a $HOME/.clearml folder, which will store caches and other ClearML data.
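
A minimal `clearml.conf` sketch (hypothetical example, not part of this repository; the server URLs are the hosted ClearML defaults and the credentials are placeholders, and running `clearml-init` can generate this file for you):

```
api {
    web_server: https://app.clear.ml
    api_server: https://api.clear.ml
    files_server: https://files.clear.ml
    credentials {
        "access_key" = "YOUR_ACCESS_KEY"
        "secret_key" = "YOUR_SECRET_KEY"
    }
}
```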
2 changes: 2 additions & 0 deletions build.sh
@@ -0,0 +1,2 @@
#!/bin/bash
docker build -t time_series_playground .
17 changes: 17 additions & 0 deletions download_jena_dataset.sh
@@ -0,0 +1,17 @@
#!/bin/bash

# constants
DIR=datasets/jena_climate/
DATASET_URL="https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip"

mkdir -p $DIR
wget -P $DIR $DATASET_URL
cd $DIR
unzip jena_climate_2009_2016.csv.zip

# Clean up
rm jena_climate_2009_2016.csv.zip
rm -rf __MACOSX/

echo "Goodbye! Here goes a joke:"
curl -s https://api.chucknorris.io/jokes/random?category=dev | jq -r '.value'
6 changes: 6 additions & 0 deletions requirements.txt
@@ -0,0 +1,6 @@
clearml
keras
matplotlib
pandas
scikit-learn
seaborn
Empty file added src/__init__.py
Empty file.
36 changes: 36 additions & 0 deletions src/dataset_loader.py
@@ -0,0 +1,36 @@
import os
import pandas as pd
from clearml import Dataset


class DatasetLoader:
    """Abstract class for loading datasets from different sources (local, ClearML, or another tracker)."""

def get_dataset_folder(self, dataset_project, dataset_name):
        raise NotImplementedError()


class LocalDatasetLoader(DatasetLoader):

def get_dataset_folder(self, dataset_project, dataset_name):
return f"data/{dataset_name}"


class ClearMLDatasetLoader(DatasetLoader):

def get_dataset_folder(self, dataset_project, dataset_name):
return Dataset.get(dataset_project=dataset_project, dataset_name=dataset_name).get_local_copy()


class JenaDatasetLoader(ClearMLDatasetLoader):
    project = 'Time Series PG'
    dataset = 'jena_climate'
    data_folder = None

def load(self):
self.data_folder = self.get_dataset_folder(self.project, self.dataset)

def get_data(self):
assert self.data_folder is not None, "You must call `load` before reading files"
return pd.read_csv(os.path.join(self.data_folder, "jena_climate_2009_2016.csv"))
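
For reference, a usage sketch for the loaders above (hypothetical, not part of the commit; it assumes ClearML is configured and a `jena_climate` dataset is registered under the `Time Series PG` project, and that the repository root is on the Python path):

```python
from src.dataset_loader import JenaDatasetLoader, LocalDatasetLoader

loader = JenaDatasetLoader()
loader.load()              # resolves the ClearML dataset to a local folder
df = loader.get_data()     # pandas DataFrame with the Jena climate CSV
print(df.shape)

# For purely local data, LocalDatasetLoader maps a dataset name to data/<dataset_name>.
print(LocalDatasetLoader().get_dataset_folder("Time Series PG", "jena_climate"))
```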
Empty file added src/models/__init__.py
Empty file.
67 changes: 67 additions & 0 deletions src/models/single_step_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import tensorflow as tf
from src.models.time_series_model import TimeSeriesModel


class Baseline(tf.keras.Model):
def __init__(self, label_index=None):
super().__init__()
self.label_index = label_index

def call(self, inputs):
if self.label_index is None:
return inputs
result = inputs[:, :, self.label_index]
return result[:, :, tf.newaxis]


class LinearModel(TimeSeriesModel):
def build_model(self, **kwargs):
self.model = tf.keras.Sequential([
tf.keras.layers.Dense(units=1)
])


class DenseModel(TimeSeriesModel):
def build_model(self, **kwargs):
self.model = tf.keras.Sequential([
tf.keras.layers.Dense(units=64, activation='relu'),
tf.keras.layers.Dense(units=64, activation='relu'),
tf.keras.layers.Dense(units=1)
])


class MultiStepDense(TimeSeriesModel):
def build_model(self, **kwargs):
self.model = tf.keras.Sequential([
# Shape: (time, features) => (time*features)
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(units=32, activation='relu'),
tf.keras.layers.Dense(units=32, activation='relu'),
tf.keras.layers.Dense(units=1),
# Add back the time dimension.
# Shape: (outputs) => (1, outputs)
tf.keras.layers.Reshape([1, -1]),
])


class ConvModel(TimeSeriesModel):
def build_model(self, **kwargs):
kernel_size = kwargs.get('conv_width', 3)
self.model = tf.keras.Sequential([
tf.keras.layers.Conv1D(filters=32,
kernel_size=(kernel_size,),
activation='relu'),
tf.keras.layers.Dense(units=32, activation='relu'),
tf.keras.layers.Dense(units=1),
])


class RNNModel(TimeSeriesModel):
def build_model(self, **kwargs):
self.model = tf.keras.models.Sequential([
# Shape [batch, time, features] => [batch, time, lstm_units]
tf.keras.layers.LSTM(32, return_sequences=True),
# Shape => [batch, time, features]
tf.keras.layers.Dense(units=1)
])

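A quick, hypothetical shape check for the models above (not part of the commit; it assumes the repository root is on the Python path, matching the module's own `src.` imports, and that the input has 14 features as in the raw Jena climate columns):

```python
import tensorflow as tf
from src.models.single_step_models import ConvModel

conv = ConvModel(tracker=None)   # the tracker is unused when only building the model
conv.build_model(conv_width=3)

# Dummy batch: 4 windows of 6 timesteps with 14 features.
dummy = tf.zeros([4, 6, 14])
print(conv.model(dummy).shape)   # (4, 4, 1): the conv layer shrinks time by conv_width - 1
```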
30 changes: 30 additions & 0 deletions src/models/time_series_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import tensorflow as tf


class TimeSeriesModel:

model = None

def __init__(self, tracker):
self.tracker = tracker

    # Subclasses build self.model here; hyperparameters (e.g. hidden layer sizes,
    # l2_param, dropout_factor, bias_regularizer) can be passed as keyword arguments.
    def build_model(self, **kwargs):
        raise NotImplementedError()

def compile_and_fit(self, window, patience=2, epochs=20):
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
patience=patience,
mode='min')

self.model.compile(loss=tf.keras.losses.MeanSquaredError(),
optimizer=tf.keras.optimizers.Adam(),
metrics=[tf.keras.metrics.MeanAbsoluteError()])

history = self.model.fit(window.train, epochs=epochs,
validation_data=window.val,
callbacks=[early_stopping])
return history

def predict(self, batch_generator):
pass
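
For reference, a minimal training sketch using `compile_and_fit` (hypothetical, not part of the commit): in the real workflow `window` would be a window generator exposing `.train` and `.val` tf.data datasets; here a toy stand-in is fabricated just to show the call sequence.

```python
import tensorflow as tf
from types import SimpleNamespace
from src.models.single_step_models import DenseModel

# Fabricated stand-in for the window generator: random (inputs, labels) batches
# shaped (batch, time, features) -> (batch, time, 1).
def toy_dataset(samples=128, time=1, features=14, batch=32):
    x = tf.random.normal([samples, time, features])
    y = tf.random.normal([samples, time, 1])
    return tf.data.Dataset.from_tensor_slices((x, y)).batch(batch)

window = SimpleNamespace(train=toy_dataset(), val=toy_dataset(samples=32))

dense = DenseModel(tracker=None)   # no experiment tracker for this dry run
dense.build_model()
history = dense.compile_and_fit(window, patience=2, epochs=3)
print(min(history.history["val_loss"]))
```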
Empty file added src/notebooks/__init__.py
Empty file.
