From 8fdf7d712a28d817bf6034e4b5686d4a839b723e Mon Sep 17 00:00:00 2001 From: Kaleb Lim Date: Fri, 21 Jun 2024 22:00:04 +0700 Subject: [PATCH] initial commit --- .DS_Store | Bin 0 -> 6148 bytes .devcontainer/Dockerfile | 24 +++++ .devcontainer/devcontainer.json | 8 ++ .dvc/.gitignore | 3 + .dvc/config | 5 + .dvcignore | 3 + .gitignore | 4 + DEVPOD.md | 74 +++++++++++++ README.md | 101 ++++++++++++++++++ data.csv.dvc | 5 + dvc.yaml | 14 +++ dvclive/metrics.json | 3 + dvclive/params.yaml | 1 + .../plots/custom/iris_feature_importance.json | 18 ++++ dvclive/plots/metrics/myMetric.tsv | 2 + requirements-dev.txt | 1 + requirements.txt | 2 + test.py | 23 ++++ 18 files changed, 291 insertions(+) create mode 100644 .DS_Store create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/devcontainer.json create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore create mode 100644 .gitignore create mode 100644 DEVPOD.md create mode 100644 README.md create mode 100644 data.csv.dvc create mode 100644 dvc.yaml create mode 100644 dvclive/metrics.json create mode 100644 dvclive/params.yaml create mode 100644 dvclive/plots/custom/iris_feature_importance.json create mode 100644 dvclive/plots/metrics/myMetric.tsv create mode 100644 requirements-dev.txt create mode 100644 requirements.txt create mode 100644 test.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..fa6a28451f99dd5ad745396b942b0e441c9c3b9c GIT binary patch literal 6148 zcmeH~JqiLr422WjLa^D=avEE~8w{c+@B*S+3KpXE9Nm{61XpVjd4c4cWF{>8ik*#! 
z=;k)8MS2mL!Hu%CFfv8n$w~&f%W1ft&d1B;oM$FUI|J`!w4d7q6`%rCfC^9nD)2)J z*u4##&4Y|ofC^B7rvmnUC~#v}#|2z1n`G3^HlnPLRKT|-v z?S8w#OXc1A_Ig%7WYyLU4*GF~m!AM6b`>w+ZrCrj0Bf=ZQGxMCz-3^d0$)|&0j;7E AZU6uP literal 0 HcmV?d00001 diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..275a4dd --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,24 @@ +FROM python:3.10 + +# Add non-root user +ARG USERNAME=nonroot +RUN groupadd --gid 1000 $USERNAME && \ + useradd --uid 1000 --gid 1000 -m $USERNAME + +## Make sure to reflect new user in PATH +ENV PATH="/home/${USERNAME}/.local/bin:${PATH}" +USER $USERNAME + +## Pip dependencies +# Upgrade pip +RUN pip install --upgrade pip + +# Install production dependencies +COPY --chown=nonroot:1000 requirements.txt /tmp/requirements.txt +RUN pip install -r /tmp/requirements.txt && \ + rm /tmp/requirements.txt + +# Install development dependencies +COPY --chown=nonroot:1000 requirements-dev.txt /tmp/requirements-dev.txt +RUN pip install -r /tmp/requirements-dev.txt && \ + rm /tmp/requirements-dev.txt \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..4f4de35 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,8 @@ +{ + "build": { + "dockerfile": "Dockerfile", + "context": ".." 
+ }, + + "remoteUser": "nonroot" +} \ No newline at end of file diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..007369e --- /dev/null +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + remote = aquaremote +['remote "aquaremote"'] + url = s3://aqua01 + endpointurl = https://sos-de-fra-1.exo.io diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a5e4aef --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.exoscale +.boto +.s3cfg +/data.csv diff --git a/DEVPOD.md b/DEVPOD.md new file mode 100644 index 0000000..5295fd0 --- /dev/null +++ b/DEVPOD.md @@ -0,0 +1,74 @@ +# Getting Started with DevPod + +DevPod is a powerful tool used for creating reproducible developer environments. It enables teams to maintain consistency in their development environments by allowing access to a shared project setup. In this guide, we'll walk you through the setup process for DevPod, which is commonly used at Aqua Research to access projects and run experiments in a standardized environment. + +## What is DevPod? + +DevPod serves two main purposes: + +- Access to the Project: DevPod allows developers to access the project's source code and data, ensuring everyone is working in the same development environment. + +- Running Experiments: It also provides a platform for running experiments and generating models. The results of these experiments can be compared and visualized on Gitea. 
+ +In Aqua Research we use our private server (Hetzner machine) to run the devcontainer instance and ExoScale to manage the actual data access. + +## Prerequisites +Before you can set up and use DevPod, make sure you have the following prerequisites: + +- Docker installation: On the private server Docker has been installed. + +- DevPod Installation: You should have DevPod installed on your local machine. You can find installation instructions at https://devpod.sh/. For this guide, we will only use the CLI of DevPod. + +- SSH Access: You need SSH access with a public key to the private machine where you plan to run DevPod. This is typically set up in advance. + + +## Setup Your DevPod Environment + +### Create a Provider (SSH) +To get started, you need to create a provider that points to our private machine using SSH. Use the following commands: + +```bash +devpod provider add ssh +# enter root@95.217.101.177 +``` + +This command will create a new provider in DevPod (locally), enabling you to connect to our private machine through SSH using your public key. + +## Create a New Workspace +Now, it's time to create a new workspace. This command will open Visual Studio Code and connect it to the DevPod environment inside a Docker container running on our private remote machine. Execute the following command: + +```bash +devpod up --provider ssh git@github.com:gradientzero/aqua-research.git --ide vscode --debug +``` + +Visual Studio Code will open, automatically connecting to the DevPod environment. + + +## Apply Local Exoscale Credentials (Inside DevContainer) + +To access data used in Aqua Research and stored on ExoScale, you need to provide your credentials. 
Create a file named ```.dvc/config.local``` and add the following content (Note: Replace <1password> with your actual access key and secret access key): + +```bash +# create new file: .dvc/config.local +['remote "aquaremote"'] + access_key_id = <1password> + secret_access_key = <1password> +``` +(TODO: haven't found a better way, yet. But at least this has to be done only once) + +Now, you can simply use the ```dvc pull``` command to retrieve remote data into this DevPod instance. + +## How to Connect to a Workspace +If you've already set up a DevPod workspace and need to reconnect, use the following command: + +```bash +# connect to existing devpod on remote machine +devpod up aqua-research --ide vscode --debug +``` +(TODO: not sure how to connect from scratch, yet) + +## (Optional) Use dvclive to Track Experiments + +You can use dvclive, a tool for tracking and visualizing experiments. A new Python file ```test.py``` may have been created for this purpose, which outputs experimental metrics. You can use dvclive to monitor these experiments. + +That's it! You are now set up and ready to work with DevPod for your development and experimentation needs at Aqua Research. diff --git a/README.md b/README.md new file mode 100644 index 0000000..3b3de5c --- /dev/null +++ b/README.md @@ -0,0 +1,101 @@ +# Aqua Predict Research + +Aqua Predict Research Repo. + +AI/ML-based groundwater analysis and prediction solutions. + +## Repo Structure + +* [data](data) - research and development data sets. Data is managed by DVC +* [code](code) - code repo +* [papers](papers) - scientific papers and other information + +## Data Versioning + +Data is managed by [DVC](https://dvc.org/doc). Later [DetaBord](https://detabord.com) will offer more advanced data and AI management. +DVC is built on top of git. This means everything is git managed. Use the normal git workflow to use this repository. DVC adds additional features to manage (large) data files. 
+ +### DVC Setup + +DVC manages data metadata and uses remote data repositories to store the actual data sets. The preferred data storage provider is Exoscale. But this S3 service is not ready yet; in the meantime Azure Blob Storage with a local German zone (west germany) is used. + +Environment (python) +```bash +conda create -n dvc python=3.11 +conda activate dvc +pip install -r requirements.txt +``` + +DVC Version (3.4.0) +```bash +# ensure you have installed DVC version 3.4.0 or higher +dvc --version +``` + +More information for DVC installation: +https://dvc.org/doc/install + + +#### ExoScale (primary S3 storage) + +Follow the installation instructions: https://community.exoscale.com/documentation/storage/quick-start/ +```bash +brew install s3cmd +``` + +Create a config file `~/.s3cfg` with the following content: +```bash +[default] +host_base = sos-de-fra-1.exo.io +host_bucket = %(bucket)s.sos-de-fra-1.exo.io +access_key = $EXO_SOS_KEY +secret_key = $EXO_SOS_SECRET +use_https = True +``` +Both `$EXO_SOS_KEY` and `$EXO_SOS_SECRET` you have to request from us once. host_bucket should stay as above. + +Ensure you have access to ExoScale: +```bash +s3cmd ls +# 2023-07-03 13:40 s3://aqua01 +``` + +Add data remote and use custom ExoScale endpoint: +```bash +dvc remote add -d aquaremote s3://aqua01 --force +dvc remote modify aquaremote endpointurl https://sos-de-fra-1.exo.io +# this will modify the file ".dvc/config" +``` + +DVC requires ExoScale credentials, we will provide them locally only: +```bash +dvc remote modify aquaremote --local access_key_id $EXO_SOS_KEY +dvc remote modify aquaremote --local secret_access_key $EXO_SOS_SECRET +# this will create a new file "config.local" that contains credentials for using ExoScale +``` +Again, both `$EXO_SOS_KEY` and `$EXO_SOS_SECRET` are equal to the values we already stored in `~/.s3cfg` + +Use `dvc push` and `dvc pull` for data handling. 
+ + +#### Azure (alternative S3 storage) + +Azure accounts are managed by Active Directory. Invites shall be sent via email. Contact jb@gradient0.com for help with the accounts. + +Users with access to the aqua01 storage account have the "Storage Blob Data Contributor" role assignment. To access the blob storage, set up the connection via the Azure CLI. + +Install Azure CLI +[https://learn.microsoft.com/en-us/cli/azure/install-azure-cli?source=recommendations](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli?source=recommendations) + +Then, log in to your Azure account +`az login` + +Then add and configure the data remote: + +`dvc remote add -d aquaremote azure://aqua01` + +`dvc remote modify aquaremote account_name 'aqua01'` + +This will use the local Azure CLI config for storage access. + +Use `dvc push` and `dvc pull` for data handling. Refer to the DVC docs (see above) for detailed information. \ No newline at end of file diff --git a/data.csv.dvc b/data.csv.dvc new file mode 100644 index 0000000..d17ad3c --- /dev/null +++ b/data.csv.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 4d572c5f31f0be3a59e43907e996fa4d + size: 9 + hash: md5 + path: data.csv diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..ab5e5bb --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,14 @@ +params: +- dvclive/params.yaml +metrics: +- dvclive/metrics.json +plots: +- dvclive/plots/metrics: + x: step +- dvclive/plots/custom/iris_feature_importance.json: + template: bar_horizontal + x: importance + y: name + title: 'Iris Dataset: Feature Importance' + x_label: Feature Importance + y_label: Feature Name diff --git a/dvclive/metrics.json b/dvclive/metrics.json new file mode 100644 index 0000000..a26b9e2 --- /dev/null +++ b/dvclive/metrics.json @@ -0,0 +1,3 @@ +{ + "myMetric": 543 +} diff --git a/dvclive/params.yaml b/dvclive/params.yaml new file mode 100644 index 0000000..c4f7bf9 --- /dev/null +++ b/dvclive/params.yaml @@ -0,0 +1 @@ +myParam: 123 diff --git 
a/dvclive/plots/custom/iris_feature_importance.json b/dvclive/plots/custom/iris_feature_importance.json new file mode 100644 index 0000000..08c5e81 --- /dev/null +++ b/dvclive/plots/custom/iris_feature_importance.json @@ -0,0 +1,18 @@ +[ + { + "name": "petal_width", + "importance": 0.4 + }, + { + "name": "petal_length", + "importance": 0.33 + }, + { + "name": "sepal_width", + "importance": 0.24 + }, + { + "name": "sepal_length", + "importance": 0.03 + } +] diff --git a/dvclive/plots/metrics/myMetric.tsv b/dvclive/plots/metrics/myMetric.tsv new file mode 100644 index 0000000..0706e46 --- /dev/null +++ b/dvclive/plots/metrics/myMetric.tsv @@ -0,0 +1,2 @@ +step myMetric +0 543 diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..fbd890e --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1 @@ +# no one \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..68e02f6 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +dvc[all]==3.4.0 +dvclive diff --git a/test.py b/test.py new file mode 100644 index 0000000..19fd264 --- /dev/null +++ b/test.py @@ -0,0 +1,23 @@ +from dvclive import Live + +datapoints = [ + {"name": "petal_width", "importance": 0.4}, + {"name": "petal_length", "importance": 0.33}, + {"name": "sepal_width", "importance": 0.24}, + {"name": "sepal_length", "importance": 0.03} +] + +with Live() as live: + live.log_param("myParam", 123) + live.log_metric("myMetric", 543) + + live.log_plot( + "iris_feature_importance", + datapoints, + x="importance", + y="name", + template="bar_horizontal", + title="Iris Dataset: Feature Importance", + y_label="Feature Name", + x_label="Feature Importance" + ) \ No newline at end of file