From 0b44893a77a3ed483e7663fb5ba41cdc352f52f2 Mon Sep 17 00:00:00 2001 From: Artur Susdorf Date: Wed, 26 Jun 2024 10:52:38 +0200 Subject: [PATCH] inital commit --- .devcontainer.json | 1 + .devcontainer/Dockerfile | 27 +++ .devcontainer/devcontainer.json | 8 + .dvc/.gitignore | 3 + .dvc/config | 5 + .dvcignore | 3 + .gitignore | 8 + README.md | 197 ++++++++++++++++++ code/__init__.py | 0 code/simple.py | 27 +++ data/.gitignore | 1 + data/data.local.txt | 1 + data/super-secret.txt.dvc | 5 + dvc.lock | 13 ++ dvc.yaml | 6 + dvclive/dvc.yaml | 14 ++ dvclive/metrics.json | 3 + dvclive/params.yaml | 1 + .../plots/custom/iris_feature_importance.json | 18 ++ dvclive/plots/metrics/myMetric.tsv | 2 + dvclive/report.html | 64 ++++++ main.py | 11 + requirements-dev.txt | 1 + requirements.txt | 2 + 24 files changed, 421 insertions(+) create mode 100644 .devcontainer.json create mode 100644 .devcontainer/Dockerfile create mode 100644 .devcontainer/devcontainer.json create mode 100644 .dvc/.gitignore create mode 100644 .dvc/config create mode 100644 .dvcignore create mode 100644 .gitignore create mode 100644 README.md create mode 100644 code/__init__.py create mode 100644 code/simple.py create mode 100644 data/.gitignore create mode 100644 data/data.local.txt create mode 100644 data/super-secret.txt.dvc create mode 100644 dvc.lock create mode 100644 dvc.yaml create mode 100644 dvclive/dvc.yaml create mode 100644 dvclive/metrics.json create mode 100644 dvclive/params.yaml create mode 100644 dvclive/plots/custom/iris_feature_importance.json create mode 100644 dvclive/plots/metrics/myMetric.tsv create mode 100644 dvclive/report.html create mode 100644 main.py create mode 100644 requirements-dev.txt create mode 100644 requirements.txt diff --git a/.devcontainer.json b/.devcontainer.json new file mode 100644 index 0000000..909dd6f --- /dev/null +++ b/.devcontainer.json @@ -0,0 +1 @@ +{"image":"mcr.microsoft.com/devcontainers/python:3"} \ No newline at end of file diff --git 
a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..e79ff5f --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.11 + +# Add non-root user +ARG USERNAME=nonroot +RUN groupadd --gid 1000 $USERNAME && \ + useradd --uid 1000 --gid 1000 -m $USERNAME + +## Make sure to reflect new user in PATH +ENV PATH="/home/${USERNAME}/.local/bin:${PATH}" +USER $USERNAME + +## Pip dependencies +# Upgrade pip +RUN pip install --upgrade pip + +# Install production dependencies +COPY --chown=nonroot:1000 requirements.txt /tmp/requirements.txt +RUN pip install -r /tmp/requirements.txt && \ + rm /tmp/requirements.txt + +# Install development dependencies +COPY --chown=nonroot:1000 requirements-dev.txt /tmp/requirements-dev.txt +RUN pip install -r /tmp/requirements-dev.txt && \ + rm /tmp/requirements-dev.txt + +# fix: https://github.com/iterative/dvc/issues/10431 +RUN pip install pygit2==1.14.1 \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..4f4de35 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,8 @@ +{ + "build": { + "dockerfile": "Dockerfile", + "context": ".." + }, + + "remoteUser": "nonroot" +} \ No newline at end of file diff --git a/.dvc/.gitignore b/.dvc/.gitignore new file mode 100644 index 0000000..528f30c --- /dev/null +++ b/.dvc/.gitignore @@ -0,0 +1,3 @@ +/config.local +/tmp +/cache diff --git a/.dvc/config b/.dvc/config new file mode 100644 index 0000000..73fba96 --- /dev/null +++ b/.dvc/config @@ -0,0 +1,5 @@ +[core] + remote = detabord-demo-remote +['remote "detabord-demo-remote"'] + url = s3://detabord-demo + endpointurl = https://sos-at-vie-1.exo.io diff --git a/.dvcignore b/.dvcignore new file mode 100644 index 0000000..5197305 --- /dev/null +++ b/.dvcignore @@ -0,0 +1,3 @@ +# Add patterns of files dvc should ignore, which could improve +# the performance. 
Learn more at +# https://dvc.org/doc/user-guide/dvcignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3eabaa4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +.exoscale +.boto +.s3cfg diff --git a/README.md b/README.md new file mode 100644 index 0000000..345ab19 --- /dev/null +++ b/README.md @@ -0,0 +1,197 @@ +# Detabord Template + +Detabord template Repo. + +## Repo Structure + +* [data](data) - Data sets. Data is managed by DVC +* [code](code) - Code repo + +## Data Versioning + +Data is managed by [DVC](https://dvc.org/doc). DVC is a version control system for data sets. It is used to track changes in data sets and to share data sets between team members. DVC is built on top of git. This means everything is git managed. Use the normal git workflow to use this repository. DVC adds additional features to manage (large) data files. With DVC you can easily track your experiments and their progress by only instrumenting your code, and collaborate on ML experiments like software engineers do for code. + +## Setup Environment + +Create and activate your python environment first: +```bash +conda create -n my-env python=3.11 +conda activate my-env +``` + +Use the package manager pip to install dependencies: +```bash +pip install -r requirements.txt +``` + +Ensure you have installed DVC version 3.4.0 or higher: +```bash +dvc --version +``` + +More information for DVC installation: +https://dvc.org/doc/install + + +## Setup S3 storage credentials (ExoScale Demo Bucket) + +Follow the installation instructions: https://community.exoscale.com/documentation/storage/quick-start/ +We prefer to use the CLI tool `s3cmd`. 
Install it with: + +```bash +brew install s3cmd +``` + +Create a config file `~/.s3cfg` with the following content: +```bash +[default] +host_base = sos-at-vie-1.exo.io +host_bucket = %(bucket)s.sos-at-vie-1.exo.io +access_key = PLEASE_REQUEST_YOUR_API_ACCESS_KEY +secret_key = PLEASE_REQUEST_YOUR_API_ACCESS_SECRET +use_https = True +``` + +You have to request both `PLEASE_REQUEST_YOUR_API_ACCESS_KEY` and `PLEASE_REQUEST_YOUR_API_ACCESS_SECRET` from the Gradient0 Team. Host Base and Host Bucket should stay as above. + +Ensure you have access to ExoScale's bucket called `detabord-demo`: +```bash +s3cmd info s3://detabord-demo +# s3://detabord-demo/ (bucket): +# Location: at-vie-1 +# Payer: BucketOwner +# Expiration Rule: none +# Policy: none +# CORS: +# ACL: gradient-zero-softwareentwicklungsgmbh: FULL_CONTROL +``` + + +## Connect S3 bucket to DVC + +### Already prepared in this repo + +Initializing DVC in this project (already done in this repo): +```bash +# dvc init +``` + +Create new remote in DVC and use custom ExoScale endpoint: +```bash +# dvc remote add -d detabord-demo-remote s3://detabord-demo --force +# dvc remote modify detabord-demo-remote endpointurl https://sos-at-vie-1.exo.io +# this will modify the file ".dvc/config" +``` + +### Provide your credentials locally + +DVC requires ExoScale credentials. We provide them locally so that they are not committed to GitHub: +```bash +dvc remote modify detabord-demo-remote --local access_key_id PLEASE_REQUEST_YOUR_API_ACCESS_KEY +dvc remote modify detabord-demo-remote --local secret_access_key PLEASE_REQUEST_YOUR_API_ACCESS_SECRET +# this will create a new file "config.local" that contains credentials for using ExoScale +``` +Again, both `PLEASE_REQUEST_YOUR_API_ACCESS_KEY` and `PLEASE_REQUEST_YOUR_API_ACCESS_SECRET` are the same values that are already stored in `~/.s3cfg`. + + +## Pushing data with DVC + +If you want to work with data, please follow the instructions: https://dvc.org/doc/start/data-management/data-versioning + 
+For reference, this is how `data/super-secret.txt` was added to DVC and uploaded to the bucket: + +```bash +dvc add data/super-secret.txt +git add data/.gitignore data/super-secret.txt.dvc +dvc push +# 1 file pushed +``` + +## Pulling data with DVC + +```bash +dvc pull +``` + + +## Prepare Remote Execution + +Make sure that a remote machine instance is already connected to your organization in Detabord. If not, please follow the instructions: + +### Generate a new SSH key pair for the remote machine + +```bash +# context: local machine +ssh-keygen -t ed25519 -C "remote@machine.com" -f org-key +# no passphrase +``` + +Copy content to Detabord - SSH Key for Organizations. + +Connect to the remote machine and add the public key to the authorized keys: + +```bash +# context: remote machine +nano /root/.ssh/authorized_keys +# add the content of org-key.pub +``` + +### Add Remote Machine + +Add Remote Machine in Detabord at "Machine: Machine for Organizations": + +```bash +Name: remote-machine +user : root +SSH Key: select org-key +Host: 95.217.101.177 +Port: 22 +``` + +### Provide Remote Machine Access to Repo + +In Detabord, go to User's Settings > Applications. A new user can also be created in whose name the remote machine can access and commit to the repository. For the sake of simplicity, the current user is used here. However, it is recommended to create a dedicated user for this. 
def run_simple_experiment():
    """Log one demo parameter, two demo metrics and a bar plot through DVCLive.

    Every value is a hard-coded sample; the function takes no input and
    returns nothing. All logging happens inside a single ``Live`` context.
    """
    # (feature name, importance) pairs, in the order they are plotted.
    importances = (
        ("petal_width", 0.4),
        ("petal_length", 0.33),
        ("sepal_width", 0.24),
        ("sepal_length", 0.03),
    )
    datapoints = [
        {"name": feature, "importance": weight}
        for feature, weight in importances
    ]

    # Keyword options forwarded unchanged to ``log_plot``.
    plot_options = {
        "x": "importance",
        "y": "name",
        "template": "bar_horizontal",
        "title": "Iris Dataset: Feature Importance",
        "y_label": "Feature Name",
        "x_label": "Feature Importance",
    }

    with Live() as tracker:
        tracker.log_param("myParam", 123)
        tracker.log_metric("myMetric", 543)
        tracker.log_metric("new_metric", 333)
        tracker.log_plot("iris_feature_importance", datapoints, **plot_options)
b/data/data.local.txt new file mode 100644 index 0000000..b68d139 --- /dev/null +++ b/data/data.local.txt @@ -0,0 +1 @@ +This data is fully visible in the repository \ No newline at end of file diff --git a/data/super-secret.txt.dvc b/data/super-secret.txt.dvc new file mode 100644 index 0000000..edc8e6d --- /dev/null +++ b/data/super-secret.txt.dvc @@ -0,0 +1,5 @@ +outs: +- md5: 5ebe2294ecd0e0f08eab7690d2a6ee69 + size: 6 + hash: md5 + path: super-secret.txt diff --git a/dvc.lock b/dvc.lock new file mode 100644 index 0000000..80c215a --- /dev/null +++ b/dvc.lock @@ -0,0 +1,13 @@ +schema: '2.0' +stages: + simple_run: + cmd: python main.py + deps: + - path: code/simple.py + hash: md5 + md5: 8647c8d1057de1a1cba9ceb2d0bb7d5a + size: 724 + - path: data/super-secret.txt + hash: md5 + md5: 5ebe2294ecd0e0f08eab7690d2a6ee69 + size: 6 diff --git a/dvc.yaml b/dvc.yaml new file mode 100644 index 0000000..9c8ed61 --- /dev/null +++ b/dvc.yaml @@ -0,0 +1,6 @@ +stages: + simple_run: + cmd: python main.py + deps: + - code/simple.py + - data/super-secret.txt diff --git a/dvclive/dvc.yaml b/dvclive/dvc.yaml new file mode 100644 index 0000000..6d035cb --- /dev/null +++ b/dvclive/dvc.yaml @@ -0,0 +1,14 @@ +params: +- params.yaml +metrics: +- metrics.json +plots: +- plots/metrics: + x: step +- plots/custom/iris_feature_importance.json: + template: bar_horizontal + x: importance + y: name + title: 'Iris Dataset: Feature Importance' + x_label: Feature Importance + y_label: Feature Name diff --git a/dvclive/metrics.json b/dvclive/metrics.json new file mode 100644 index 0000000..a26b9e2 --- /dev/null +++ b/dvclive/metrics.json @@ -0,0 +1,3 @@ +{ + "myMetric": 543 +} diff --git a/dvclive/params.yaml b/dvclive/params.yaml new file mode 100644 index 0000000..c4f7bf9 --- /dev/null +++ b/dvclive/params.yaml @@ -0,0 +1 @@ +myParam: 123 diff --git a/dvclive/plots/custom/iris_feature_importance.json b/dvclive/plots/custom/iris_feature_importance.json new file mode 100644 index 0000000..08c5e81 --- 
/dev/null +++ b/dvclive/plots/custom/iris_feature_importance.json @@ -0,0 +1,18 @@ +[ + { + "name": "petal_width", + "importance": 0.4 + }, + { + "name": "petal_length", + "importance": 0.33 + }, + { + "name": "sepal_width", + "importance": 0.24 + }, + { + "name": "sepal_length", + "importance": 0.03 + } +] diff --git a/dvclive/plots/metrics/myMetric.tsv b/dvclive/plots/metrics/myMetric.tsv new file mode 100644 index 0000000..0706e46 --- /dev/null +++ b/dvclive/plots/metrics/myMetric.tsv @@ -0,0 +1,2 @@ +step myMetric +0 543 diff --git a/dvclive/report.html b/dvclive/report.html new file mode 100644 index 0000000..11522dd --- /dev/null +++ b/dvclive/report.html @@ -0,0 +1,64 @@ + + + + + DVC Plot + + + + + + + + + + +
+

params_yaml

+
+ + + + + + + +
myParam
123
+
+
+ +
+

metrics_json

+
+ + + + + + + +
myMetric
543
+
+
+ +
+ +
+ + +
+ +
def main():
    """Run the demo experiment, printing a banner before and after it."""
    start_banner = "Running main..."
    done_banner = "Running done!"

    print(start_banner)
    run_simple_experiment()
    print(done_banner)


# Only execute when invoked as a script (e.g. `python main.py`), not on import.
if __name__ == "__main__":
    main()