diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 9b40fb36c303fdd53fe94b0ab874f6b77e66c20d..e3431f20792505ae0f975e90aef7734516625daf 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -180,3 +180,26 @@ test_keras3cpu.image:
     - if: $CI_PIPELINE_SOURCE == "merge_request_event"
       changes:
         - keras3-cpu/Dockerfile
+
+pytorchgpu.image:  # builds pytorch-gpu/Dockerfile with kaniko and pushes the result
+  stage: build
+  variables:
+    REGISTRY_IMAGE: christofkaufmann/pytorch-gpu-notebook  # repository used for both --destination tags
+  script:  # step 1 writes the registry credentials for kaniko, step 2 builds and pushes
+    - echo "{\"auths\":{\"https://index.docker.io/v1/\":{\"auth\":\"$(printf "%s:%s" "${REGISTRY_USER}" "${REGISTRY_PASSWORD}" | base64 | tr -d '\n')\"}}}" > /kaniko/.docker/config.json
+    - /kaniko/executor
+      --context "dir://${CI_PROJECT_DIR}/pytorch-gpu"
+      --dockerfile "${CI_PROJECT_DIR}/pytorch-gpu/Dockerfile"
+      --target build
+      --build-arg LABEL_CREATED=$(date -u +'%Y-%m-%dT%H:%M:%SZ')
+      --build-arg LABEL_REVISION=${CI_COMMIT_SHA}
+      --destination "${REGISTRY_IMAGE}:${CI_COMMIT_SHORT_SHA}"
+      --destination "${REGISTRY_IMAGE}"
+      --cache=false
+      --compressed-caching=false
+      --use-new-run
+      --snapshot-mode=redo
+  rules:  # run only on the default branch and only when the Dockerfile changed
+    - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
+      changes:
+        - pytorch-gpu/Dockerfile
diff --git a/README.md b/README.md
index d73460e0853a6cb5e6a3087488670409d93f9d1c..34ec9b5527c6d08c0a44db2956c066baa2e3798d 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,18 @@ This is a collection of customized Jupyter images to be used in the JupyterHub r
   - Added Notebook git Puller ([nbgitpuller](https://hub.jupyter.org/nbgitpuller))
   - Provide default home directory contents including VS Code Extensions and settings in `/etc/skel/`. Can be copied on first startup using [post start hook](https://z2jh.jupyter.org/en/stable/jupyterhub/customizing/user-environment.html#about-user-storage-and-adding-files-to-it).
 
+## pytorch-gpu
+
+- Based on: `jupyter/scipy-notebook:latest` ([quay.io](https://quay.io/repository/jupyter/scipy-notebook), [src](https://github.com/jupyter/docker-stacks/tree/main/images/scipy-notebook), [doc](https://jupyter-docker-stacks.readthedocs.io/en/latest/using/selecting.html#jupyter-scipy-notebook))
+- Available as: [`christofkaufmann/pytorch-gpu-notebook:latest`](https://hub.docker.com/repository/docker/christofkaufmann/pytorch-gpu-notebook)
+- Features:
+  - Jupyter enabled
+  - Contains PyTorch with GPU support (requires nvidia-driver)
+  - Contains Hugging Face packages
+  - Added VS Code via code-server as web UI and via VS Code CLI for tunnels
+  - Added Notebook git Puller ([nbgitpuller](https://hub.jupyter.org/nbgitpuller))
+  - Provide default home directory contents including VS Code Extensions and settings in `/etc/skel/`. Can be copied on first startup using [post start hook](https://z2jh.jupyter.org/en/stable/jupyterhub/customizing/user-environment.html#about-user-storage-and-adding-files-to-it).
+
 ## Usage
 
 You can use the image from docker hub or build it locally.
@@ -103,7 +115,7 @@ docker exec -it tf-cpu bash
 docker stop tf-cpu # also removes container, because of the --rm parameter on docker run
 ```
 
-And the same for the GPU images (`tensorflow-gpu-with-vscode` and `keras3-gpu`):
+And the same for the GPU images (`tensorflow-gpu-with-vscode`, `keras3-gpu`, `pytorch-gpu`):
 
 ```bash
 cd tensorflow-gpu-with-vscode
diff --git a/pytorch-gpu/Dockerfile b/pytorch-gpu/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..86914565d1646c04103c2e18301fc4d3cc2ee87b
--- /dev/null
+++ b/pytorch-gpu/Dockerfile
@@ -0,0 +1,263 @@
+# syntax=docker/dockerfile:1
+
+# use scipy-notebook and install torch from PyPI after conda-forge packages
+FROM quay.io/jupyter/scipy-notebook:latest AS build
+
+# fix: https://github.com/hadolint/hadolint/wiki/DL4006
+# fix: https://github.com/koalaman/shellcheck/wiki/SC3014
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+USER root
+
+# install code-server and extensions
+ENV CODE_VERSION=4.93.1
+RUN wget --no-hsts -q https://github.com/coder/code-server/releases/download/v$CODE_VERSION/code-server_${CODE_VERSION}_amd64.deb && \
+    dpkg -i code-server_${CODE_VERSION}_amd64.deb && \
+    rm -f code-server_${CODE_VERSION}_amd64.deb && \
+    code-server --force \
+      --install-extension ms-python.python \
+      --install-extension ms-toolsai.jupyter \
+      --install-extension eamodio.gitlens \
+      --install-extension gitlab.gitlab-workflow && \
+    mkdir -p /usr/local/bin/start-notebook.d && \
+    chown -R ${NB_USER} "/home/${NB_USER}/.config" "/home/${NB_USER}/.local" && \
+    fix-permissions "/home/${NB_USER}"
+
+# install VS Code CLI
+RUN wget --no-hsts -q -O vscode_cli.tar.gz 'https://code.visualstudio.com/sha/download?build=stable&os=cli-alpine-x64' && \
+    tar -xf vscode_cli.tar.gz --directory /usr/local/bin && \
+    rm vscode_cli.tar.gz && \
+    chown ${NB_USER} /usr/local/bin/code && \
+    fix-permissions /usr/local/bin/code
+
+# install some packages
+# NOTE: without --no-update-deps somehow an undefined symbol error in /opt/conda/lib/python3.11/lib-dynload/_sqlite3.cpython-311-x86_64-linux-gnu.so occurs, which crashes jupyter lab.
+USER ${NB_UID}
+RUN mamba install --yes --no-update-deps \
+    # umap + hdbscan
+    'umap-learn' \
+    'hdbscan' \
+    # gradio
+    'gradio' \
+    # optuna + plotly
+    'optuna' \
+    'plotly' \
+    # optional dependencies for pandas for read_html and to support Parquet files
+    'lxml' \
+    'pyarrow' \
+    'pyogrio' \
+    # geopandas
+    'geopandas' \
+    'pysal' \
+    # networkx + pyvis + netgraph
+    'networkx' \
+    'pyvis' \
+    'netgraph' \
+    # dependencies for jupyter-vscode-proxy (?)
+    'rfc3339-validator' \
+    'rfc3986-validator' \
+    'uri-template' \
+    'fqdn' \
+    'webcolors' \
+    'isoduration' \
+    'jsonpointer' \
+    # provide a way for lecturers to share code
+    'nbgitpuller' \
+    # monitor GPUs in terminal
+    'nvtop' \
+    # integration of VS Code in JupyterLab
+    'jupyter-server-proxy' \
+    'jupyter-vscode-proxy' \
+    # some improvements of jupyterlab
+    'jupyterlab_execute_time' \
+    'jupyter-archive' \
+    'jupyter-resource-usage' \
+    # install opencv via fastai, headless version not available in conda-forge
+    'fastai::opencv-python-headless' && \
+    pip install --no-cache-dir --extra-index-url 'https://pypi.nvidia.com' --extra-index-url 'https://download.pytorch.org/whl/cu118' \
+    'transformers' \
+    'diffusers' \
+    'datasets' \
+    'timm' \
+    'torch' \
+    'torchaudio' \
+    'torchvision' && \
+    printf '%s\n' \
+    "" \
+    "# track CPU usage, prevent known bug with prometheus, set default limits if not set by MEM_LIMIT and CPU_LIMIT" \
+    "import os" \
+    "" \
+    "c.ResourceUseDisplay.track_cpu_percent = True" \
+    "c.ResourceUseDisplay.enable_prometheus_metrics = False" \
+    "c.ResourceUseDisplay.mem_limit = int(os.getenv('MEM_LIMIT', os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')))" \
+    "c.ResourceUseDisplay.cpu_limit = float(os.getenv('CPU_LIMIT', os.cpu_count()))" \
+    >> /etc/jupyter/jupyter_notebook_config.py && \
+    mamba clean --all -f -y && \
+    fix-permissions "${CONDA_DIR}" && \
+    fix-permissions "/home/${NB_USER}"
+
+# default settings and extensions in /etc/skel
+USER root
+# activate extensions by default
+COPY plugin.jupyterlab-settings "/home/${NB_USER}/.jupyter/lab/user-settings/@jupyterlab/extensionmanager-extension/"
+RUN chown -R ${NB_USER} "/home/${NB_USER}/.jupyter" && \
+    # history search with Page Up/Down
+    sed -i "s/^# \(.*history-search.*\)/\1/" /etc/inputrc && \
+    # VS Code Python settings
+    mkdir -p "/home/${NB_USER}/.local/share/code-server/User" && \
+    echo -e '{\n    "python.defaultInterpreterPath": "/opt/conda/bin/python",\n    "jupyter.sendSelectionToInteractiveWindow": true\n}' > /home/${NB_USER}/.local/share/code-server/User/settings.json && \
+    mkdir -p "/home/${NB_USER}/.local/share/code-server/Machine" && \
+    cp "/home/${NB_USER}/.local/share/code-server/User/settings.json" "/home/${NB_USER}/.local/share/code-server/Machine/settings.json" && \
+    mkdir -p /etc/skel/.local/share/ && \
+    cp -r "/home/${NB_USER}/.local/share/code-server" /etc/skel/.local/share/ && \
+    # matplotlib cache
+    mkdir -p /etc/skel/.cache && \
+    cp -r "/home/${NB_USER}/.cache/matplotlib" /etc/skel/.cache/ && \
+    # stuff
+    cp -r "/home/${NB_USER}/.conda" "/home/${NB_USER}/.config" "/home/${NB_USER}/.jupyter" /etc/skel/ && \
+    # VS Code extension "gitlab-workflow", default gitlab server: gitlab.cvh-server.de
+    printf '%s\n' \
+    '# Set default gitlab server to gitlab.cvh-server.de' \
+    'relpath=".local/share/code-server/extensions/gitlab.gitlab-workflow-*/extension.js"' \
+    'for abspath in /home/${NB_USER}/${relpath} /etc/skel/${relpath}; do' \
+    '    if [[ -e "${abspath}" ]]; then' \
+    '        sed -i "s_=\"https://gitlab.com\"_=\"https://gitlab.cvh-server.de\"_g" "${abspath}"' \
+    '    fi' \
+    'done' \
+    >> /usr/local/bin/start-notebook.d/fix-gitlab-server.sh && \
+    # source mamba.sh in bash using /etc/profile.d/conda.sh
+    mamba init --system && \
+    chown -R ${NB_USER} "/home/${NB_USER}/.local" && \
+    fix-permissions "/home/${NB_USER}" && \
+    fix-permissions "/etc/skel"
+# remember to copy back in post start hook
+
+
+# override maintainer label from Jupyter docker stacks
+LABEL maintainer="Christof Kaufmann <christof.kaufmann@hs-bochum.de>"
+# OCI annotations, see https://github.com/opencontainers/image-spec/blob/main/annotations.md
+ARG LABEL_CREATED
+ARG LABEL_REVISION=test-build
+LABEL org.opencontainers.image.created=$LABEL_CREATED
+LABEL org.opencontainers.image.authors="Christof Kaufmann <christof.kaufmann@hs-bochum.de>"
+LABEL org.opencontainers.image.source="https://gitlab.cvh-server.de/ckaufmann/gpu-cluster-images"
+LABEL org.opencontainers.image.revision=$LABEL_REVISION
+LABEL org.opencontainers.image.vendor="UAS Bochum"
+LABEL org.opencontainers.image.licenses=BSD-3-Clause
+LABEL org.opencontainers.image.title="Jupyter Notebook PyTorch GPU image"
+LABEL org.opencontainers.image.description="This image includes PyTorch with GPU support, Huggingface, VS Code CLI, code-server and nbgitpuller."
+LABEL org.opencontainers.image.base.name=quay.io/jupyter/scipy-notebook:latest
+
+# switch back to jovyan to avoid accidental container runs as root
+USER ${NB_UID}
+WORKDIR "${HOME}"
+
+
+###################################################################
+######################## Testing the image ########################
+###################################################################
+FROM build AS test
+
+# replace home directory with skel, which is closer to the kubernetes environment
+USER root
+RUN cd / && \
+    rm -r "/home/${NB_USER}" && \
+    cp -r /etc/skel "/home/${NB_USER}" && \
+    fix-permissions "/home/${NB_USER}"
+USER ${NB_UID}
+
+# enable logging for all Jupyter applications
+# The snippet is appended to the Jupyter config; it attaches a file handler
+# (INFO and above, written to /home/jovyan/jupyter.log) to each listed application logger.
+# NOTE: all loggers must share ONE 'loggers' mapping. A Python dict literal that
+# repeats the 'loggers' key silently keeps only the last entry.
+RUN printf '%s\n' \
+    "" \
+    "# Filter out user connection hint message, which is a CRITICAL logger message" \
+    "import logging" \
+    "class UserHintFilter(logging.Filter):" \
+    "    def filter(self, record):" \
+    "        return 'To access the server' not in record.getMessage()" \
+    "" \
+    "c.Application.logging_config = {" \
+    "    'filters': {" \
+    "        'user_hint': {" \
+    "            '()': UserHintFilter," \
+    "        }," \
+    "    }," \
+    "    'formatters': {" \
+    "        'file': {" \
+    "            'format': '%(asctime)s %(levelname)-8s %(name)-15s %(message)s'," \
+    "        }," \
+    "    }," \
+    "    'handlers': {" \
+    "        'file': {" \
+    "            'class': 'logging.FileHandler'," \
+    "            'filters': ['user_hint']," \
+    "            'formatter': 'file'," \
+    "            'level': 'INFO'," \
+    "            'filename': '/home/jovyan/jupyter.log'," \
+    "        }," \
+    "    }," \
+    "    'loggers': {" \
+    "        'Application': {" \
+    "            'level': 'DEBUG'," \
+    "            'handlers': ['console', 'file']," \
+    "        }," \
+    "        'JupyterApp': {" \
+    "            'level': 'DEBUG'," \
+    "            'handlers': ['console', 'file']," \
+    "        }," \
+    "        'ExtensionApp': {" \
+    "            'level': 'DEBUG'," \
+    "            'handlers': ['console', 'file']," \
+    "        }," \
+    "        'LabServerApp': {" \
+    "            'level': 'DEBUG'," \
+    "            'handlers': ['console', 'file']," \
+    "        }," \
+    "        'LabApp': {" \
+    "            'level': 'DEBUG'," \
+    "            'handlers': ['console', 'file']," \
+    "        }," \
+    "        'NotebookApp': {" \
+    "            'level': 'DEBUG'," \
+    "            'handlers': ['console', 'file']," \
+    "        }," \
+    "        'ServerApp': {" \
+    "            'level': 'DEBUG'," \
+    "            'handlers': ['console', 'file']," \
+    "        }," \
+    "    }," \
+    "}" \
+    >> /etc/jupyter/jupyter_notebook_config.py
+
+
+# collect lab logs for 4 sec
+# (start the server as a background job, then stop it with SIGINT via job spec %1)
+RUN start-notebook.sh & sleep 4 && kill -INT %1 && sleep 1
+
+# check log file:
+# (`||` and `&&` are evaluated left to right, so every failing check ends in `false`)
+    # not existing means some error prevents logging → bad
+RUN [[ -f jupyter.log ]] \
+    || ( echo "Log file jupyter.log does not exist!" && false ) \
+    # existing, contains error → bad, print log file
+    && ! grep -qP '(Traceback|ERROR|CRITICAL)' jupyter.log \
+    || ( cat jupyter.log && false )
+
+
+# add some environment variables and collect lab logs again for 4 sec
+# (the first log is moved away so that the second check only sees the new run)
+RUN mv jupyter.log jupyter1.log
+ENV MEM_LIMIT=1000000000 \
+    CPU_LIMIT=16.0
+RUN start-notebook.sh & sleep 4 && kill -INT %1 && sleep 1
+
+# check log file:
+    # not existing means some error prevents logging → bad
+RUN [[ -f jupyter.log ]] \
+    || ( echo "Log file jupyter.log does not exist!" && false ) \
+    # existing, contains error → bad, print log file
+    && ! grep -qP '(Traceback|ERROR|CRITICAL)' jupyter.log \
+    || ( cat jupyter.log && false )
diff --git a/pytorch-gpu/plugin.jupyterlab-settings b/pytorch-gpu/plugin.jupyterlab-settings
new file mode 100644
index 0000000000000000000000000000000000000000..43938cf61322cf74f501e9d6e42858d979557095
--- /dev/null
+++ b/pytorch-gpu/plugin.jupyterlab-settings
@@ -0,0 +1,15 @@
+{
+    // Extension Manager
+    // @jupyterlab/extensionmanager-extension:plugin
+    // Extension manager settings.
+    // *********************************************
+
+    // Disclaimed Status
+    // Whether the user agrees the access to external web services and understands extensions may introduce security risks or contain malicious code that runs on his machine.
+    "disclaimed": true,
+
+    // Enabled Status
+    // Enables extension manager.
+    // WARNING: installing untrusted extensions may be unsafe.
+    "enabled": true
+}
\ No newline at end of file