From 78700964ec0f35ab3c77c95e994768d06d37c048 Mon Sep 17 00:00:00 2001 From: ajkv-google Date: Fri, 16 Jan 2026 04:51:30 +0000 Subject: [PATCH 1/3] Updated dockerfile instructions --- Dockerfile | 24 ++++++++++++++++++++++++ training.md | 31 +++++++++++++++++++++++++++++-- 2 files changed, 53 insertions(+), 2 deletions(-) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..b88cad5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,24 @@ +# Use an official Python 3.11 runtime as a parent image +FROM python:3.11-slim + +# Set the working directory +WORKDIR /app + +# Copy the current directory contents into the container +COPY . /app + +# This tells Python to look in /app for the 'recml' package +ENV PYTHONPATH="${PYTHONPATH}:/app" + +# Install system tools if needed (e.g., git) +RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* + +# Install dependencies +RUN pip install --upgrade pip +RUN pip install -r requirements.txt + +# Force install the specific protobuf version +RUN pip install "protobuf>=6.31.1" --no-deps + +# Default command to run the training script +CMD ["python", "recml/examples/dlrm_experiment_test.py"] diff --git a/training.md b/training.md index cb82670..03894e9 100644 --- a/training.md +++ b/training.md @@ -41,7 +41,7 @@ python dlrm_experiment_test.py If you prefer not to manage a virtual environment or want to deploy this as a container, you can build a Docker image. -## 1. Build the Image +## 1. Create a Dckerfile Create a file named `Dockerfile` in the root of the repository: ```dockerfile @@ -54,6 +54,9 @@ WORKDIR /app # Copy the current directory contents into the container COPY . 
/app +# This tells Python to look in /app for the 'recml' package +ENV PYTHONPATH="${PYTHONPATH}:/app" + # Install system tools if needed (e.g., git) RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* @@ -68,4 +71,28 @@ RUN pip install "protobuf>=6.31.1" --no-deps CMD ["python", "recml/examples/dlrm_experiment_test.py"] ``` -You can use this dockerfile to run the DLRM model experiment from this repo in your own environment. \ No newline at end of file +You can use this dockerfile to run the DLRM model experiment from this repo in your own environment. + +## 2. Build the Image + +Run this command from the root of the repository. It reads the `Dockerfile`, installs all dependencies, and creates a ready-to-run image. + +```bash +docker build -t recml-training . +``` + +## 3. Run the Image + +```bash +docker run --rm --privileged \ + --net=host \ + --ipc=host \ + --name recml-experiment \ + recml-training +``` + +### What is happening here? +* **`--rm`**: Automatically deletes the container after the script finishes to keep your disk clean. +* **`--privileged`**: Grants the container direct access to the host's hardware devices, which is required to see the physical TPU chips. +* **`--net=host`**: Removes the container's network isolation, allowing the script to connect to the TPU runtime listening on local ports (e.g., 8353). +* **`--ipc=host`**: Allows the container to use the host's Shared Memory (IPC), which is critical for high-speed data transfer between the CPU and TPU. 
\ No newline at end of file From 013e42e4814e6917342db8684c3f534645417a14 Mon Sep 17 00:00:00 2001 From: ajkv-google Date: Fri, 16 Jan 2026 18:36:02 +0000 Subject: [PATCH 2/3] Updated formatting --- training.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/training.md b/training.md index 03894e9..a40745f 100644 --- a/training.md +++ b/training.md @@ -6,7 +6,7 @@ This guide explains how to set up the environment and train the HSTU/DLRM models If you are developing on a TPU VM directly, use a virtual environment to avoid conflicts with the system-level Python packages. -#### 1. Prerequisites +### 1. Prerequisites Ensure you have **Python 3.11+** installed. ```bash python3 --version @@ -41,7 +41,7 @@ python dlrm_experiment_test.py If you prefer not to manage a virtual environment or want to deploy this as a container, you can build a Docker image. -## 1. Create a Dckerfile +### 1. Create a Dckerfile Create a file named `Dockerfile` in the root of the repository: ```dockerfile @@ -73,7 +73,7 @@ CMD ["python", "recml/examples/dlrm_experiment_test.py"] You can use this dockerfile to run the DLRM model experiment from this repo in your own environment. -## 2. Build the Image +### 2. Build the Image Run this command from the root of the repository. It reads the `Dockerfile`, installs all dependencies, and creates a ready-to-run image. @@ -81,7 +81,7 @@ Run this command from the root of the repository. It reads the `Dockerfile`, ins docker build -t recml-training . ``` -## 3. Run the Image +### 3. 
Run the Image ```bash docker run --rm --privileged \ From 532eb7b684c726bb1d6835e9cc220d10aa9f183e Mon Sep 17 00:00:00 2001 From: ajkv-google Date: Fri, 16 Jan 2026 19:02:18 +0000 Subject: [PATCH 3/3] Fixed typo --- training.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training.md b/training.md index a40745f..bf36dbc 100644 --- a/training.md +++ b/training.md @@ -41,7 +41,7 @@ python dlrm_experiment_test.py If you prefer not to manage a virtual environment or want to deploy this as a container, you can build a Docker image. -### 1. Create a Dckerfile +### 1. Create a Dockerfile Create a file named `Dockerfile` in the root of the repository: ```dockerfile