diff --git a/Dockerfile b/Dockerfile
new file mode 100644
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,34 @@
+# Use an official Python 3.11 runtime as a parent image
+FROM python:3.11-slim
+
+# Set the working directory
+WORKDIR /app
+
+# Make the 'recml' package under /app importable. The path is set directly
+# instead of appending to ${PYTHONPATH}: no earlier instruction defines that
+# variable, so expanding it would leave a stray empty entry (":/app").
+ENV PYTHONPATH=/app
+
+# Install system tools if needed (e.g., git). --no-install-recommends keeps
+# optional packages out of the image; the apt lists are removed in the same
+# layer so they never reach the final image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends git \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install dependencies before copying the source tree so these layers stay
+# cached until requirements.txt itself changes.
+# NOTE(review): assumes requirements.txt does not point at other files in the
+# repository (e.g. "-r other.txt" or "-e ."); copy those here too if it does.
+COPY requirements.txt ./
+RUN pip install --no-cache-dir --upgrade pip
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Force install the specific protobuf version
+RUN pip install --no-cache-dir "protobuf>=6.31.1" --no-deps
+
+# Copy the current directory contents into the container
+COPY . /app
+
+# Default command to run the training script
+CMD ["python", "recml/examples/dlrm_experiment_test.py"]
diff --git a/training.md b/training.md
--- a/training.md
+++ b/training.md
@@ -6,7 +6,7 @@ This guide explains how to set up the environment and train the HSTU/DLRM models
 
 If you are developing on a TPU VM directly, use a virtual environment to avoid conflicts with the system-level Python packages.
 
-#### 1. Prerequisites
+### 1. Prerequisites
 Ensure you have **Python 3.11+** installed.
 ```bash
 python3 --version
@@ -41,7 +41,7 @@ python dlrm_experiment_test.py
 
 If you prefer not to manage a virtual environment or want to deploy this as a container, you can build a Docker image.
 
-## 1. Build the Image
+### 1. Create a Dockerfile
 Create a file named `Dockerfile` in the root of the repository:
 
 ```dockerfile
@@ -54,6 +54,9 @@ WORKDIR /app
 # Copy the current directory contents into the container
 COPY . /app
 
+# This tells Python to look in /app for the 'recml' package
+ENV PYTHONPATH=/app
+
 # Install system tools if needed (e.g., git)
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
 
@@ -68,4 +71,28 @@ RUN pip install "protobuf>=6.31.1" --no-deps
 CMD ["python", "recml/examples/dlrm_experiment_test.py"]
 ```
 
-You can use this dockerfile to run the DLRM model experiment from this repo in your own environment.
\ No newline at end of file
+You can use this Dockerfile to run the DLRM model experiment from this repo in your own environment. The `Dockerfile` committed at the repository root builds the same environment but installs dependencies before copying the source tree, so rebuilds after code-only changes reuse the cached dependency layers.
+
+### 2. Build the Image
+
+Run this command from the root of the repository. It reads the `Dockerfile`, installs all dependencies, and creates a ready-to-run image.
+
+```bash
+docker build -t recml-training .
+```
+
+### 3. Run the Image
+
+```bash
+docker run --rm --privileged \
+  --net=host \
+  --ipc=host \
+  --name recml-experiment \
+  recml-training
+```
+
+### What is happening here?
+* **`--rm`**: Automatically deletes the container after the script finishes to keep your disk clean.
+* **`--privileged`**: Grants the container direct access to the host's hardware devices, which is required to see the physical TPU chips.
+* **`--net=host`**: Removes the container's network isolation, allowing the script to connect to the TPU runtime listening on local ports (e.g., 8353).
+* **`--ipc=host`**: Allows the container to use the host's Shared Memory (IPC), which is critical for high-speed data transfer between the CPU and TPU.
\ No newline at end of file