69 changes: 69 additions & 0 deletions examples/training/cogvideox/1x8_baseline.sh
@@ -0,0 +1,69 @@
#!/bin/bash
#PBS -P CFP02-CF-004
#PBS -l select=1:ngpus=8
#PBS -l place=vscatter
#PBS -l walltime=24:00:00
#PBS -j oe
#PBS -o 1x8-cogvideox-baseline.log
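# submit from the repo root with: qsub examples/training/cogvideox/1x8_baseline.sh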

# =============== env params ================
# This script targets NSCC, which uses PBS Pro as its scheduler.

# scratch space; the venv sourced below lives under $HPCTMP
SCRATCH_PATH=$HPCTMP

cd $PBS_O_WORKDIR
echo "JOB ID: $PBS_JOBID, pwd: $PWD, pbs workdir: $PBS_O_WORKDIR"

# for torch.distributed
export NNODES=1
# export NODE_RANK=0
export GPUS_PER_NODE=8
export WORLD_SIZE=$(($NNODES*$GPUS_PER_NODE))
export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | awk -F'.' '{print $1}')
export MASTER_PORT=29502
echo "master node: $MASTER_ADDR"

# used by OpenMPI
export HOSTFILE="$PBS_JOBID.hostfile"
# strip the domain suffix; emit one "host slots=N" line per node
awk -F'.' '{print $1 " slots=" ENVIRON["GPUS_PER_NODE"]}' $PBS_NODEFILE > $HOSTFILE
echo "detected hosts: $(cat $HOSTFILE)"

# refer to: https://apptainer.org/user-docs/master/gpu.html
# for apptainer, replace SINGULARITYENV_* with APPTAINERENV_*
# export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$(printf "%s," $(seq 0 $(($GPUS_PER_NODE-1))) | sed 's/,$//')
# echo "singularity cuda visible devices: $SINGULARITYENV_CUDA_VISIBLE_DEVICES"

# =============== program params ================
export PYTHONPATH=$PYTHONPATH:$PWD
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TOKENIZERS_PARALLELISM=false

# =============== zipf-1 ================
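# one MPI rank per GPU: --np gives the total rank count, -N the ranks per node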
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/baseline.yaml \
--image-mixing-frac 1 --profile-flops
"

# =============== zipf-10 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/baseline.yaml \
--image-mixing-frac 10 --profile-flops
"

# =============== zipf-50 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/baseline.yaml \
--image-mixing-frac 50 --profile-flops
"

rm $HOSTFILE
69 changes: 69 additions & 0 deletions examples/training/cogvideox/1x8_dcp_inter.sh
@@ -0,0 +1,69 @@
#!/bin/bash
#PBS -P CFP02-CF-004
#PBS -l select=1:ngpus=8
#PBS -l place=vscatter
#PBS -l walltime=24:00:00
#PBS -j oe
#PBS -o 1x8-cogvideox-dcp-inter.log

# =============== env params ================
# This script targets NSCC, which uses PBS Pro as its scheduler.

# scratch space; the venv sourced below lives under $HPCTMP
SCRATCH_PATH=$HPCTMP

cd $PBS_O_WORKDIR
echo "JOB ID: $PBS_JOBID, pwd: $PWD, pbs workdir: $PBS_O_WORKDIR"

# for torch.distributed
export NNODES=1
# export NODE_RANK=0
export GPUS_PER_NODE=8
export WORLD_SIZE=$(($NNODES*$GPUS_PER_NODE))
export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | awk -F'.' '{print $1}')
export MASTER_PORT=29502
echo "master node: $MASTER_ADDR"

# used by OpenMPI
export HOSTFILE="$PBS_JOBID.hostfile"
# strip the domain suffix; emit one "host slots=N" line per node
awk -F'.' '{print $1 " slots=" ENVIRON["GPUS_PER_NODE"]}' $PBS_NODEFILE > $HOSTFILE
echo "detected hosts: $(cat $HOSTFILE)"

# refer to: https://apptainer.org/user-docs/master/gpu.html
# for apptainer, replace SINGULARITYENV_* with APPTAINERENV_*
# export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$(printf "%s," $(seq 0 $(($GPUS_PER_NODE-1))) | sed 's/,$//')
# echo "singularity cuda visible devices: $SINGULARITYENV_CUDA_VISIBLE_DEVICES"

# =============== program params ================
export PYTHONPATH=$PYTHONPATH:$PWD
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TOKENIZERS_PARALLELISM=false

# =============== zipf-1 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_inter.yaml \
--image-mixing-frac 1 --profile-flops
"

# # =============== zipf-10 ================
# mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
# singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
# /bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
# python examples/training/cogvideox/train.py \
# examples/training/cogvideox/configs/benchmarks/dcp_inter.yaml \
# --image-mixing-frac 10 --profile-flops
# "

# # =============== zipf-50 ================
# mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
# singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
# /bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
# python examples/training/cogvideox/train.py \
# examples/training/cogvideox/configs/benchmarks/dcp_inter.yaml \
# --image-mixing-frac 50 --profile-flops
# "

rm $HOSTFILE
69 changes: 69 additions & 0 deletions examples/training/cogvideox/1x8_dcp_inter_ckpt.sh
@@ -0,0 +1,69 @@
#!/bin/bash
#PBS -P CFP02-CF-004
#PBS -l select=1:ngpus=8
#PBS -l place=vscatter
#PBS -l walltime=24:00:00
#PBS -j oe
#PBS -o 1x8-cogvideox-dcp-inter-ckpt.log

# =============== env params ================
# This script targets NSCC, which uses PBS Pro as its scheduler.

# scratch space; the venv sourced below lives under $HPCTMP
SCRATCH_PATH=$HPCTMP

cd $PBS_O_WORKDIR
echo "JOB ID: $PBS_JOBID, pwd: $PWD, pbs workdir: $PBS_O_WORKDIR"

# for torch.distributed
export NNODES=1
# export NODE_RANK=0
export GPUS_PER_NODE=8
export WORLD_SIZE=$(($NNODES*$GPUS_PER_NODE))
export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | awk -F'.' '{print $1}')
export MASTER_PORT=29502
echo "master node: $MASTER_ADDR"

# used by OpenMPI
export HOSTFILE="$PBS_JOBID.hostfile"
# strip the domain suffix; emit one "host slots=N" line per node
awk -F'.' '{print $1 " slots=" ENVIRON["GPUS_PER_NODE"]}' $PBS_NODEFILE > $HOSTFILE
echo "detected hosts: $(cat $HOSTFILE)"

# refer to: https://apptainer.org/user-docs/master/gpu.html
# for apptainer, replace SINGULARITYENV_* with APPTAINERENV_*
# export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$(printf "%s," $(seq 0 $(($GPUS_PER_NODE-1))) | sed 's/,$//')
# echo "singularity cuda visible devices: $SINGULARITYENV_CUDA_VISIBLE_DEVICES"

# =============== program params ================
export PYTHONPATH=$PYTHONPATH:$PWD
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TOKENIZERS_PARALLELISM=false

# =============== zipf-1 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_inter_ckpt.yaml \
--image-mixing-frac 1 --profile-flops
"

# =============== zipf-10 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_inter_ckpt.yaml \
--image-mixing-frac 10 --profile-flops
"

# =============== zipf-50 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_inter_ckpt.yaml \
--image-mixing-frac 50 --profile-flops
"

rm $HOSTFILE
69 changes: 69 additions & 0 deletions examples/training/cogvideox/1x8_dcp_intra.sh
@@ -0,0 +1,69 @@
#!/bin/bash
#PBS -P CFP02-CF-004
#PBS -l select=1:ngpus=8
#PBS -l place=vscatter
#PBS -l walltime=24:00:00
#PBS -j oe
#PBS -o 1x8-cogvideox-dcp-intra.log

# =============== env params ================
# This script targets NSCC, which uses PBS Pro as its scheduler.

# scratch space; the venv sourced below lives under $HPCTMP
SCRATCH_PATH=$HPCTMP

cd $PBS_O_WORKDIR
echo "JOB ID: $PBS_JOBID, pwd: $PWD, pbs workdir: $PBS_O_WORKDIR"

# for torch.distributed
export NNODES=1
# export NODE_RANK=0
export GPUS_PER_NODE=8
export WORLD_SIZE=$(($NNODES*$GPUS_PER_NODE))
export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | awk -F'.' '{print $1}')
export MASTER_PORT=29502
echo "master node: $MASTER_ADDR"

# used by OpenMPI
export HOSTFILE="$PBS_JOBID.hostfile"
# strip the domain suffix; emit one "host slots=N" line per node
awk -F'.' '{print $1 " slots=" ENVIRON["GPUS_PER_NODE"]}' $PBS_NODEFILE > $HOSTFILE
echo "detected hosts: $(cat $HOSTFILE)"

# refer to: https://apptainer.org/user-docs/master/gpu.html
# for apptainer, replace SINGULARITYENV_* with APPTAINERENV_*
# export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$(printf "%s," $(seq 0 $(($GPUS_PER_NODE-1))) | sed 's/,$//')
# echo "singularity cuda visible devices: $SINGULARITYENV_CUDA_VISIBLE_DEVICES"

# =============== program params ================
export PYTHONPATH=$PYTHONPATH:$PWD
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TOKENIZERS_PARALLELISM=false

# =============== zipf-1 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_intra.yaml \
--image-mixing-frac 1 --profile-flops
"

# =============== zipf-10 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_intra.yaml \
--image-mixing-frac 10 --profile-flops
"

# =============== zipf-50 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_intra.yaml \
--image-mixing-frac 50 --profile-flops
"

rm $HOSTFILE
50 changes: 50 additions & 0 deletions examples/training/cogvideox/baseline_profile.sh
@@ -0,0 +1,50 @@
#!/bin/bash
#PBS -P CFP02-CF-004
#PBS -l select=1:ngpus=8
#PBS -l place=vscatter
#PBS -l walltime=12:00:00
#PBS -j oe
#PBS -o profile-baseline-cogvideox.log

# =============== env params ================
# This script targets NSCC, which uses PBS Pro as its scheduler.

# scratch space; the venv sourced below lives under $HPCTMP
SCRATCH_PATH=$HPCTMP

cd $PBS_O_WORKDIR
echo "JOB ID: $PBS_JOBID, pwd: $PWD, pbs workdir: $PBS_O_WORKDIR"

# for torch.distributed
export NNODES=1
# export NODE_RANK=0
export GPUS_PER_NODE=8
export WORLD_SIZE=$(($NNODES*$GPUS_PER_NODE))
export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | awk -F'.' '{print $1}')
export MASTER_PORT=9527
echo "master node: $MASTER_ADDR"

# used by OpenMPI
export HOSTFILE="$PBS_JOBID.hostfile"
# strip the domain suffix; emit one "host slots=N" line per node
awk -F'.' '{print $1 " slots=" ENVIRON["GPUS_PER_NODE"]}' $PBS_NODEFILE > $HOSTFILE
echo "detected hosts: $(cat $HOSTFILE)"

# refer to: https://apptainer.org/user-docs/master/gpu.html
# for apptainer, replace SINGULARITYENV_* with APPTAINERENV_*
# export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$(printf "%s," $(seq 0 $(($GPUS_PER_NODE-1))) | sed 's/,$//')
# echo "singularity cuda visible devices: $SINGULARITYENV_CUDA_VISIBLE_DEVICES"

# =============== program params ================
export PYTHONPATH=$PYTHONPATH:$PWD
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TOKENIZERS_PARALLELISM=false

# baseline
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/baseline.yaml
"

rm $HOSTFILE
68 changes: 68 additions & 0 deletions examples/training/cogvideox/configs/benchmarks/baseline.yaml
@@ -0,0 +1,68 @@
zipf_offset: 5
outputs: exp/cogvideox/baseline
profile_path: exp/cogvideox/profile/baseline
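# sp_size: sequence-parallel degree (an assumption: it matches the 8 GPUs per node used by the launch scripts)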
sp_size: 8
dummy_dataset: true
dummy_data_size: 2000
verbose: true
calculate_imbalance: true


# ==== training config ====

# preprocess embedding
data_path: "./assets/example_data/demo_preprocess.csv"
preprocessed_data: true
drop_last: true

# train
ckpt_path: "THUDM/CogVideoX-5b"
grad_checkpoint: true
num_workers: 8
dtype: "bf16"

# log
seed: 42
epochs: 1
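# an effectively infinite interval, i.e. periodic step logging is disabled for the benchmark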
log_every: 1e10

# optimization
grad_clip: 1.0
lr: 1e-8
ema_decay: 0.99
adam_eps: 1e-15
warmup_steps: 10

# data
# image_mixing_frac: 50
num_bucket_build_workers: 16
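# each bucket entry reads {num_frames: [keep_prob, batch_size]}
# (an assumption: this follows the Open-Sora bucket convention)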
bucket_config:
"144p": {1: [1.0, 345], 25: [1.0, 48], 49: [1.0, 25], 73: [1.0, 12], 97: [1.0, 6]}
"240p": {1: [1.0, 128], 25: [1.0, 16], 49: [1.0, 8], 73: [1.0, 4], 97: [1.0, 2]}
"360p": {1: [1.0, 64], 25: [1.0, 7], 49: [1.0, 4], 73: [1.0, 2], 97: [1.0, 1]}
"480p": {1: [1.0, 32], 25: [1.0, 4], 49: [1.0, 2], 73: [1.0, 1], 97: [1.0, 1]}
"720p": {1: [1.0, 14], 25: [1.0, 1], 49: [1.0, 1], 73: [1.0, 1], 97: [1.0, 1]}

# override the default common aspect ratios:
# for the benchmark we use a single aspect ratio per resolution,
# otherwise the data would be spread too sparsely across buckets
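# keys are aspect-ratio labels (height/width, "0.56"); values are [height, width]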
common_ar:
"144p": {"0.56": [144, 256]}
"240p": {"0.56": [240, 426]}
"360p": {"0.56": [360, 640]}
"480p": {"0.56": [480, 720]}
"720p": {"0.56": [720, 1280]}

# mask
mask_ratios: {
"random": 0.01,
"intepolate": 0.002,
"quarter_random": 0.002,
"quarter_head": 0.002,
"quarter_tail": 0.002,
"quarter_head_tail": 0.002,
"image_random": 0.0,
"image_head": 0.22,
"image_tail": 0.005,
"image_head_tail": 0.005,
}
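# the ratios above sum to 0.25; the remaining 0.75 is presumably left unmasked
# (the "identity" case); this is an assumption about the mask generator's default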