69 changes: 69 additions & 0 deletions examples/training/cogvideox/1x8_baseline.sh
@@ -0,0 +1,69 @@
#!/bin/bash
#PBS -P CFP02-CF-004
#PBS -l select=1:ngpus=8
#PBS -l place=vscatter
#PBS -l walltime=24:00:00
#PBS -j oe
#PBS -o 1x8-cogvideox-baseline.log
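# submit from the repo root with: qsub examples/training/cogvideox/1x8_baseline.sh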

# =============== env params ================
# This script targets NSCC, which uses PBS Pro as its scheduler.

# scratch space; the venv sourced below lives under $HPCTMP
SCRATCH_PATH=$HPCTMP

cd $PBS_O_WORKDIR
echo "JOB ID: $PBS_JOBID, pwd: $PWD, pbs workdir: $PBS_O_WORKDIR"

# for torch.distributed
export NNODES=1
# export NODE_RANK=0
export GPUS_PER_NODE=8
export WORLD_SIZE=$(($NNODES*$GPUS_PER_NODE))
export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | awk -F'.' '{print $1}')
export MASTER_PORT=29502
echo "master node: $MASTER_ADDR"

# used by OpenMPI
export HOSTFILE="$PBS_JOBID.hostfile"
# strip the domain suffix; emit one "host slots=N" line per node
awk -F'.' '{print $1 " slots=" ENVIRON["GPUS_PER_NODE"]}' $PBS_NODEFILE > $HOSTFILE
echo "detected hosts: $(cat $HOSTFILE)"

# refer to: https://apptainer.org/user-docs/master/gpu.html
# for apptainer, replace SINGULARITYENV_* with APPTAINERENV_*
# export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$(printf "%s," $(seq 0 $(($GPUS_PER_NODE-1))) | sed 's/,$//')
# echo "singularity cuda visible devices: $SINGULARITYENV_CUDA_VISIBLE_DEVICES"

# =============== program params ================
export PYTHONPATH=$PYTHONPATH:$PWD
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TOKENIZERS_PARALLELISM=false

# =============== zipf-1 ================
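# one MPI rank per GPU: --np gives the total rank count, -N the ranks per node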
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/baseline.yaml \
--image-mixing-frac 1 --profile-flops
"

# =============== zipf-10 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/baseline.yaml \
--image-mixing-frac 10 --profile-flops
"

# =============== zipf-50 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/baseline.yaml \
--image-mixing-frac 50 --profile-flops
"

rm $HOSTFILE
69 changes: 69 additions & 0 deletions examples/training/cogvideox/1x8_dcp_inter.sh
@@ -0,0 +1,69 @@
#!/bin/bash
#PBS -P CFP02-CF-004
#PBS -l select=1:ngpus=8
#PBS -l place=vscatter
#PBS -l walltime=24:00:00
#PBS -j oe
#PBS -o 1x8-cogvideox-dcp-inter.log

# =============== env params ================
# This script targets NSCC, which uses PBS Pro as its scheduler.

# scratch space; the venv sourced below lives under $HPCTMP
SCRATCH_PATH=$HPCTMP

cd $PBS_O_WORKDIR
echo "JOB ID: $PBS_JOBID, pwd: $PWD, pbs workdir: $PBS_O_WORKDIR"

# for torch.distributed
export NNODES=1
# export NODE_RANK=0
export GPUS_PER_NODE=8
export WORLD_SIZE=$(($NNODES*$GPUS_PER_NODE))
export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | awk -F'.' '{print $1}')
export MASTER_PORT=29502
echo "master node: $MASTER_ADDR"

# used by OpenMPI
export HOSTFILE="$PBS_JOBID.hostfile"
# strip the domain suffix; emit one "host slots=N" line per node
awk -F'.' '{print $1 " slots=" ENVIRON["GPUS_PER_NODE"]}' $PBS_NODEFILE > $HOSTFILE
echo "detected hosts: $(cat $HOSTFILE)"

# refer to: https://apptainer.org/user-docs/master/gpu.html
# for apptainer, replace SINGULARITYENV_* with APPTAINERENV_*
# export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$(printf "%s," $(seq 0 $(($GPUS_PER_NODE-1))) | sed 's/,$//')
# echo "singularity cuda visible devices: $SINGULARITYENV_CUDA_VISIBLE_DEVICES"

# =============== program params ================
export PYTHONPATH=$PYTHONPATH:$PWD
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TOKENIZERS_PARALLELISM=false

# =============== zipf-1 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_inter.yaml \
--image-mixing-frac 1 --profile-flops
"

# # =============== zipf-10 ================
# mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
# singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
# /bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
# python examples/training/cogvideox/train.py \
# examples/training/cogvideox/configs/benchmarks/dcp_inter.yaml \
# --image-mixing-frac 10 --profile-flops
# "

# # =============== zipf-50 ================
# mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
# singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
# /bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
# python examples/training/cogvideox/train.py \
# examples/training/cogvideox/configs/benchmarks/dcp_inter.yaml \
# --image-mixing-frac 50 --profile-flops
# "

rm $HOSTFILE
69 changes: 69 additions & 0 deletions examples/training/cogvideox/1x8_dcp_inter_ckpt.sh
@@ -0,0 +1,69 @@
#!/bin/bash
#PBS -P CFP02-CF-004
#PBS -l select=1:ngpus=8
#PBS -l place=vscatter
#PBS -l walltime=24:00:00
#PBS -j oe
#PBS -o 1x8-cogvideox-dcp-inter-ckpt.log

# =============== env params ================
# This script targets NSCC, which uses PBS Pro as its scheduler.

# scratch space; the venv sourced below lives under $HPCTMP
SCRATCH_PATH=$HPCTMP

cd $PBS_O_WORKDIR
echo "JOB ID: $PBS_JOBID, pwd: $PWD, pbs workdir: $PBS_O_WORKDIR"

# for torch.distributed
export NNODES=1
# export NODE_RANK=0
export GPUS_PER_NODE=8
export WORLD_SIZE=$(($NNODES*$GPUS_PER_NODE))
export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | awk -F'.' '{print $1}')
export MASTER_PORT=29502
echo "master node: $MASTER_ADDR"

# used by OpenMPI
export HOSTFILE="$PBS_JOBID.hostfile"
# strip the domain suffix; emit one "host slots=N" line per node
awk -F'.' '{print $1 " slots=" ENVIRON["GPUS_PER_NODE"]}' $PBS_NODEFILE > $HOSTFILE
echo "detected hosts: $(cat $HOSTFILE)"

# refer to: https://apptainer.org/user-docs/master/gpu.html
# for apptainer, replace SINGULARITYENV_* with APPTAINERENV_*
# export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$(printf "%s," $(seq 0 $(($GPUS_PER_NODE-1))) | sed 's/,$//')
# echo "singularity cuda visible devices: $SINGULARITYENV_CUDA_VISIBLE_DEVICES"

# =============== program params ================
export PYTHONPATH=$PYTHONPATH:$PWD
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TOKENIZERS_PARALLELISM=false

# =============== zipf-1 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_inter_ckpt.yaml \
--image-mixing-frac 1 --profile-flops
"

# =============== zipf-10 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_inter_ckpt.yaml \
--image-mixing-frac 10 --profile-flops
"

# =============== zipf-50 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_inter_ckpt.yaml \
--image-mixing-frac 50 --profile-flops
"

rm $HOSTFILE
69 changes: 69 additions & 0 deletions examples/training/cogvideox/1x8_dcp_intra.sh
@@ -0,0 +1,69 @@
#!/bin/bash
#PBS -P CFP02-CF-004
#PBS -l select=1:ngpus=8
#PBS -l place=vscatter
#PBS -l walltime=24:00:00
#PBS -j oe
#PBS -o 1x8-cogvideox-dcp-intra.log

# =============== env params ================
# This script targets NSCC, which uses PBS Pro as its scheduler.

# scratch space; the venv sourced below lives under $HPCTMP
SCRATCH_PATH=$HPCTMP

cd $PBS_O_WORKDIR
echo "JOB ID: $PBS_JOBID, pwd: $PWD, pbs workdir: $PBS_O_WORKDIR"

# for torch.distributed
export NNODES=1
# export NODE_RANK=0
export GPUS_PER_NODE=8
export WORLD_SIZE=$(($NNODES*$GPUS_PER_NODE))
export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | awk -F'.' '{print $1}')
export MASTER_PORT=29502
echo "master node: $MASTER_ADDR"

# used by OpenMPI
export HOSTFILE="$PBS_JOBID.hostfile"
# strip the domain suffix; emit one "host slots=N" line per node
awk -F'.' '{print $1 " slots=" ENVIRON["GPUS_PER_NODE"]}' $PBS_NODEFILE > $HOSTFILE
echo "detected hosts: $(cat $HOSTFILE)"

# refer to: https://apptainer.org/user-docs/master/gpu.html
# for apptainer, replace SINGULARITYENV_* with APPTAINERENV_*
# export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$(printf "%s," $(seq 0 $(($GPUS_PER_NODE-1))) | sed 's/,$//')
# echo "singularity cuda visible devices: $SINGULARITYENV_CUDA_VISIBLE_DEVICES"

# =============== program params ================
export PYTHONPATH=$PYTHONPATH:$PWD
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TOKENIZERS_PARALLELISM=false

# =============== zipf-1 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_intra.yaml \
--image-mixing-frac 1 --profile-flops
"

# =============== zipf-10 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_intra.yaml \
--image-mixing-frac 10 --profile-flops
"

# =============== zipf-50 ================
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/dcp_intra.yaml \
--image-mixing-frac 50 --profile-flops
"

rm $HOSTFILE
50 changes: 50 additions & 0 deletions examples/training/cogvideox/baseline_profile.sh
@@ -0,0 +1,50 @@
#!/bin/bash
#PBS -P CFP02-CF-004
#PBS -l select=1:ngpus=8
#PBS -l place=vscatter
#PBS -l walltime=12:00:00
#PBS -j oe
#PBS -o profile-baseline-cogvideox.log

# =============== env params ================
# This script targets NSCC, which uses PBS Pro as its scheduler.

# scratch space; the venv sourced below lives under $HPCTMP
SCRATCH_PATH=$HPCTMP

cd $PBS_O_WORKDIR
echo "JOB ID: $PBS_JOBID, pwd: $PWD, pbs workdir: $PBS_O_WORKDIR"

# for torch.distributed
export NNODES=1
# export NODE_RANK=0
export GPUS_PER_NODE=8
export WORLD_SIZE=$(($NNODES*$GPUS_PER_NODE))
export MASTER_ADDR=$(head -n 1 $PBS_NODEFILE | awk -F'.' '{print $1}')
export MASTER_PORT=9527
echo "master node: $MASTER_ADDR"

# used by OpenMPI
export HOSTFILE="$PBS_JOBID.hostfile"
# strip the domain suffix; emit one "host slots=N" line per node
awk -F'.' '{print $1 " slots=" ENVIRON["GPUS_PER_NODE"]}' $PBS_NODEFILE > $HOSTFILE
echo "detected hosts: $(cat $HOSTFILE)"

# refer to: https://apptainer.org/user-docs/master/gpu.html
# for apptainer, replace SINGULARITYENV_* with APPTAINERENV_*
# export SINGULARITYENV_CUDA_VISIBLE_DEVICES=$(printf "%s," $(seq 0 $(($GPUS_PER_NODE-1))) | sed 's/,$//')
# echo "singularity cuda visible devices: $SINGULARITYENV_CUDA_VISIBLE_DEVICES"

# =============== program params ================
export PYTHONPATH=$PYTHONPATH:$PWD
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
export TOKENIZERS_PARALLELISM=false

# baseline
mpirun --hostfile $HOSTFILE --np $WORLD_SIZE -N $GPUS_PER_NODE --oversubscribe \
singularity exec --nv /app1/common/singularity-img/hopper/cuda/cuda_12.1.1-cudnn8-devel-ubuntu22.04.sif \
/bin/bash -c "source /hpctmp/e1154485/venvs/videosys/bin/activate && \
python examples/training/cogvideox/train.py \
examples/training/cogvideox/configs/benchmarks/baseline.yaml
"

rm $HOSTFILE
68 changes: 68 additions & 0 deletions examples/training/cogvideox/configs/benchmarks/baseline.yaml
@@ -0,0 +1,68 @@
zipf_offset: 5
outputs: exp/cogvideox/baseline
profile_path: exp/cogvideox/profile/baseline
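# sp_size: sequence-parallel degree (an assumption: it matches the 8 GPUs per node used by the launch scripts)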
sp_size: 8
dummy_dataset: true
dummy_data_size: 2000
verbose: true
calculate_imbalance: true


# ==== training config ====

# preprocess embedding
data_path: "./assets/example_data/demo_preprocess.csv"
preprocessed_data: true
drop_last: true

# train
ckpt_path: "THUDM/CogVideoX-5b"
grad_checkpoint: true
num_workers: 8
dtype: "bf16"

# log
seed: 42
epochs: 1
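# an effectively infinite interval, i.e. periodic step logging is disabled for the benchmark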
log_every: 1e10

# optimization
grad_clip: 1.0
lr: 1e-8
ema_decay: 0.99
adam_eps: 1e-15
warmup_steps: 10

# data
# image_mixing_frac: 50
num_bucket_build_workers: 16
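# each bucket entry reads {num_frames: [keep_prob, batch_size]}
# (an assumption: this follows the Open-Sora bucket convention)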
bucket_config:
"144p": {1: [1.0, 345], 25: [1.0, 48], 49: [1.0, 25], 73: [1.0, 12], 97: [1.0, 6]}
"240p": {1: [1.0, 128], 25: [1.0, 16], 49: [1.0, 8], 73: [1.0, 4], 97: [1.0, 2]}
"360p": {1: [1.0, 64], 25: [1.0, 7], 49: [1.0, 4], 73: [1.0, 2], 97: [1.0, 1]}
"480p": {1: [1.0, 32], 25: [1.0, 4], 49: [1.0, 2], 73: [1.0, 1], 97: [1.0, 1]}
"720p": {1: [1.0, 14], 25: [1.0, 1], 49: [1.0, 1], 73: [1.0, 1], 97: [1.0, 1]}

# override the default common aspect ratios:
# for the benchmark we use a single aspect ratio per resolution,
# otherwise the data would be spread too sparsely across buckets
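# keys are aspect-ratio labels (height/width, "0.56"); values are [height, width]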
common_ar:
"144p": {"0.56": [144, 256]}
"240p": {"0.56": [240, 426]}
"360p": {"0.56": [360, 640]}
"480p": {"0.56": [480, 720]}
"720p": {"0.56": [720, 1280]}

# mask
mask_ratios: {
"random": 0.01,
"intepolate": 0.002,
"quarter_random": 0.002,
"quarter_head": 0.002,
"quarter_tail": 0.002,
"quarter_head_tail": 0.002,
"image_random": 0.0,
"image_head": 0.22,
"image_tail": 0.005,
"image_head_tail": 0.005,
}
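# the ratios above sum to 0.25; the remaining 0.75 is presumably left unmasked
# (the "identity" case); this is an assumption about the mask generator's default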