
Commit f1ff3cc

Revert "Cross self attention switch (#251)" (#288)
This reverts commit 65c4e40.
1 parent 503e9d6 commit f1ff3cc

26 files changed: +347, -542 lines

.github/workflows/UnitTests.yml

Lines changed: 1 addition & 1 deletion
@@ -58,7 +58,7 @@ jobs:
 pip show jax jaxlib flax transformers datasets tensorflow tensorflow_datasets
 - name: PyTest
 run: | #--deselect=src/maxdiffusion/tests/input_pipeline_interface_test.py
-HF_HUB_CACHE=/mnt/disks/github-runner-disk/ HF_HOME=/mnt/disks/github-runner-disk/ TOKENIZERS_PARALLELISM=false LIBTPU_INIT_ARGS="--xla_tpu_scoped_vmem_limit_kib=65472" python3 -m pytest --deselect=src/maxdiffusion/tests/ltx_transformer_step_test.py -x
+HF_HUB_CACHE=/mnt/disks/github-runner-disk/ HF_HOME=/mnt/disks/github-runner-disk/ TOKENIZERS_PARALLELISM=false python3 -m pytest --deselect=src/maxdiffusion/tests/ltx_transformer_step_test.py -x
 # add_pull_ready:
 #   if: github.ref != 'refs/heads/main'
 #   permissions:

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -4,6 +4,7 @@
 __pycache__/
 *.py[cod]
 *$py.class
+
 # C extensions
 *.so

@@ -97,7 +98,6 @@ celerybeat-schedule

 # Environments
 .env
-.history
 .venv
 env/
 venv/

preview-xpk.sh

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+#!/bin/bash
+bash docker_build_dependency_image.sh
+docker tag maxdiffusion_base_image:latest gcr.io/cloud-tpu-multipod-dev/sanbao/maxdiffusion_base_image:latest
+docker push gcr.io/cloud-tpu-multipod-dev/sanbao/maxdiffusion_base_image:latest
+CLUSTER_NAME=bodaborg-tpu7x-128
+DEVICE_TYPE=tpu7x-128 # can change to any size <= tpu7x-256
+PROJECT=cloud-tpu-multipod-dev
+ZONE=us-central1
+
+# Please change the RUN_NAME and OUTPUT_DIR to your own GCS bucket path.
+export RUN_NAME=sanbao-wan-v7x-20k-${RANDOM}
+OUTPUT_DIR=gs://sanbao-bucket/wan/${RUN_NAME}
+# OUTPUT_DIR=gs://sanbao-bucket/wan/sanbao-wan-train-test
+DATASET_DIR=gs://sanbao-bucket/wan_tfr_dataset_pusa_v1/train/
+EVAL_DATA_DIR=gs://sanbao-bucket/wan_tfr_dataset_pusa_v1/eval_timesteps/
+SAVE_DATASET_DIR=gs://sanbao-bucket/wan_tfr_dataset_pusa_v1/save/
+RANDOM=123456789
+IMAGE_DIR=gcr.io/cloud-tpu-multipod-dev/sanbao/maxdiffusion_base_image:latest
+# IMAGE_DIR=gcr.io/tpu-prod-env-multipod/maxdiffusion_jax_stable_stack_nightly@sha256:fd27d49a3be7f743f08e3b6b03e5ae00196794944310e3fee2a7795b99d81195
+LIBTPU_VERSION=libtpu-0.0.25.dev20251013+tpu7x-cp312-cp312-manylinux_2_31_x86_64.whl
+
+xpk workload create \
+--cluster=$CLUSTER_NAME \
+--project=$PROJECT \
+--zone=$ZONE \
+--device-type=$DEVICE_TYPE \
+--num-slices=1 \
+--command=" \
+pip install . && \
+gsutil cp gs://libtpu-tpu7x-releases/wheels/libtpu/${LIBTPU_VERSION} . && \
+python -m pip install ${LIBTPU_VERSION} && \
+export LIBTPU_INIT_ARGS='--xla_enable_async_all_gather=true \
+--xla_tpu_enable_async_collective_fusion=true \
+--xla_tpu_enable_async_collective_fusion_fuse_all_gather=true \
+--xla_enable_async_all_reduce=true \
+--xla_tpu_enable_sparse_core_collective_offload_all_reduce=true \
+--xla_max_concurrent_async_all_gathers=4 \
+--xla_tpu_enable_async_all_to_all=true \
+--xla_latency_hiding_scheduler_rerun=5 \
+--xla_tpu_rwb_fusion=false \
+--xla_tpu_enable_sublane_major_scaling_bitcast_fusion=false \
+--xla_tpu_impure_enable_packed_bf16_math_ops=false \
+--xla_tpu_enable_sparse_core_reduce_scatter_v2=true \
+--xla_tpu_enable_sparse_core_collective_offload_all_gather=true \
+--xla_tpu_enable_sparse_core_collective_offload_2d_all_gather=true \
+--xla_tpu_enable_all_gather_offload_tracing=true \
+--xla_tpu_use_tc_device_shape_on_sc=true \
+--xla_tpu_prefer_async_allgather_to_allreduce=true \
+--xla_tpu_enable_sparse_core_collective_offload_reduce_scatter=true \
+--xla_tpu_scoped_vmem_limit_kib=65536 \
+--xla_tpu_enable_tpu_custom_call_scoped_vmem_adjustments=true \
+--xla_enable_transpose_trace=false' && \
+echo 'Starting WAN training ...' && \
+HF_HUB_CACHE=/dev/shm python src/maxdiffusion/train_wan.py \
+src/maxdiffusion/configs/base_wan_14b.yml \
+attention='flash' \
+weights_dtype=bfloat16 \
+activations_dtype=bfloat16 \
+guidance_scale=5.0 \
+flow_shift=5.0 \
+fps=16 \
+skip_jax_distributed_system=False \
+run_name='test-wan-training-new' \
+output_dir=${OUTPUT_DIR} \
+train_data_dir=${DATASET_DIR} \
+load_tfrecord_cached=True \
+height=1280 \
+width=720 \
+num_frames=81 \
+num_inference_steps=50 \
+prompt='a japanese pop star young woman with black hair is singing with a smile. She is inside a studio with dim lighting and musical instruments.' \
+jax_cache_dir=${OUTPUT_DIR}/jax_cache/ \
+enable_profiler=True \
+dataset_save_location=${SAVE_DATASET_DIR} \
+remat_policy='HIDDEN_STATE_WITH_OFFLOAD' \
+flash_min_seq_length=0 \
+seed=$RANDOM \
+skip_first_n_steps_for_profiler=3 \
+profiler_steps=3 \
+per_device_batch_size=0.5 \
+ici_data_parallelism=64 \
+ici_fsdp_parallelism=2 \
+ici_tensor_parallelism=1 \
+allow_split_physical_axes=True \
+max_train_steps=150 \
+scan_layers=true \
+flash_block_sizes='{\"block_q\":2048,\"block_kv_compute\":512,\"block_kv\":2048,\"block_q_dkv\":2048,\"block_kv_dkv\":2048,\"block_kv_dkv_compute\":512,\"use_fused_bwd_kernel\":true}' \
+" \
+--base-docker-image=${IMAGE_DIR} \
+--enable-debug-logs \
+--workload=${RUN_NAME} \
+--priority=medium \
+--max-restarts=0
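Note: the flash_block_sizes argument at the end of the command is a JSON string whose keys correspond to the splash-attention BlockSizes fields (common_types.py aliases splash_attention_kernel.BlockSizes). The following is a minimal sketch of how such a string could be parsed into that object, assuming the keys map one-to-one onto BlockSizes constructor arguments; maxdiffusion's actual config handling may differ.

import json

from jax.experimental.pallas.ops.tpu.splash_attention import splash_attention_kernel

# Hypothetical parsing of the flash_block_sizes JSON passed in the command above;
# the keys are assumed to match the BlockSizes constructor arguments one-to-one.
raw = (
    '{"block_q":2048,"block_kv_compute":512,"block_kv":2048,'
    '"block_q_dkv":2048,"block_kv_dkv":2048,"block_kv_dkv_compute":512,'
    '"use_fused_bwd_kernel":true}'
)
block_sizes = splash_attention_kernel.BlockSizes(**json.loads(raw))
print(block_sizes)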

requirements.txt

Lines changed: 0 additions & 1 deletion
@@ -13,7 +13,6 @@ ftfy
 tensorboard>=2.17.0
 tensorboardx>=2.6.2.2
 tensorboard-plugin-profile>=2.15.2
-tokamax
 Jinja2
 scikit-image
 parameterized

src/maxdiffusion/common_types.py

Lines changed: 1 addition & 33 deletions
@@ -33,11 +33,7 @@
 BlockSizes = splash_attention_kernel.BlockSizes

 AxisNames = tuple[str, ...]
-# Physical axis names for device meshes.
-DATA = "data"
-FSDP = "fsdp"
-TENSOR = "tensor"
-# Logical axis names for model parameters and activations.
+
 BATCH = "activation_batch"
 LENGTH = "activation_length"
 KV_LENGTH = "activation_kv_length"
@@ -48,32 +44,4 @@
 KEEP_2 = "activation_keep_2"
 CONV_OUT = "activation_conv_out_channels"

-# For setting self/cross attention independently in splash kernel
-SELF_ATTN_HEAD = "activation_self_attn_heads"
-SELF_ATTN_Q_LENGTH = "activation_self_attn_q_length"
-SELF_ATTN_KV_LENGTH = "activation_self_attn_kv_length"
-CROSS_ATTN_HEAD = "activation_cross_attn_heads"
-CROSS_ATTN_Q_LENGTH = "activation_cross_attn_q_length"
-CROSS_ATTN_KV_LENGTH = "activation_cross_attn_kv_length"
-
-
 WAN_MODEL = "Wan2.1"
-
-### Common axis rules for ring attention ###
-RING_ATTENTION_AXIS_RULES = [
-    [SELF_ATTN_HEAD, None],
-    [SELF_ATTN_Q_LENGTH, FSDP],
-    [SELF_ATTN_KV_LENGTH, FSDP],
-    [CROSS_ATTN_HEAD, None],
-    [CROSS_ATTN_Q_LENGTH, FSDP],
-    [CROSS_ATTN_KV_LENGTH, FSDP],
-]
-
-SEQUENCE_PARALLEL_AXIS_RULES = [
-    [SELF_ATTN_HEAD, None],
-    [SELF_ATTN_Q_LENGTH, FSDP],
-    [SELF_ATTN_KV_LENGTH, None],
-    [CROSS_ATTN_HEAD, None],
-    [CROSS_ATTN_Q_LENGTH, FSDP],
-    [CROSS_ATTN_KV_LENGTH, None],
-]
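For context, the RING_ATTENTION_AXIS_RULES and SEQUENCE_PARALLEL_AXIS_RULES removed above pair logical activation axis names with physical mesh axes (DATA/FSDP/TENSOR). Below is a minimal, hypothetical sketch of how such rules are typically consumed with Flax's logical partitioning helpers, assuming a ("data", "fsdp", "tensor") mesh and illustrative array shapes; the actual wiring inside maxdiffusion may differ.

import jax
import jax.numpy as jnp
import numpy as np
from flax import linen as nn
from jax.sharding import Mesh

# Hypothetical mesh mirroring the removed DATA/FSDP/TENSOR axis names.
devices = np.array(jax.devices()).reshape(-1, 1, 1)
mesh = Mesh(devices, axis_names=("data", "fsdp", "tensor"))

# Subset of the removed RING_ATTENTION_AXIS_RULES, written as (logical, mesh) pairs.
ring_attention_axis_rules = [
    ("activation_self_attn_heads", None),
    ("activation_self_attn_q_length", "fsdp"),
    ("activation_self_attn_kv_length", "fsdp"),
]

with mesh, nn.logical_axis_rules(ring_attention_axis_rules):
    q = jnp.zeros((8, 1024, 128))  # (heads, q_length, head_dim); sizes illustrative
    # The logical names resolve to mesh axes via the rules, sharding q_length over fsdp.
    q = nn.with_logical_constraint(
        q, ("activation_self_attn_heads", "activation_self_attn_q_length", None)
    )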

src/maxdiffusion/configs/base14.yml

Lines changed: 0 additions & 9 deletions
@@ -50,15 +50,6 @@ jit_initializers: True
 from_pt: False
 split_head_dim: True
 attention: 'dot_product' # Supported attention: dot_product, flash
-# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
-# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
-# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
-mask_padding_tokens: True
-# Maxdiffusion has 2 types of attention sharding strategies:
-# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
-# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
-# in cross attention q.
-attention_sharding_uniform: True
 flash_block_sizes: {}
 # GroupNorm groups
 norm_num_groups: 32
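The mask_padding_tokens option removed here (and in the configs below) controlled whether segment ids were passed to splash attention so that padding tokens are never attended to. The following is a hypothetical, plain dot-product sketch of what segment-id masking does conceptually; it is not the splash attention kernel API, and the sizes are illustrative only.

import jax
import jax.numpy as jnp

# Conceptual sketch only: real tokens get segment id 1, padding gets 0, and any
# query/key pair in different segments (or involving padding) is masked out.
seq_len, pad_len, dim = 6, 2, 4
q = jnp.ones((seq_len, dim))
k = jnp.ones((seq_len, dim))
v = jnp.ones((seq_len, dim))
segment_ids = jnp.array([1] * (seq_len - pad_len) + [0] * pad_len)

logits = q @ k.T / jnp.sqrt(dim)
# Keep only pairs in the same non-padding segment.
mask = (segment_ids[:, None] == segment_ids[None, :]) & (segment_ids[None, :] > 0)
logits = jnp.where(mask, logits, -1e9)
out = jax.nn.softmax(logits, axis=-1) @ v  # rows for padded queries are discarded downstream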

src/maxdiffusion/configs/base21.yml

Lines changed: 0 additions & 10 deletions
@@ -49,16 +49,6 @@ jit_initializers: True
 from_pt: False
 split_head_dim: True
 attention: 'dot_product' # Supported attention: dot_product, flash
-# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
-# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
-# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
-mask_padding_tokens: True
-# Maxdiffusion has 2 types of attention sharding strategies:
-# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
-# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
-# in cross attention q.
-attention_sharding_uniform: True
-
 flash_block_sizes: {}
 # GroupNorm groups
 norm_num_groups: 32

src/maxdiffusion/configs/base_2_base.yml

Lines changed: 0 additions & 10 deletions
@@ -50,16 +50,6 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash
-# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
-# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
-# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
-mask_padding_tokens: True
-# Maxdiffusion has 2 types of attention sharding strategies:
-# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
-# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
-# in cross attention q.
-attention_sharding_uniform: True
-
 flash_block_sizes: {}
 # to override default block sizes for flash attention
 # flash_block_sizes:

src/maxdiffusion/configs/base_flux_dev.yml

Lines changed: 0 additions & 9 deletions
@@ -63,15 +63,6 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te
-# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
-# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
-# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
-mask_padding_tokens: True
-# Maxdiffusion has 2 types of attention sharding strategies:
-# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
-# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
-# in cross attention q.
-attention_sharding_uniform: True

 flash_block_sizes: {}
 # Use the following flash_block_sizes on v6e (Trillium) due to larger vmem.

src/maxdiffusion/configs/base_flux_dev_multi_res.yml

Lines changed: 0 additions & 9 deletions
@@ -63,15 +63,6 @@ jit_initializers: True
 from_pt: True
 split_head_dim: True
 attention: 'flash' # Supported attention: dot_product, flash, cudnn_flash_te
-# If mask_padding_tokens is True, we pass in segment ids to splash attention to avoid attending to padding tokens.
-# Else we do not pass in segment ids and on vpu bound hardware like trillium this is faster.
-# However, when padding tokens are significant, this will lead to worse quality and should be set to True.
-mask_padding_tokens: True
-# Maxdiffusion has 2 types of attention sharding strategies:
-# 1. attention_sharding_uniform = True : same sequence sharding rules applied for q in both (self and cross attention)
-# 2. attention_sharding_uniform = False : Heads are sharded uniformly across devices for self attention while sequence is sharded
-# in cross attention q.
-attention_sharding_uniform: True

 #flash_block_sizes: {}
 # Use the following flash_block_sizes on v6e (Trillium) due to larger vmem.
