Adding CCL Async test cases to TG nightly and bug fix (#16700)
### Overview
- [x] Adding CCL async test cases to TG nightly
- [x] Bug fixes for all gather (core assignment, semaphore reset logic); the synchronization pattern involved is sketched below
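
For context, the test-harness part of this change adjusts how devices are synchronized after the async op when the persistent fabric is enabled. A minimal sketch of that pattern, assuming `mesh_device`, `sub_device_stall_group`, and `enable_persistent_fabric` are set up as in the harness; the helper name `sync_after_ccl` is illustrative, and only the `ttnn.synchronize_devices` calls and their arguments come from this change:

```python
import ttnn


def sync_after_ccl(mesh_device, sub_device_stall_group, enable_persistent_fabric):
    """Hypothetical helper mirroring the post-op synchronization in the updated harness."""
    # When the persistent fabric is up, issue an extra synchronize on the CCL
    # sub-device stall group so the async all-gather has fully drained...
    if enable_persistent_fabric:
        ttnn.synchronize_devices(mesh_device, sub_device_ids=sub_device_stall_group)
    # ...then synchronize again before outputs are checked or the fabric is torn down.
    ttnn.synchronize_devices(mesh_device, sub_device_ids=sub_device_stall_group)
```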

### Known Issues
These test cases expose several issues in the CCL ops; the failing cases are commented out for now:
- [ ]  #16699

### Checklist
- [x] All Post commit:
https://github.com/tenstorrent/tt-metal/actions/runs/12771284956
- [x] TG nightly:
https://github.com/tenstorrent/tt-metal/actions/runs/12771306345
- [x] TG post commit:
https://github.com/tenstorrent/tt-metal/actions/runs/12771296063
- [x] T3K post commit and nightly:
https://github.com/tenstorrent/tt-metal/actions/runs/12756885713
caixunshiren authored Jan 15, 2025
1 parent a0cf894 commit 095d101
Showing 8 changed files with 502 additions and 20 deletions.
1 change: 1 addition & 0 deletions tests/nightly/tg/ccl/test_all_gather_async_nightly.py
1 change: 1 addition & 0 deletions tests/nightly/tg/ccl/test_reduce_scatter_async_nightly.py
3 changes: 1 addition & 2 deletions tests/scripts/tg/run_tg_nightly_tests.sh
@@ -7,8 +7,7 @@ run_tg_llama3_70b_tests() {

echo "LOG_METAL: Running run_tg_llama3_70b_tests"

-pytest tests/nightly/tg/ccl/test_all_gather_nightly.py ; fail+=$?
-pytest tests/nightly/tg/ccl/test_reduce_scatter_nightly.py ; fail+=$?
+pytest -n auto tests/nightly/tg/ccl --timeout=180 ; fail+=$?

# Falcon40B prefill 60 layer end to end with 10 loops; we need 8x8 grid size
pytest tests/nightly/tg/models/demos/tg/llama3_70b ; fail+=$?
@@ -254,10 +254,9 @@ def run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
topology=ttnn.Topology.Linear,
)

-if enable_persistent_fabric:
-    logger.info(f"Waiting for op")
-    ttnn.synchronize_devices(mesh_device, sub_device_ids=sub_device_stall_group)
-    logger.info(f"Done iteration")
+if enable_persistent_fabric:
+    ttnn.synchronize_devices(mesh_device, sub_device_ids=sub_device_stall_group)
+ttnn.synchronize_devices(mesh_device, sub_device_ids=sub_device_stall_group)

if enable_persistent_fabric and teardown_persistent_fabric:
logger.info("Tearing down persistent fabric interface")
@@ -0,0 +1,330 @@
# SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC

# SPDX-License-Identifier: Apache-2.0

import torch
import pytest
from loguru import logger
import ttnn
from tests.tt_eager.python_api_testing.sweep_tests.comparison_funcs import comp_equal, comp_pcc
from models.utility_functions import skip_for_grayskull
from tests.ttnn.unit_tests.operations.ccl.test_ccl_common import (
    create_and_load_sub_device_manager_with_fabric_interface,
    teardown_fabric_interface,
    create_global_semaphore_with_same_address,
)

from tests.ttnn.unit_tests.operations.ccl.test_all_gather_TG_post_commit import (
    run_line_all_gather_on_TG_with_mesh_tensor_along_rows,
)

from tests.ttnn.unit_tests.operations.ccl.test_new_all_gather import (
    run_all_gather_impl,
)


# Enumerate the post-commit cases explicitly
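# Sharded all-gather-async along TG rows: 4 devices per row, 8 concurrent row instances, width-sharded Llama-style shapes.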
@skip_for_grayskull("Requires eth connected devices to run")
@pytest.mark.parametrize(
    "num_devices, num_links",
    [(4, 1)],
    # [(4, 3)], Multi-links fails https://github.com/tenstorrent/tt-metal/issues/16699
)
@pytest.mark.parametrize(
    "input_dtype",
    [
        ttnn.bfloat16,
        ttnn.bfloat8_b,
    ],
)
@pytest.mark.parametrize("shard_grid_orientation", [ttnn.ShardOrientation.ROW_MAJOR])
@pytest.mark.parametrize(
    "tensor_mem_layout,per_chip_output_shape, dim, input_shard_shape,shard_grid,layout",
    (
        # LLama
        (
            ttnn.TensorMemoryLayout.WIDTH_SHARDED,
            (1, 1, 32, 1024 * 4),
            3,
            (32, 32),
            ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
            ttnn.TILE_LAYOUT,
        ),
        (
            ttnn.TensorMemoryLayout.WIDTH_SHARDED,
            (4, 1, 32, 1280),
            0,
            (32, 32),
            ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 4))}),
            ttnn.TILE_LAYOUT,
        ),
    ),
)
@pytest.mark.parametrize("replication_factor", [8])
@pytest.mark.parametrize("enable_async", [True])
@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
def test_line_all_gather_sharded_on_TG_rows_post_commit(
    mesh_device,
    num_devices,
    per_chip_output_shape,
    input_shard_shape,
    shard_grid,
    shard_grid_orientation,
    tensor_mem_layout,
    dim,
    num_links,
    input_dtype,
    layout,
    use_program_cache,
    function_level_defaults,
    enable_async,
    replication_factor,
    num_iters=1,
):
    if len(mesh_device.get_devices()) != 32:
        pytest.skip("Not TG!")
    if input_dtype == ttnn.bfloat16 and per_chip_output_shape == (1, 1, 32, 1024 * 4):
        pytest.skip("Skipped due to hang Issue #16699")
    input_shard_spec = ttnn.ShardSpec(
        shard_grid,
        input_shard_shape,
        shard_grid_orientation,
    )
    run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
        mesh_device,
        num_devices,
        per_chip_output_shape,
        tensor_mem_layout,
        dim,
        num_links,
        input_dtype,
        layout,
        ttnn.BufferType.L1,
        use_program_cache,
        function_level_defaults,
        enable_async=enable_async,
        input_shard_spec=input_shard_spec,
        num_iters=num_iters,
        num_all_gather_instances=replication_factor,
        cluster_axis=1,
        use_all_gather_async=True,
        enable_persistent_fabric=True,
        create_persistent_fabric=True,
        teardown_persistent_fabric=True,
    )


# Enumerate the post-commit cases explicitly
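# Sharded all-gather-async along TG columns: 8 devices per column, 4 concurrent column instances, width- and height-sharded inputs swept over every gather dim.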
@skip_for_grayskull("Requires eth connected devices to run")
@pytest.mark.parametrize(
    "num_devices, num_links",
    [
        (8, 1),
    ],
    # [(8, 4), (8, 3), (8, 2)], Multi-links fails https://github.com/tenstorrent/tt-metal/issues/16699
)
@pytest.mark.parametrize(
    "input_dtype",
    [
        ttnn.bfloat16,
        ttnn.bfloat8_b,
    ],
)
@pytest.mark.parametrize("shard_grid_orientation", [ttnn.ShardOrientation.ROW_MAJOR])
@pytest.mark.parametrize(
    "tensor_mem_layout, input_shape, dim, input_shard_shape,shard_grid,layout",
    (
        (
            ttnn.TensorMemoryLayout.WIDTH_SHARDED,
            (8, 1, 32, 2048),
            0,
            (32, 64),
            ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
            ttnn.TILE_LAYOUT,
        ),
        (
            ttnn.TensorMemoryLayout.WIDTH_SHARDED,
            (1, 8, 32, 2048),
            1,
            (32, 64),
            ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
            ttnn.TILE_LAYOUT,
        ),
        (
            ttnn.TensorMemoryLayout.WIDTH_SHARDED,
            (1, 1, 256, 2048),
            2,
            (32, 64),
            ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
            ttnn.TILE_LAYOUT,
        ),
        (
            ttnn.TensorMemoryLayout.WIDTH_SHARDED,
            (1, 1, 32, 16384),
            3,
            (32, 64),
            ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
            ttnn.TILE_LAYOUT,
        ),
        (
            ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
            (8, 1, 2048, 32),
            0,
            (64, 32),
            ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
            ttnn.TILE_LAYOUT,
        ),
        (
            ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
            (1, 8, 2048, 32),
            1,
            (64, 32),
            ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
            ttnn.TILE_LAYOUT,
        ),
        (
            ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
            (1, 1, 16384, 32),
            2,
            (64, 32),
            ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
            ttnn.TILE_LAYOUT,
        ),
        (
            ttnn.TensorMemoryLayout.HEIGHT_SHARDED,
            (1, 1, 2048, 256),
            3,
            (64, 32),
            ttnn.CoreRangeSet({ttnn.CoreRange(ttnn.CoreCoord(0, 0), ttnn.CoreCoord(7, 3))}),
            ttnn.TILE_LAYOUT,
        ),
    ),
)
@pytest.mark.parametrize("replication_factor", [4])
@pytest.mark.parametrize("enable_async", [True])
@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
def test_line_all_gather_sharded_on_TG_cols_post_commit(
    mesh_device,
    num_devices,
    input_shape,
    input_shard_shape,
    shard_grid,
    shard_grid_orientation,
    tensor_mem_layout,
    dim,
    num_links,
    input_dtype,
    layout,
    use_program_cache,
    function_level_defaults,
    enable_async,
    replication_factor,
    num_iters=1,
):
    if len(mesh_device.get_devices()) != 32:
        pytest.skip("Not TG!")
    if input_dtype == ttnn.bfloat16 and input_shape == (1, 1, 256, 2048):
        pytest.skip("Skipped due to hang Issue #16699")
    input_shard_spec = ttnn.ShardSpec(
        shard_grid,
        input_shard_shape,
        shard_grid_orientation,
    )

    run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
        mesh_device,
        num_devices,
        input_shape,
        tensor_mem_layout,
        dim,
        num_links,
        input_dtype,
        layout,
        ttnn.BufferType.L1,
        use_program_cache,
        function_level_defaults,
        enable_async=enable_async,
        num_iters=num_iters,
        input_shard_spec=input_shard_spec,
        num_all_gather_instances=replication_factor,
        cluster_axis=0,
        use_all_gather_async=True,
        enable_persistent_fabric=True,
        create_persistent_fabric=True,
        teardown_persistent_fabric=True,
    )


# Enumerate the post-commit cases explicitly
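# Interleaved all-gather-async along TG columns, covering both DRAM and L1 buffer types.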
@skip_for_grayskull("Requires eth connected devices to run")
@pytest.mark.parametrize(
    "num_devices, num_links, per_chip_output_shape, dim, layout",
    [
        (8, 1, [1, 8, 32, 1280], 1, ttnn.TILE_LAYOUT),
        (8, 1, [8, 1, 32, 1280], 0, ttnn.TILE_LAYOUT),
        (8, 1, [1, 8, 32, 2048], 1, ttnn.TILE_LAYOUT),
        (8, 1, [1, 8, 32, 2304], 1, ttnn.TILE_LAYOUT),
        (8, 1, [1, 8, 32, 4096], 1, ttnn.TILE_LAYOUT),
        # multi-links fails: https://github.com/tenstorrent/tt-metal/issues/16699
        # (8, 4, [1, 8, 32, 1280], 1, ttnn.TILE_LAYOUT),
        # (8, 4, [8, 1, 32, 1280], 0, ttnn.TILE_LAYOUT),
        # (8, 4, [1, 8, 32, 2048], 1, ttnn.TILE_LAYOUT),
        # (8, 4, [1, 8, 32, 2304], 1, ttnn.TILE_LAYOUT),
        # (8, 4, [1, 8, 32, 4096], 1, ttnn.TILE_LAYOUT),
    ],
)
@pytest.mark.parametrize(
    "input_dtype",
    [
        ttnn.bfloat16,
        ttnn.bfloat8_b,
    ],
)
@pytest.mark.parametrize(
    "buffer_type",
    [
        ttnn.BufferType.DRAM,
        ttnn.BufferType.L1,
    ],
)
@pytest.mark.parametrize("replication_factor", [4])
@pytest.mark.parametrize("enable_async", [True])
@pytest.mark.parametrize("mesh_device", [pytest.param((8, 4), id="8x4_grid")], indirect=True)
def test_line_all_gather_on_TG_cols_nightly(
    mesh_device,
    num_devices,
    per_chip_output_shape,
    dim,
    num_links,
    input_dtype,
    layout,
    buffer_type,
    use_program_cache,
    function_level_defaults,
    enable_async,
    replication_factor,
    num_iters=1,
):
    if len(mesh_device.get_devices()) != 32:
        pytest.skip("Not TG!")
    run_line_all_gather_on_TG_with_mesh_tensor_along_rows(
        mesh_device,
        num_devices,
        per_chip_output_shape,
        ttnn.TensorMemoryLayout.INTERLEAVED,
        dim,
        num_links,
        input_dtype,
        layout,
        buffer_type,
        use_program_cache,
        function_level_defaults,
        enable_async=enable_async,
        num_iters=num_iters,
        num_all_gather_instances=replication_factor,
        cluster_axis=0,
        use_all_gather_async=True,
        enable_persistent_fabric=True,
        create_persistent_fabric=True,
        teardown_persistent_fabric=True,
    )
@@ -279,6 +279,8 @@ def run_line_reduce_scatter_on_TG_with_mesh_tensor_along_rows(
memory_config=output_mem_config,
topology=ttnn.Topology.Linear,
)
+if enable_persistent_fabric:
+    ttnn.synchronize_devices(mesh_device, sub_device_ids=sub_device_stall_group)
ttnn.synchronize_devices(mesh_device, sub_device_ids=sub_device_stall_group)

if enable_persistent_fabric and teardown_persistent_fabric: