Merge branch 'main' into brosko/umd_single_driver
broskoTT authored Nov 2, 2024
2 parents d41c544 + 7321dd7 commit 25af3a8
Showing 203 changed files with 5,439 additions and 899 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/cpp-ttnn-project.yaml
@@ -57,11 +57,11 @@ jobs:
           -w ${{ github.workspace }}
         run: |
           set -eu # basic shell hygiene
-          ./build_metal.sh --disable-unity-builds --build-type Release
+          ./build_metal.sh --build-type Release
       # TTNN project
       - name: Checkout cpp-ttnn-project-template
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
         with:
           repository: tenstorrent/cpp-ttnn-project-template
           path: project
13 changes: 11 additions & 2 deletions .github/workflows/ttnn-run-sweeps.yaml
@@ -11,6 +11,8 @@ on:
         options:
           - ALL SWEEPS (Nightly)
           - add
+          - tilize
+          - untilize
           - ccl.line_all_gather
           - ccl.all_gather_n300
           - ccl.all_gather_n300_focused
@@ -255,10 +257,13 @@ on:
           - eltwise.ternary.lerp.lerp
           - eltwise.ternary.where.where
           - eltwise.ternary.where.where_pytorch2
+          - eltwise.ternary_backward.addcmul_bw
+          - eltwise.ternary_backward.addcdiv_bw
+          - embedding.embedding
           - reduction.topk.topk
           - reduction.argmax.argmax
-          - embedding.embedding
-          - eltwise.ternary_backward.addcmul_bw
+          - reduction.prod
+          - reduction.sum
           - matmul.full.matmul_default_block_sharded
           - matmul.full.matmul_default_height_sharded
           - matmul.full.matmul_default_interleaved
@@ -270,6 +275,8 @@
           - matmul.short.matmul_user_program_config_mcast_2d
           - matmul.short.matmul_user_program_config
           - matmul.short.matmul
+          - losses.l1_loss
+          - losses.mse_loss
           - data_movement.concat.concat_interleaved_n_tensors
           - data_movement.concat.concat_interleaved
           - data_movement.concat.concat_sharded
@@ -288,6 +295,8 @@
           - data_movement.index_select.index_select_pytorch2
           - data_movement.split.split_with_sizes_pytorch2
           - data_movement.repeat.repeat
+          - data_movement.reshape.reshape
+          - data_movement.repeat_interleave.repeat_interleave
           - data_movement.nonzero.nonzero
           - conv2d.full.conv2d_misc
           - conv2d.full.conv2d_sharding
6 changes: 3 additions & 3 deletions CMakeLists.txt
@@ -55,6 +55,9 @@ if(ENABLE_LIBCXX)
 # $<$<LINK_LANG_AND_ID:CXX,Clang>:-lc++>
 # $<$<LINK_LANG_AND_ID:CXX,Clang>:-lc++abi>
 #)
+else()
+    # required when linking with libstdc++ with clang and gcc
+    add_compile_options(-fsized-deallocation)
 endif()
 
 # Using below until we can move to CMake >= 3.18 for LINK_LANG_AND_ID
@@ -63,8 +66,6 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND ENABLE_LIBCXX)
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -lc++ -lc++abi")
 endif()
 
-add_compile_options($<$<COMPILE_LANG_AND_ID:CXX,GNU>:-fsized-deallocation>)
-
 include(CTest)
 
 get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG)
@@ -263,7 +264,6 @@ target_compile_options(compiler_flags INTERFACE -DARCH_${ARCH_NAME_DEF})

 add_library(metal_header_directories INTERFACE)
 target_include_directories(metal_header_directories INTERFACE ${PROJECT_SOURCE_DIR}/tt_metal/hw/inc)
-target_include_directories(metal_header_directories SYSTEM INTERFACE ${reflect_SOURCE_DIR})
 foreach(lib ${BoostPackages})
     target_include_directories(metal_header_directories INTERFACE ${Boost${lib}_SOURCE_DIR}/include)
 endforeach()
1 change: 1 addition & 0 deletions CODEOWNERS
@@ -74,6 +74,7 @@ tt_metal/hw/firmware/src/*erisc* @aliuTT @ubcheema
 tt_metal/hw/inc/ethernet/ @aliuTT @ubcheema
 tt_metal/hw/inc/wormhole/eth_l1_address_map.h @aliuTT @ubcheema
 tt_metal/third_party/tt_llk_* @rtawfik01 @ttmtrajkovic @rdjogoTT
+tt_metal/tt_stl/ @patrickroberts @yan-zaretskiy @eyonland @ayerofieiev-tt @dmakoviichuk-tt @sminakov-tt
 
 sfpi/ @pgkeller
 
3 changes: 2 additions & 1 deletion INSTALLING.md
@@ -37,7 +37,8 @@ chmod u+x llvm.sh
 sudo ./llvm.sh 17
 sudo apt install libc++-17-dev libc++abi-17-dev
 ```
-
+- Note: `CMake 3.16` is the targeted required version of `CMake` as it aligns with the default from `Ubuntu 20.04`. Some advanced build configurations like unity builds require `CMake 3.20`.
+- To install `CMake 3.20` see: https://github.com/tenstorrent/tt-metal/blob/4d7730d3e2d22c51d62baa1bfed861b557d9a3c0/dockerfile/ubuntu-20.04-amd64.Dockerfile#L9-L14
 ---
 
 ### Step 3. Hugepages
5 changes: 5 additions & 0 deletions cmake/dependencies.cmake
@@ -58,6 +58,11 @@ endif()
 ############################################################################################################################
 
 CPMAddPackage(NAME reflect GITHUB_REPOSITORY boost-ext/reflect GIT_TAG v1.1.1)
+if(reflect_ADDED)
+    add_library(reflect INTERFACE)
+    add_library(Reflect::Reflect ALIAS reflect)
+    target_include_directories(reflect SYSTEM INTERFACE ${reflect_SOURCE_DIR})
+endif()
 
 ############################################################################################################################
 # magic_enum : https://github.com/Neargye/magic_enum
3 changes: 2 additions & 1 deletion tests/scripts/run_cpp_unit_tests.sh
@@ -19,7 +19,8 @@ if [[ ! -z "$TT_METAL_SLOW_DISPATCH_MODE" ]]; then
 else
     ./build/test/tt_metal/unit_tests_fast_dispatch
     TT_METAL_GTEST_NUM_HW_CQS=2 ./build/test/tt_metal/unit_tests_fast_dispatch_single_chip_multi_queue --gtest_filter=MultiCommandQueueSingleDeviceFixture.*
-    if [[ "$ARCH_NAME" == "wormhole_b0" || "$ARCH_NAME" == "blackhole" ]]; then
+    # Enable this on BH after #14613
+    if [[ "$ARCH_NAME" == "wormhole_b0" ]]; then
         TT_METAL_GTEST_ETH_DISPATCH=1 ./build/test/tt_metal/unit_tests_fast_dispatch
     fi
     env python tests/scripts/run_tt_eager.py --dispatch-mode fast
2 changes: 1 addition & 1 deletion tests/scripts/tt_bisect.sh
@@ -56,7 +56,7 @@ while [[ "$found" = "false" ]]; do
     build_code=0
     echo "at commit `git rev-parse HEAD`"
     echo "building Metal"
-    . build_metal.sh; build_code+=$?
+    ./build_metal.sh --build-tests; build_code+=$?
 
     if [[ $build_code -ne 0 ]]; then
         echo "Build failed"
@@ -0,0 +1,150 @@
# SPDX-FileCopyrightText: © 2024 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from typing import Optional, Tuple
from functools import partial

import torch
import random
import ttnn

from tests.sweep_framework.sweep_utils.utils import gen_shapes
from tests.tt_eager.python_api_testing.sweep_tests.generation_funcs import gen_func_with_cast_tt
from tests.ttnn.utils_for_testing import check_with_pcc, start_measuring_time, stop_measuring_time
from models.utility_functions import torch_random

# Override the default timeout in seconds for hang detection.
TIMEOUT = 360
random.seed(0)


# Does not have memory_config parameter
parameters = {
    "nightly": {
        "input_shape": gen_shapes([1, 1, 1, 1], [6, 6, 256, 256], [1, 1, 1, 1], 8)
        + gen_shapes([1, 1, 1], [6, 256, 256], [1, 1, 1], 8)
        + gen_shapes([1, 1], [256, 256], [1, 1], 8),
        "repeats": [1, 2, 4, 8],
        "dim": [0, 1, 2, 3],
        "input_a_dtype": [ttnn.bfloat16, ttnn.bfloat8_b],
        "input_a_layout": [ttnn.TILE_LAYOUT, ttnn.ROW_MAJOR_LAYOUT],
        "input_a_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
        "output_memory_config": [ttnn.DRAM_MEMORY_CONFIG, ttnn.L1_MEMORY_CONFIG],
    },
}
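
# The sweep framework expands each suite above into the cross product of its
# parameter lists (the commented-out local-run example at the bottom of this
# file does the same via framework.permutations), so each combination of
# shape, repeats, dim, dtype, layout and memory config becomes one test vector.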


def align_to_32(x):
    if x % 32 == 0:
        return x

    return ((x // 32) + 1) * 32
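
# For illustration, the rounding above behaves as follows:
#   align_to_32(1)  == 32
#   align_to_32(32) == 32
#   align_to_32(33) == 64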


def max_volume(reshape_shape):
    vol = align_to_32(reshape_shape[-1]) * align_to_32(reshape_shape[-2])

    if len(reshape_shape) >= 3:
        vol *= reshape_shape[-3]

    if len(reshape_shape) == 4:
        vol *= reshape_shape[-4]

    return vol
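
# Worked example for a hypothetical shape [2, 3, 100, 50]: the last two dims
# are padded to tile boundaries (100 -> 128, 50 -> 64), so the volume is
# 2 * 3 * 128 * 64 = 49152 elements; with repeats == 8 that totals 393216,
# which fits the 1024 * 1024 element L1 budget checked in invalidate_vector.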


# Invalidate vector is called during the generation phase where each vector will be passed in.
# If invalidated, the vector will still be stored but will be skipped.
# Returns False, None if the vector is valid, and True, str with a reason for invalidation if it is invalid.
def invalidate_vector(test_vector) -> Tuple[bool, Optional[str]]:
    input_shape = test_vector["input_shape"]

    if test_vector["dim"] >= len(input_shape):
        return True, "dim must be < len(input_shape)"

    if (
        test_vector["input_a_memory_config"] == ttnn.L1_MEMORY_CONFIG
        or test_vector["output_memory_config"] == ttnn.L1_MEMORY_CONFIG
    ):
        if max_volume(input_shape) * test_vector["repeats"] > 1024 * 1024:
            return True, "Too large output tensor size for L1 memory config"

    if test_vector["input_a_layout"] == ttnn.ROW_MAJOR_LAYOUT and test_vector["input_a_dtype"] == ttnn.bfloat8_b:
        return True, "bfloat8_b/bfloat4_b requires TILE_LAYOUT!"

    return False, None
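
# For example, a vector with input_shape [6, 256, 256], repeats == 8 and an
# L1 output memory config is skipped: 6 * 256 * 256 * 8 == 3145728 elements,
# which exceeds the 1024 * 1024 budget above.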


def run(
    input_shape,
    repeats,
    dim,
    input_a_dtype,
    input_a_layout,
    input_a_memory_config,
    output_memory_config,
    *,
    device,
) -> list:
    data_seed = random.randint(0, 20000000)
    torch.manual_seed(data_seed)

    # Fix shape for row major
    if input_a_layout == ttnn.ROW_MAJOR_LAYOUT and input_shape[-1] % 2 == 1:
        input_shape[-1] += 1

    torch_input_tensor_a = gen_func_with_cast_tt(
        partial(torch_random, low=-100, high=100, dtype=torch.float32), input_a_dtype
    )(input_shape)

    # print(f"input_shape {input_shape} repeats {repeats} dim {dim} input_a_dtype {input_a_dtype} input_a_layout {input_a_layout}")

    golden_function = ttnn.get_golden_function(ttnn.repeat_interleave)
    torch_output_tensor = golden_function(torch_input_tensor_a, repeats=repeats, dim=dim)

    input_tensor_a = ttnn.from_torch(
        torch_input_tensor_a,
        dtype=input_a_dtype,
        layout=input_a_layout,
        device=device,
        memory_config=input_a_memory_config,
    )

    start_time = start_measuring_time()
    result = ttnn.repeat_interleave(input_tensor_a, repeats=repeats, dim=dim, memory_config=output_memory_config)
    output_tensor = ttnn.to_torch(result)
    e2e_perf = stop_measuring_time(start_time)

    pcc = check_with_pcc(torch_output_tensor, output_tensor, 0.999)
    # print(pcc)
    return [pcc, e2e_perf]
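
# As used here, check_with_pcc returns a (passed, message) pair, which is why
# the local-run example below inspects passed[0].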


# # Run sweeps locally
# from tests.sweep_framework.framework.permutations import *

# start_time = start_measuring_time()
# for suite in parameters.keys():
#     device_id = 0
#     device = ttnn.open_device(device_id=device_id)
#     suite_vectors = list(permutations(parameters[suite]))
#     print(len(suite_vectors))
#     for vector in suite_vectors:
#         invalidate_res = invalidate_vector(vector)
#         if invalidate_res[0]:
#             print(f"Invalidated: {invalidate_res[1]}")
#             continue
#         try:
#             passed, _ = run(**vector, device=device)
#             if passed[0] != True:
#                 print(passed)
#         except Exception as e:
#             print(e)

#     ttnn.close_device(device)

# e2e_perf = stop_measuring_time(start_time)
# print(f"time {e2e_perf / 1000000000}s")