Disable data mismatch assertion for torchvision tests, add e2e tests with good PCC to push CI

Split assert_on_output_mismatch into assert_pcc and assert_atol
tenstorrent · Jan 31, 2025 · 7ccbc87 · 7ccbc87
1 parent 66cd734
commit 7ccbc87
Show file tree

Hide file tree

Showing 25 changed files with 121 additions and 91 deletions.
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
@@ -117,6 +117,33 @@ jobs:
                   tests/models/resnet/test_resnet.py::test_resnet[full-eval] \
                   tests/models/resnet50/test_resnet50.py::test_resnet[full-eval] \
                   tests/models/yolov3/test_yolov3.py::test_yolov3[full-eval] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-mobilenet_v2] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-mobilenet_v3_small] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-mobilenet_v3_large] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-resnet18] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-resnet34] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-resnet50] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-resnet101] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-resnet152] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-resnext50_32x4d] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-resnext101_32x8d] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-resnext101_64x4d] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-wide_resnet50_2] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-wide_resnet101_2] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_y_400mf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_y_800mf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_y_1_6gf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_y_3_2gf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_y_8gf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_y_16gf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_y_32gf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_x_400mf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_x_800mf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_x_1_6gf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_x_3_2gf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_x_8gf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_x_16gf] \
+                  tests/models/torchvision/test_torchvision_image_classification.py::test_torchvision_image_classification[full-regnet_x_32gf] \
            --junit-xml=${{ steps.strings.outputs.test_report_path_models }}
     - name: Upload Test Report Models
       uses: actions/upload-artifact@v4

diff --git a/tests/models/MobileNetV2/test_MobileNetV2.py b/tests/models/MobileNetV2/test_MobileNetV2.py
@@ -47,7 +47,7 @@ def test_MobileNetV2(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
     if mode == "eval":

diff --git a/tests/models/Qwen/test_qwen2_token_classification.py b/tests/models/Qwen/test_qwen2_token_classification.py
@@ -47,7 +47,7 @@ def test_qwen2_token_classification(record_property, model_name, mode, op_by_op)
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     with torch.no_grad():
         results = tester.test_model()

diff --git a/tests/models/albert/test_albert_masked_lm.py b/tests/models/albert/test_albert_masked_lm.py
@@ -60,7 +60,7 @@ def test_albert_masked_lm(record_property, model_name, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
 

diff --git a/tests/models/albert/test_albert_sequence_classification.py b/tests/models/albert/test_albert_sequence_classification.py
@@ -43,7 +43,7 @@ def test_albert_sequence_classification(record_property, model_name, mode, op_by
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
 

diff --git a/tests/models/albert/test_albert_token_classification.py b/tests/models/albert/test_albert_token_classification.py
@@ -45,7 +45,7 @@ def test_albert_token_classification(record_property, model_name, mode, op_by_op
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
 

diff --git a/tests/models/bloom/test_bloom.py b/tests/models/bloom/test_bloom.py
@@ -54,7 +54,8 @@ def test_bloom(record_property, mode, op_by_op):
         model_name,
         mode,
         relative_atol=0.01,
-        assert_on_output_mismatch=False,
+        assert_pcc=False,
+        assert_atol=False,
         compiler_config=cc,
     )
     results = tester.test_model()

diff --git a/tests/models/clip/test_clip.py b/tests/models/clip/test_clip.py
@@ -74,7 +74,7 @@ def test_clip(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
 

diff --git a/tests/models/detr/test_detr.py b/tests/models/detr/test_detr.py
@@ -57,7 +57,7 @@ def test_detr(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
 

diff --git a/tests/models/distilbert/test_distilbert.py b/tests/models/distilbert/test_distilbert.py
@@ -39,7 +39,7 @@ def test_distilbert(record_property, model_name, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
 

diff --git a/tests/models/dpr/test_dpr.py b/tests/models/dpr/test_dpr.py
@@ -47,7 +47,7 @@ def test_dpr(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
 

diff --git a/tests/models/llama/test_llama.py b/tests/models/llama/test_llama.py
@@ -52,7 +52,7 @@ def test_llama(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
     if mode == "eval":

diff --git a/tests/models/mlpmixer/test_mlpmixer.py b/tests/models/mlpmixer/test_mlpmixer.py
@@ -49,7 +49,7 @@ def test_mlpmixer(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
     record_property("torch_ttnn", (tester, results))
diff --git a/tests/models/mnist/test_mnist.py b/tests/models/mnist/test_mnist.py
@@ -75,7 +75,7 @@ def test_mnist_train(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
 

diff --git a/tests/models/openpose/test_openpose_v2.py b/tests/models/openpose/test_openpose_v2.py
@@ -65,7 +65,7 @@ def test_openpose_v2(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
     if mode == "eval":

diff --git a/tests/models/perceiver_io/test_perceiver_io.py b/tests/models/perceiver_io/test_perceiver_io.py
@@ -52,7 +52,7 @@ def test_perceiver_io(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
     if mode == "eval":

diff --git a/tests/models/resnet/test_resnet.py b/tests/models/resnet/test_resnet.py
@@ -39,7 +39,7 @@ def test_resnet(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
 

diff --git a/tests/models/resnet50/test_resnet50.py b/tests/models/resnet50/test_resnet50.py
@@ -56,7 +56,8 @@ def test_resnet(record_property, mode, op_by_op):
         mode,
         required_atol=0.03,
         compiler_config=cc,
-        assert_on_output_mismatch=False,
+        assert_pcc=False,
+        assert_atol=False,
     )
     results = tester.test_model()
     if mode == "eval":

diff --git a/tests/models/stable_diffusion/test_stable_diffusion_v2.py b/tests/models/stable_diffusion/test_stable_diffusion_v2.py
@@ -77,7 +77,7 @@ def test_stable_diffusion_v2(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
     if mode == "eval":

diff --git a/tests/models/torchvision/test_torchvision_image_classification.py b/tests/models/torchvision/test_torchvision_image_classification.py
@@ -50,30 +50,14 @@ def _load_inputs(self):
     [("resnext50_32x4d", "ResNeXt50_32X4D_Weights"), "eval"],
     [("resnext101_32x8d", "ResNeXt101_32X8D_Weights"), "eval"],
     [("resnext101_64x4d", "ResNeXt101_64X4D_Weights"), "eval"],
-    pytest.param(
-        [("vgg11", "VGG11_Weights"), "eval"], marks=pytest.mark.compilation_xfail
-    ),
-    pytest.param(
-        [("vgg11_bn", "VGG11_BN_Weights"), "eval"], marks=pytest.mark.compilation_xfail
-    ),
-    pytest.param(
-        [("vgg13", "VGG13_Weights"), "eval"], marks=pytest.mark.compilation_xfail
-    ),
-    pytest.param(
-        [("vgg13_bn", "VGG13_BN_Weights"), "eval"], marks=pytest.mark.compilation_xfail
-    ),
-    pytest.param(
-        [("vgg16", "VGG16_Weights"), "eval"], marks=pytest.mark.compilation_xfail
-    ),
-    pytest.param(
-        [("vgg16_bn", "VGG16_BN_Weights"), "eval"], marks=pytest.mark.compilation_xfail
-    ),
-    pytest.param(
-        [("vgg19", "VGG19_Weights"), "eval"], marks=pytest.mark.compilation_xfail
-    ),
-    pytest.param(
-        [("vgg19_bn", "VGG19_BN_Weights"), "eval"], marks=pytest.mark.compilation_xfail
-    ),
+    [("vgg11", "VGG11_Weights"), "eval"],
+    [("vgg11_bn", "VGG11_BN_Weights"), "eval"],
+    [("vgg13", "VGG13_Weights"), "eval"],
+    [("vgg13_bn", "VGG13_BN_Weights"), "eval"],
+    [("vgg16", "VGG16_Weights"), "eval"],
+    [("vgg16_bn", "VGG16_BN_Weights"), "eval"],
+    [("vgg19", "VGG19_Weights"), "eval"],
+    [("vgg19_bn", "VGG19_BN_Weights"), "eval"],
     [("vit_b_16", "ViT_B_16_Weights"), "eval"],
     [("vit_b_32", "ViT_B_32_Weights"), "eval"],
     [("vit_l_16", "ViT_L_16_Weights"), "eval"],
@@ -96,31 +80,20 @@ def _load_inputs(self):
     [("regnet_x_8gf", "RegNet_X_8GF_Weights"), "eval"],
     [("regnet_x_16gf", "RegNet_X_16GF_Weights"), "eval"],
     [("regnet_x_32gf", "RegNet_X_32GF_Weights"), "eval"],
-    pytest.param(
-        [("swin_t", "Swin_T_Weights"), "eval"], marks=pytest.mark.compilation_xfail
-    ),
-    pytest.param(
-        [("swin_s", "Swin_S_Weights"), "eval"], marks=pytest.mark.compilation_xfail
-    ),
-    pytest.param(
-        [("swin_b", "Swin_B_Weights"), "eval"], marks=pytest.mark.compilation_xfail
-    ),
-    pytest.param(
-        [("swin_v2_t", "Swin_V2_T_Weights"), "eval"],
-        marks=pytest.mark.compilation_xfail,
-    ),
-    pytest.param(
-        [("swin_v2_s", "Swin_V2_S_Weights"), "eval"],
-        marks=pytest.mark.compilation_xfail,
-    ),
-    pytest.param(
-        [("swin_v2_b", "Swin_V2_B_Weights"), "eval"],
-        marks=pytest.mark.compilation_xfail,
-    ),
+    [("swin_t", "Swin_T_Weights"), "eval"],
+    [("swin_s", "Swin_S_Weights"), "eval"],
+    [("swin_b", "Swin_B_Weights"), "eval"],
+    [("swin_v2_t", "Swin_V2_T_Weights"), "eval"],
+    [("swin_v2_s", "Swin_V2_S_Weights"), "eval"],
+    [("swin_v2_b", "Swin_V2_B_Weights"), "eval"],
 ]
 
 
-@pytest.mark.parametrize("model_info_and_mode", model_info_and_mode_list)
+@pytest.mark.parametrize(
+    "model_info_and_mode",
+    model_info_and_mode_list,
+    ids=[info[0] for info, _ in model_info_and_mode_list],
+)
 @pytest.mark.parametrize("op_by_op", [True, False], ids=["op_by_op", "full"])
 def test_torchvision_image_classification(
     record_property, model_info_and_mode, op_by_op
@@ -137,7 +110,9 @@ def test_torchvision_image_classification(
     if op_by_op:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_info, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_info, mode, assert_pcc=True, assert_atol=False, compiler_config=cc
+    )
     results = tester.test_model()
     if mode == "eval":
         # Print the top 5 predictions

diff --git a/tests/models/torchvision/test_torchvision_object_detection.py b/tests/models/torchvision/test_torchvision_object_detection.py
@@ -36,18 +36,22 @@ def _load_inputs(self):
         return batch_t
 
 
+model_info_list = [
+    ("ssd300_vgg16", "SSD300_VGG16_Weights"),
+    ("ssdlite320_mobilenet_v3_large", "SSDLite320_MobileNet_V3_Large_Weights"),
+    ("retinanet_resnet50_fpn", "RetinaNet_ResNet50_FPN_Weights"),
+    ("retinanet_resnet50_fpn_v2", "RetinaNet_ResNet50_FPN_V2_Weights"),
+]
+
+
 @pytest.mark.parametrize(
     "mode",
     ["eval"],
 )
 @pytest.mark.parametrize(
     "model_info",
-    [
-        ("ssd300_vgg16", "SSD300_VGG16_Weights"),
-        ("ssdlite320_mobilenet_v3_large", "SSDLite320_MobileNet_V3_Large_Weights"),
-        ("retinanet_resnet50_fpn", "RetinaNet_ResNet50_FPN_Weights"),
-        ("retinanet_resnet50_fpn_v2", "RetinaNet_ResNet50_FPN_V2_Weights"),
-    ],
+    model_info_list,
+    ids=[model_info[0] for model_info in model_info_list],
 )
 @pytest.mark.parametrize("op_by_op", [True, False], ids=["op_by_op", "full"])
 def test_torchvision_object_detection(record_property, model_info, mode, op_by_op):
@@ -61,7 +65,9 @@ def test_torchvision_object_detection(record_property, model_info, mode, op_by_o
     if op_by_op:
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
-    tester = ThisTester(model_info, mode, compiler_config=cc)
+    tester = ThisTester(
+        model_info, mode, assert_pcc=True, assert_atol=False, compiler_config=cc
+    )
     results = tester.test_model()
     if mode == "eval":
         print(f"Model: {model_name} | Output: {results}")

diff --git a/tests/models/yolos/test_yolos.py b/tests/models/yolos/test_yolos.py
@@ -50,7 +50,7 @@ def test_yolos(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()
     if mode == "eval":

diff --git a/tests/models/yolov3/test_yolov3.py b/tests/models/yolov3/test_yolov3.py
@@ -70,7 +70,7 @@ def test_yolov3(record_property, mode, op_by_op):
         cc.compile_depth = CompileDepth.EXECUTE_OP_BY_OP
 
     tester = ThisTester(
-        model_name, mode, assert_on_output_mismatch=False, compiler_config=cc
+        model_name, mode, assert_pcc=False, assert_atol=False, compiler_config=cc
     )
     results = tester.test_model()