Merge remote-tracking branch 'origin/main' into fp8_check
jainapurva committed Nov 25, 2024
2 parents 436d3aa + 9bb1b23 commit dfe2eb7
Showing 52 changed files with 3,222 additions and 368 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build_wheels_linux.yml
@@ -43,6 +43,8 @@ jobs:
       # triggered daily from main with a schedule
       repository: pytorch/ao
       ref: ""
+      test-infra-repository: pytorch/test-infra
+      test-infra-ref: main
       build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
       env-var-script: packaging/env_var_script_linux.sh
       pre-script: packaging/pre_build_script.sh
6 changes: 3 additions & 3 deletions .github/workflows/regression_test.yml
@@ -25,12 +25,12 @@ jobs:
        include:
          - name: CUDA Nightly
            runs-on: linux.g5.12xlarge.nvidia.gpu
-           torch-spec: '--pre torch==2.6.0.dev20241101 --index-url https://download.pytorch.org/whl/nightly/cu121'
+           torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu124'
            gpu-arch-type: "cuda"
-           gpu-arch-version: "12.1"
+           gpu-arch-version: "12.4"
          - name: CPU Nightly
            runs-on: linux.4xlarge
-           torch-spec: '--pre torch==2.6.0.dev20241101 --index-url https://download.pytorch.org/whl/nightly/cpu'
+           torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cpu'
            gpu-arch-type: "cpu"
            gpu-arch-version: ""
 
5 changes: 4 additions & 1 deletion .gitignore
@@ -371,4 +371,7 @@ venv/
 sweep/
 
 # Model checkpoints
-checkpoints/
+checkpoints/
+
+# Experimental
+torchao/experimental/cmake-out
2 changes: 1 addition & 1 deletion README.md
@@ -177,7 +177,7 @@ We're also fortunate to be integrated into some of the leading open-source libra
 2. Hugging Face diffusers best practices with torch.compile and torchao in a standalone repo [diffusers-torchao](https://github.com/sayakpaul/diffusers-torchao)
 3. Mobius HQQ backend leveraged our int4 kernels to get [195 tok/s on a 4090](https://github.com/mobiusml/hqq#faster-inference)
 4. [TorchTune](https://github.com/pytorch/torchtune) for our QLoRA and QAT recipes
-5. [torchchat](https://github.com/pytorch/torchtune) for post training quantization
+5. [torchchat](https://github.com/pytorch/torchchat) for post training quantization
 6. [SGLang](https://github.com/sgl-project/sglang/pull/1341) for LLM inference quantization
 
 ## Videos
10 changes: 5 additions & 5 deletions examples/sam2_amg_server/README.md
@@ -8,7 +8,7 @@ curl -X POST http://127.0.0.1:5000/upload -F 'image=@/path/to/file.jpg' --output
 Start the server
 
 ```
-python server.py ~/checkpoints/sam2 --port <your_port> --host <your_hostname> --fast
+python server.py ~/checkpoints/sam2 large --port <your_port> --host <your_hostname> --fast
 ```
 
 Collect the rles
@@ -58,7 +58,7 @@ Make sure you've installed https://github.com/facebookresearch/sam2
 
 Start server
 ```
-python server.py ~/checkpoints/sam2 --port <your_port> --host <your_hostname> --baseline
+python server.py ~/checkpoints/sam2 large --port <your_port> --host <your_hostname> --baseline
 ```
 
 Generate and save rles (one line per json via `-w "\n"`)
@@ -73,7 +73,7 @@ sys 0m4.137s
 ### 3. Start server with torchao variant of SAM2
 Start server
 ```
-python server.py ~/checkpoints/sam2 --port <your_port> --host <your_hostname>
+python server.py ~/checkpoints/sam2 large --port <your_port> --host <your_hostname>
 ```
 
 Generate and save rles (one line per json via `-w "\n"`)
@@ -88,7 +88,7 @@ sys 0m4.350s
 ### 4. Start server with torchao variant of SAM2 and `--fast` optimizations
 Start server
 ```
-python server.py ~/checkpoints/sam2 --port <your_port> --host <your_hostname> --fast
+python server.py ~/checkpoints/sam2 large --port <your_port> --host <your_hostname> --fast
 ```
 
 Generate and save rles (one line per json via `-w "\n"`)
@@ -103,7 +103,7 @@ sys 0m4.138s
 ### 5. Start server with torchao variant of SAM2 and `--fast` and `--furious` optimizations
 Start server
 ```
-python server.py ~/checkpoints/sam2 --port <your_port> --host <your_hostname> --fast --furious
+python server.py ~/checkpoints/sam2 large --port <your_port> --host <your_hostname> --fast --furious
 ```
 
 Generate and save rles (one line per json via `-w "\n"`)
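
Each command above now passes a model size (`large`) as the second positional argument. As a quick reference, the accepted sizes and the checkpoints they resolve to, mirrored from `MODEL_TYPES_TO_MODEL` in the `server.py` changes later in this commit:

```python
# Quick reference only; the authoritative mapping lives in server.py below.
MODEL_SIZES = {
    "tiny": "sam2.1_hiera_tiny.pt",
    "small": "sam2.1_hiera_small.pt",
    "plus": "sam2.1_hiera_base_plus.pt",
    "large": "sam2.1_hiera_large.pt",
}
```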
48 changes: 48 additions & 0 deletions examples/sam2_amg_server/cli.py
@@ -0,0 +1,48 @@
import fire
import logging
import matplotlib.pyplot as plt
from server import file_bytes_to_image_tensor
from server import show_anns
from server import model_type_to_paths
from server import MODEL_TYPES_TO_MODEL
from torchao._models.sam2.build_sam import build_sam2
from torchao._models.sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
from torchao._models.sam2.utils.amg import rle_to_mask
from io import BytesIO

def main_docstring():
    return f"""
    Args:
        checkpoint_path (str): Path to folder containing checkpoints from https://github.com/facebookresearch/sam2?tab=readme-ov-file#download-checkpoints
        model_type (str): Choose from one of {", ".join(MODEL_TYPES_TO_MODEL.keys())}
        input_path (str): Path to input image
        output_path (str): Path to output image
    """

def main(checkpoint_path, model_type, input_path, output_path, points_per_batch=1024, output_format='png', verbose=False):
    device = "cuda"
    sam2_checkpoint, model_cfg = model_type_to_paths(checkpoint_path, model_type)
    if verbose:
        print(f"Loading model {sam2_checkpoint} with config {model_cfg}")
    sam2 = build_sam2(model_cfg, sam2_checkpoint, device=device, apply_postprocessing=False)
    mask_generator = SAM2AutomaticMaskGenerator(sam2, points_per_batch=points_per_batch, output_mode="uncompressed_rle")
    image_tensor = file_bytes_to_image_tensor(bytearray(open(input_path, 'rb').read()))
    if verbose:
        print(f"Loaded image of size {tuple(image_tensor.shape)} and generating mask.")
    masks = mask_generator.generate(image_tensor)

    # Save an example
    plt.figure(figsize=(image_tensor.shape[1]/100., image_tensor.shape[0]/100.), dpi=100)
    plt.imshow(image_tensor)
    show_anns(masks, rle_to_mask)
    plt.axis('off')
    plt.tight_layout()
    buf = BytesIO()
    plt.savefig(buf, format=output_format)
    buf.seek(0)
    with open(output_path, "wb") as file:
        file.write(buf.getvalue())

main.__doc__ = main_docstring()
if __name__ == "__main__":
    fire.Fire(main)
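
Since the entry point is plain `fire.Fire(main)`, the same function can also be driven from Python. A minimal sketch; the checkpoint folder and image file names here are hypothetical placeholders:

```python
# Hypothetical invocation of cli.main; adjust paths to your environment.
from cli import main

main(
    checkpoint_path="/path/to/checkpoints/sam2",  # folder holding SAM2 checkpoints
    model_type="large",                           # one of: tiny, small, plus, large
    input_path="dog.jpg",
    output_path="dog_masks.png",
    points_per_batch=1024,
    verbose=True,
)
```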
1 change: 1 addition & 0 deletions examples/sam2_amg_server/requirements.txt
@@ -7,3 +7,4 @@ hydra-core
 tqdm
 iopath
 python-multipart
+requests
67 changes: 64 additions & 3 deletions examples/sam2_amg_server/server.py
@@ -1,4 +1,5 @@
 import itertools
+import requests
 import uvicorn
 import fire
 import tempfile
@@ -37,6 +38,23 @@
 # torch._dynamo.config.capture_dynamic_output_shape_ops = True
 torch._dynamo.config.capture_dynamic_output_shape_ops = True
 
+def download_file(url, download_dir):
+    # Create the directory if it doesn't exist
+    download_dir = Path(download_dir)
+    download_dir.mkdir(parents=True, exist_ok=True)
+    # Extract the file name from the URL
+    file_name = url.split('/')[-1]
+    # Define the full path for the downloaded file
+    file_path = download_dir / file_name
+    # Download the file
+    response = requests.get(url, stream=True)
+    response.raise_for_status()  # Raise an error for bad responses
+    # Write the file to the specified directory
+    print(f"Downloading '{file_name}' to '{download_dir}'")
+    with open(file_path, 'wb') as file:
+        for chunk in response.iter_content(chunk_size=8192):
+            file.write(chunk)
+    print(f"Downloaded '{file_name}' to '{download_dir}'")
 
 def example_shapes():
     return [(848, 480, 3),
@@ -272,7 +290,51 @@ def unittest_fn(masks, ref_masks, order_by_area=False, verbose=False):
     print(f"mIoU is {miou} with equal count {equal_count} out of {len(masks)}")
 
 
+MODEL_TYPES_TO_CONFIG = {
+    "tiny": "sam2.1_hiera_t.yaml",
+    "small": "sam2.1_hiera_s.yaml",
+    "plus": "sam2.1_hiera_b+.yaml",
+    "large": "sam2.1_hiera_l.yaml",
+}
+
+MODEL_TYPES_TO_MODEL = {
+    "tiny": "sam2.1_hiera_tiny.pt",
+    "small": "sam2.1_hiera_small.pt",
+    "plus": "sam2.1_hiera_base_plus.pt",
+    "large": "sam2.1_hiera_large.pt",
+}
+
+
+MODEL_TYPES_TO_URL = {
+    "tiny": "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_tiny.pt",
+    "small": "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_small.pt",
+    "plus": "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_base_plus.pt",
+    "large": "https://dl.fbaipublicfiles.com/segment_anything_2/092824/sam2.1_hiera_large.pt",
+}
+
+
+def main_docstring():
+    return f"""
+    Args:
+        checkpoint_path (str): Path to folder containing checkpoints from https://github.com/facebookresearch/sam2?tab=readme-ov-file#download-checkpoints
+        model_type (str): Choose from one of {", ".join(MODEL_TYPES_TO_MODEL.keys())}
+    """
+
+
+def model_type_to_paths(checkpoint_path, model_type):
+    if model_type not in MODEL_TYPES_TO_CONFIG.keys():
+        raise ValueError(f"Expected model_type to be one of {', '.join(MODEL_TYPES_TO_MODEL.keys())} but got {model_type}")
+    sam2_checkpoint = Path(checkpoint_path) / Path(MODEL_TYPES_TO_MODEL[model_type])
+    if not sam2_checkpoint.exists():
+        print(f"Can't find checkpoint {sam2_checkpoint} in folder {checkpoint_path}. Downloading.")
+        download_file(MODEL_TYPES_TO_URL[model_type], checkpoint_path)
+        assert sam2_checkpoint.exists(), "Can't find downloaded file. Please open an issue."
+    model_cfg = f"configs/sam2.1/{MODEL_TYPES_TO_CONFIG[model_type]}"
+    return sam2_checkpoint, model_cfg
+
+
 def main(checkpoint_path,
+         model_type,
          baseline=False,
          fast=False,
          furious=False,
@@ -306,9 +368,7 @@ def main(checkpoint_path,
     from torchao._models.sam2.utils.amg import rle_to_mask
 
     device = "cuda"
-    from pathlib import Path
-    sam2_checkpoint = Path(checkpoint_path) / Path("sam2.1_hiera_large.pt")
-    model_cfg = "configs/sam2.1/sam2.1_hiera_l.yaml"
+    sam2_checkpoint, model_cfg = model_type_to_paths(checkpoint_path, model_type)
 
     logging.info(f"Loading model {sam2_checkpoint} with config {model_cfg}")
     sam2 = build_sam2(model_cfg, sam2_checkpoint, device=device, apply_postprocessing=False)
@@ -450,5 +510,6 @@ async def upload_image(image: UploadFile = File(...)):
     # uvicorn.run(app, host=host, port=port, log_level="info")
     uvicorn.run(app, host=host, port=port)
 
+main.__doc__ = main_docstring()
 if __name__ == "__main__":
     fire.Fire(main)
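
Taken together, these additions give `server.py` lazy checkpoint fetching: a short model size resolves to a checkpoint file and Hydra config, downloading the checkpoint on first use. A sketch of the flow, with a hypothetical checkpoint folder:

```python
# "/path/to/checkpoints/sam2" is a placeholder folder.
sam2_checkpoint, model_cfg = model_type_to_paths("/path/to/checkpoints/sam2", "large")
# model_type_to_paths validates "large" against MODEL_TYPES_TO_CONFIG, calls
# download_file(MODEL_TYPES_TO_URL["large"], ...) if sam2.1_hiera_large.pt is
# missing, and returns:
#   sam2_checkpoint -> /path/to/checkpoints/sam2/sam2.1_hiera_large.pt
#   model_cfg       -> "configs/sam2.1/sam2.1_hiera_l.yaml"
```
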
3 changes: 3 additions & 0 deletions test/dtypes/test_affine_quantized.py
@@ -156,6 +156,9 @@ class TestAffineQuantizedBasic(TestCase):
     @common_utils.parametrize("device", COMMON_DEVICES)
     @common_utils.parametrize("dtype", COMMON_DTYPES)
     def test_flatten_unflatten(self, apply_quant, device, dtype):
+        if device == "cpu":
+            self.skipTest(f"Temporarily skipping for {device}")
+
         linear = torch.nn.Linear(128, 256, dtype=dtype, device=device)
         ql = apply_quant(linear)
         lp_tensor = ql.weight
25 changes: 1 addition & 24 deletions test/float8/test_dtensor.py
@@ -15,8 +15,6 @@
 
 import pytest
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
 
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_5
 
@@ -49,6 +47,7 @@
 )
 from torchao.float8.float8_utils import e4m3_dtype, tensor_to_scale
 from torchao.float8.fsdp_utils import WeightWithDynamicFloat8CastTensor
+from torchao.testing.float8.dtensor_utils import ToyModel
 
 
 def setup_distributed():
@@ -59,28 +58,6 @@ def setup_distributed():
     return device_mesh
 
 
-class FeedForward(nn.Module):
-    """MLP based model"""
-
-    def __init__(self):
-        super(FeedForward, self).__init__()
-        self.w1 = nn.Linear(16, 32, bias=False)
-        self.w2 = nn.Linear(16, 32, bias=False)
-        self.out_proj = nn.Linear(32, 16, bias=False)
-
-    def forward(self, x):
-        return self.out_proj(F.silu(self.w1(x)) * self.w2(x))
-
-
-class ToyModel(nn.Module):
-    def __init__(self):
-        super(ToyModel, self).__init__()
-        self.ffn = FeedForward()
-
-    def forward(self, x):
-        return self.ffn(x)
-
-
 def _test_scaled_mm(mesh: DeviceMesh, size=16):
     device = mesh.device_type
     fp8_dtype = e4m3_dtype
4 changes: 4 additions & 0 deletions test/float8/test_dtensor.sh
@@ -8,4 +8,8 @@ if python -c 'import torch;print(torch.cuda.is_available())' | grep -q "False";
     exit
 fi
 
+# integration tests for TP/SP
 NCCL_DEBUG=WARN torchrun --nproc_per_node 2 test/float8/test_dtensor.py
+
+# integration smoke tests for FSDP2 + TP
+NCCL_DEBUG=WARN torchrun --nproc_per_node 4 test/float8/test_fsdp2_tp.py