Skip to content

use AutoProcessor with use_fast=False since there's a bug with use_fa… #125

use AutoProcessor with use_fast=False since there's a bug with use_fa…

use AutoProcessor with use_fast=False since there's a bug with use_fa… #125

Workflow file for this run

name: Python Tests on M1 Mac
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
jobs:
unit_test:
runs-on: macos-14
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.12'
- name: Cache huggingface hub models
uses: actions/cache@v3
with:
path: ~/.cache/huggingface/hub
key: ${{ runner.os }}-huggingface-hub-${{ hashFiles('~/.cache/huggingface/hub/**/*') }}-${{ github.job }}
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
pip install .
- name: Run tests
run: |
# Check if cached files are present
ls ~/.cache/huggingface/hub/models--mlx-community--Meta-Llama-3-8B-Instruct-4bit/**/* || true
# Run unit tests
METAL_XCODE=1 python3 -m exo.inference.test_inference_engine
discovery_integration_test:
runs-on: macos-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
pip install .
- name: Run discovery integration test
run: |
# Start first instance
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 > output1.log 2>&1 &
PID1=$!
# Start second instance
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 > output2.log 2>&1 &
PID2=$!
# Wait for discovery
sleep 10
# Stop both instances
kill $PID1 $PID2
# Check outputs
if grep -q "Connected to peer" output1.log && grep -q "Connected to peer" output2.log; then
echo "Test passed: Both instances discovered each other"
exit 0
else
echo "Test failed: Devices did not discover each other"
echo "Output of first instance:"
cat output1.log
echo "Output of second instance:"
cat output2.log
exit 1
fi
chatgpt_api_integration_test:
runs-on: macos-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: '3.x'
- name: Cache huggingface hub models
uses: actions/cache@v3
with:
path: ~/.cache/huggingface/hub
key: ${{ runner.os }}-huggingface-hub-${{ hashFiles('~/.cache/huggingface/hub/**/*') }}-${{ github.job }}
restore-keys: |
${{ runner.os }}-huggingface-hub-
- name: Cache tinygrad downloaded models
uses: actions/cache@v3
with:
path: ~/Library/Caches/tinygrad/downloads
key: ${{ runner.os }}-tinygrad-downloads-${{ hashFiles('~/Library/Caches/tinygrad/downloads/**/*') }}-${{ github.job }}
restore-keys: |
${{ runner.os }}-tinygrad-downloads-
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
pip install .
- name: Run chatgpt api integration test
run: |
exit 0 # TODO
# Check if cached files are present
ls ~/.cache/huggingface/hub/models--mlx-community--Meta-Llama-3-8B-Instruct-4bit/**/* || true
# Start first instance
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --inference-engine mlx --listen-port 5678 --broadcast-port 5679 --chatgpt-api-port 8000 --chatgpt-api-response-timeout-secs 900 > output1.log 2>&1 &
PID1=$!
# Start second instance
DEBUG_DISCOVERY=9 DEBUG=9 python3 main.py --inference-engine mlx --listen-port 5679 --broadcast-port 5678 --chatgpt-api-port 8001 --chatgpt-api-response-timeout-secs 900 > output2.log 2>&1 &
PID2=$!
# Wait for discovery
sleep 10
# Function to check if processes are still running
check_processes() {
if ! kill -0 $PID1 2>/dev/null; then
echo "First instance (PID $PID1) died unexpectedly. Log output:"
cat output1.log
exit 1
fi
if ! kill -0 $PID2 2>/dev/null; then
echo "Second instance (PID $PID2) died unexpectedly. Log output:"
cat output2.log
exit 1
fi
}
# Check processes before proceeding
check_processes
# first one to load the model
curl -s http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "Keep responses concise. Placeholder to load model..."}],
"temperature": 0.7
}'
# Check processes after model load
check_processes
response_1=$(curl -s http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "Keep responses concise. Who was the king of pop?"}],
"temperature": 0.7
}')
echo "Response 1: $response_1"
# Check processes after first response
check_processes
response_2=$(curl -s http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3-8b",
"messages": [{"role": "user", "content": "Keep responses concise. Who was the king of pop?"}],
"temperature": 0.7
}')
echo "Response 2: $response_2"
# Check processes after second response
check_processes
# Stop both instances
kill $PID1 $PID2
echo ""
if ! echo "$response_1" | grep -q "Michael Jackson" || ! echo "$response_2" | grep -q "Michael Jackson"; then
echo "Test failed: Response does not contain 'Michael Jackson'"
echo "Response 1: $response_1"
echo ""
echo "Response 2: $response_2"
echo "Output of first instance:"
cat output1.log
echo "Output of second instance:"
cat output2.log
exit 1
else
echo "Test passed: Response from both nodes contains 'Michael Jackson'"
fi