[F134] neuronx-cc terminated abnormally #24

Open · nandeeka opened this issue Oct 4, 2024 · 7 comments
Labels: bug (Something isn't working)

Comments
@nandeeka (Contributor) commented Oct 4, 2024

I am trying to run what I think should be a valid kernel. However, I am getting the opaque error message [F134] neuronx-cc terminated abnormally. What does this error mean, and how do I go about debugging an error message like this?

The full kernel is:

from neuronxcc.nki import baremetal, benchmark
import neuronxcc.nki.language as nl
import neuronxcc.nki.isa as ni
import numpy as np

# @benchmark(save_neff_name='file.neff', save_trace_name='profile.ntff', additional_compile_opt=' --disable-internal-io-dge ')
@baremetal(save_neff_name='file.neff', additional_compile_opt=' --disable-internal-io-dge ')
def lora(I_DRAM, PW_DRAM, A_DRAM, SB_DRAM, O_DRAM, K2, K1, K0, M2, M1, M0, N2, N1, N0, R):
  for n2 in nl.affine_range(N2):
    for m2 in nl.affine_range(M2):

      # O_SBUF = nl.zeros((M1, nl.par_dim(M0), N1 * N0), dtype=O_DRAM.dtype, buffer=nl.sbuf)
      PO_SBUF = nl.zeros((M1, nl.par_dim(M0), N1 * N0), dtype=O_DRAM.dtype, buffer=nl.sbuf)
      DO_SBUF = nl.zeros((M1, nl.par_dim(M0), N1 * N0), dtype=O_DRAM.dtype, buffer=nl.sbuf)

      AI_SBUF = nl.zeros((N1, nl.par_dim(R), N0), dtype=O_DRAM.dtype, buffer=nl.sbuf)

      m_start = m2 * M1 * M0
      m_end = m_start + M1 * M0

      SB_SBUF = nl.load(SB_DRAM[:, m_start:m_end])

      for k2 in nl.affine_range(K2):
        PW_SBUF = nl.ndarray((M1, nl.par_dim(K0), K1, M0), dtype=PW_DRAM.dtype, buffer=nl.sbuf)
        I_SBUF = nl.ndarray((K1, nl.par_dim(K0), N1 * N0), dtype=I_DRAM.dtype, buffer=nl.sbuf)

        for m1 in nl.affine_range(M1):
          PW_SBUF[m1] = nl.load(PW_DRAM[m2, k2, m1])

        for k1 in nl.affine_range(K1):
          k_start = k2 * K1 * K0 + k1 * K0
          k_end = k_start + K0

          n_start = n2 * N1 * N0
          n_end = n_start + N1 * N0

          I_SBUF[k1] = nl.load(I_DRAM[k_start:k_end, n_start:n_end])

        for m1 in nl.affine_range(M1):
          for n1 in nl.affine_range(N1):
            PO_PSUM = nl.zeros((M0, N0), dtype=nl.float32, buffer=nl.psum)

            n_start = n1 * N0
            n_end = n_start + N0

            for k1 in nl.affine_range(K1):
              PO_PSUM += ni.nc_matmul(PW_SBUF[m1, :, k1], I_SBUF[k1, :, n_start:n_end])

            PO_SBUF[m1, :, n_start:n_end] = nl.loop_reduce(PO_PSUM, op=np.add, loop_indices=[k2], dtype=O_DRAM.dtype)

        # if m2 == 0:
        A_SBUF = nl.ndarray((K1, nl.par_dim(K0), R), dtype=A_DRAM.dtype, buffer=nl.sbuf)
        for k1 in nl.affine_range(K1):
          k_start = k2 * K1 * K0 + k1 * K0
          k_end = k_start + K0

          A_SBUF[k1] = nl.load(A_DRAM[k_start:k_end], mask=m2==0)

        for n1 in nl.affine_range(N1):
          AI_PSUM = nl.zeros((nl.par_dim(R), N0), dtype=nl.float32, buffer=nl.psum)

          n_start = n1 * N0
          n_end = n_start + N0

          for k1 in nl.affine_range(K1):
            AI_PSUM += ni.nc_matmul(A_SBUF[k1], I_SBUF[k1, :, n_start:n_end], mask=m2==0)

          AI_SBUF[n1] = nl.loop_reduce(AI_PSUM, op=np.add, loop_indices=[k2], dtype=O_DRAM.dtype, mask=m2==0)

        # endif
      for m1 in nl.affine_range(M1):
        for n1 in nl.affine_range(N1):
          DO_PSUM = nl.zeros((nl.par_dim(M0), N0), dtype=nl.float32, buffer=nl.psum)

          m_start = m1 * M0
          m_end = m_start + M0

          DO_PSUM[:] = ni.nc_matmul(SB_SBUF[:, m_start:m_end], AI_SBUF[n1])

          n_start = n1 * N0
          n_end = n_start + N0

          DO_SBUF[m1, :, n_start:n_end] = nl.loop_reduce(DO_PSUM, op=np.add, loop_indices=[k2], dtype=O_DRAM.dtype)

      for m1 in nl.affine_range(M1):
        m_start = m2 * M1 * M0 + m1 * M0
        m_end = m_start + M0

        n_start = n2 * N1 * N0
        n_end = n_start + N1 * N0

        O_SBUF = nl.add(PO_SBUF[m1], DO_SBUF[m1])

        nl.store(O_DRAM[m_start:m_end, n_start:n_end], value=O_SBUF)

def launch():
  K, M, N, R = (4096, 4096, 2048, 8)

  K0 = 128
  M0 = 128
  N0 = 512

  M1 = 4
  N1 = 4
  K1 = 8

  K2 = K // (K1 * K0)
  M2 = M // (M1 * M0)
  N2 = N // (N1 * N0)

  assert K2 * K1 * K0 == K
  assert M2 * M1 * M0 == M
  assert N2 * N1 * N0 == N

  PW = np.random.random_sample([M2, K2, M1, K0, K1, M0]).astype(np.float16)
  I = np.random.random_sample([K, N]).astype(np.float16)
  A = np.random.random_sample([K, R]).astype(np.float16)
  SB = np.random.random_sample([R, K]).astype(np.float16)
  O = np.ndarray(shape=[M, N], dtype=np.float16)

  lora(I, PW, A, SB, O, K2, K1, K0, M2, M1, M0, N2, N1, N0, R)

  return I, PW, A, SB, O

def main():
  I, PW, A, SB, O = launch()
  print(O[0, 0])

if __name__ == "__main__":
  main()

The full error message is:

[F134] neuronx-cc terminated abnormally - Please open a support ticket at https://github.com/aws-neuron/aws-neuron-sdk/issues/new
Traceback (most recent call last):
  File "/home/ubuntu/nki-kernels/out/../src/lora/nki_first_pass.py", line 130, in <module>
    main()
  File "/home/ubuntu/nki-kernels/out/../src/lora/nki_first_pass.py", line 126, in main
    I, PW, A, SB, O = launch()
  File "/home/ubuntu/nki-kernels/out/../src/lora/nki_first_pass.py", line 121, in launch
    lora(I, PW, A, SB, O, K2, K1, K0, M2, M1, M0, N2, N1, N0, R)
  File "neuronxcc/starfish/penguin/targets/nki/TraceKernel.py", line 756, in neuronxcc.starfish.penguin.targets.nki.TraceKernel.Kernel.__call__
  File "neuronxcc/starfish/penguin/targets/nki/TraceKernel.py", line 1254, in neuronxcc.starfish.penguin.targets.nki.TraceKernel.BaremetalKernel.post_process_call
  File "neuronxcc/starfish/penguin/targets/nki/TraceKernel.py", line 1257, in neuronxcc.starfish.penguin.targets.nki.TraceKernel.BaremetalKernel.post_process_call
  File "neuronxcc/starfish/penguin/targets/nki/TraceKernel.py", line 1314, in neuronxcc.starfish.penguin.targets.nki.TraceKernel.BaremetalKernel._compile
RuntimeError: Compilation failed for lora with error Command '['neuronx-cc', 'compile', '--framework', 'XLA', 'penguin.py', '--internal-tensorizer-opt-level=nki', '--pipeline', 'compile', 'SaveTemps', '--target', 'trn1', '--disable-internal-io-dge', '--disable-internal-io-dge', '--output=file.neff']' returned non-zero exit status 70.

My pip freeze is:

absl-py==2.1.0
aiohappyeyeballs==2.4.0
aiohttp==3.10.5
aiosignal==1.3.1
amqp==5.2.0
annotated-types==0.7.0
ansicolors==1.1.8
anyio==4.4.0
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.3.0
astroid==3.2.4
asttokens==2.4.1
async-lru==2.0.4
async-timeout==4.0.3
attrs==24.2.0
Automat==24.8.1
aws-neuronx-runtime-discovery==2.9
awscli==1.34.19
babel==2.16.0
beautifulsoup4==4.12.3
billiard==4.2.0
bleach==6.1.0
boto3==1.35.19
botocore==1.35.19
build==1.2.2
cachetools==5.5.0
celery==5.4.0
certifi==2024.8.30
cffi==1.17.1
charset-normalizer==3.3.2
click==8.1.7
click-didyoumean==0.3.1
click-plugins==1.1.1
click-repl==0.3.0
cloud-tpu-client==0.10
cloudpickle==3.0.0
cmake==3.30.3
colorama==0.4.6
comm==0.2.2
constantly==23.10.4
contourpy==1.3.0
cryptography==43.0.1
cssselect==1.2.0
cycler==0.12.1
dask==2024.9.0
debugpy==1.8.5
decorator==5.1.1
defusedxml==0.7.1
dill==0.3.8
distlib==0.3.8
docutils==0.16
dparse==0.6.3
ec2-metadata==2.13.0
entrypoints==0.4
environment-kernels==1.2.0
exceptiongroup==1.2.2
executing==2.1.0
fastapi==0.114.2
fastjsonschema==2.20.0
filelock==3.16.0
fonttools==4.53.1
fqdn==1.5.1
frozenlist==1.4.1
fsspec==2024.9.0
google-api-core==1.34.1
google-api-python-client==1.8.0
google-auth==2.34.0
google-auth-httplib2==0.2.0
googleapis-common-protos==1.65.0
h11==0.14.0
httpcore==1.0.5
httpie==3.2.3
httplib2==0.22.0
httpx==0.27.2
hyperlink==21.0.0
idna==3.10
imageio==2.35.1
importlib_metadata==8.5.0
incremental==24.7.2
iniconfig==2.0.0
ipykernel==6.29.5
ipython==8.27.0
ipywidgets==8.1.5
islpy==2023.2.5
isoduration==20.11.0
isort==5.13.2
itemadapter==0.9.0
itemloaders==1.3.1
jedi==0.19.1
Jinja2==3.1.4
jmespath==1.0.1
joblib==1.4.2
json5==0.9.25
jsonpointer==3.0.0
jsonschema==4.23.0
jsonschema-specifications==2023.12.1
jupyter==1.1.1
jupyter-console==6.6.3
jupyter-events==0.10.0
jupyter-lsp==2.2.5
jupyter_client==8.6.2
jupyter_core==5.7.2
jupyter_server==2.14.2
jupyter_server_terminals==0.5.3
jupyterlab==4.2.5
jupyterlab_pygments==0.3.0
jupyterlab_server==2.27.3
jupyterlab_widgets==3.0.13
kiwisolver==1.4.7
kombu==5.4.1
libneuronxla==2.0.4115.0
llvmlite==0.43.0
locket==1.0.0
lockfile==0.12.2
lxml==5.3.0
markdown-it-py==3.0.0
MarkupSafe==2.1.5
matplotlib==3.9.2
matplotlib-inline==0.1.7
mccabe==0.7.0
mdurl==0.1.2
mistune==3.0.2
ml-dtypes==0.2.0
mpmath==1.3.0
multidict==6.1.0
nbclient==0.10.0
nbconvert==7.16.4
nbformat==5.10.4
nest-asyncio==1.6.0
networkx==2.8.8
neuronx-cc==2.15.128.0+56dc5a86
neuronx-distributed==0.9.0
neuronx-distributed-training==1.0.0
notebook==7.2.2
notebook_shim==0.2.4
numba==0.60.0
numpy==1.25.2
nvidia-cublas-cu12==12.1.3.1
nvidia-cuda-cupti-cu12==12.1.105
nvidia-cuda-nvrtc-cu12==12.1.105
nvidia-cuda-runtime-cu12==12.1.105
nvidia-cudnn-cu12==8.9.2.26
nvidia-cufft-cu12==11.0.2.54
nvidia-curand-cu12==10.3.2.106
nvidia-cusolver-cu12==11.4.5.107
nvidia-cusparse-cu12==12.1.0.106
nvidia-nccl-cu12==2.18.1
nvidia-nvjitlink-cu12==12.6.68
nvidia-nvtx-cu12==12.1.105
oauth2client==4.1.3
opencv-python==4.10.0.84
overrides==7.7.0
packaging==21.3
pandas==2.2.2
pandocfilters==1.5.1
papermill==2.6.0
parsel==1.9.1
parso==0.8.4
partd==1.4.2
pexpect==4.9.0
pgzip==0.3.5
pillow==10.4.0
pip-tools==7.4.1
pipenv==2024.0.2
platformdirs==4.3.3
plotly==5.24.1
pluggy==1.5.0
prometheus_client==0.20.0
prompt_toolkit==3.0.47
Protego==0.3.1
protobuf==3.20.3
psutil==6.0.0
ptyprocess==0.7.0
pure_eval==0.2.3
pyasn1==0.6.1
pyasn1_modules==0.4.1
pycparser==2.22
pydantic==2.9.1
pydantic_core==2.23.3
PyDispatcher==2.0.7
Pygments==2.18.0
pylint==3.2.7
pyOpenSSL==24.2.1
pyparsing==3.1.4
pyproject_hooks==1.1.0
PySocks==1.7.1
pytest==8.3.3
python-daemon==3.0.1
python-dateutil==2.9.0.post0
python-json-logger==2.0.7
pytz==2024.2
PyYAML==6.0.2
pyzmq==26.2.0
queuelib==1.7.0
referencing==0.35.1
requests==2.31.0
requests-file==2.1.0
requests-toolbelt==1.0.0
requests-unixsocket==0.3.0
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
rich==13.8.1
rpds-py==0.20.0
rsa==4.7.2
ruamel.yaml==0.18.6
ruamel.yaml.clib==0.2.8
s3transfer==0.10.2
safetensors==0.4.5
safety==2.3.5
scikit-learn==1.5.2
scipy==1.11.2
Scrapy==2.11.2
seaborn==0.13.2
Send2Trash==1.8.3
service-identity==24.1.0
shap==0.46.0
six==1.16.0
slicer==0.0.8
sniffio==1.3.1
soupsieve==2.6
stack-data==0.6.3
starlette==0.38.5
sympy==1.13.2
tenacity==9.0.0
terminado==0.18.1
threadpoolctl==3.5.0
tinycss2==1.3.0
tldextract==5.1.2
tomli==2.0.1
tomlkit==0.13.2
toolz==0.12.1
torch==2.1.2
torch-neuronx==2.1.2.2.3.0
torch-xla==2.1.4
torchvision==0.16.2
tornado==6.4.1
tqdm==4.66.5
traitlets==5.14.3
triton==2.1.0
Twisted==24.7.0
types-python-dateutil==2.9.0.20240906
typing_extensions==4.12.2
tzdata==2024.1
uri-template==1.3.0
uritemplate==3.0.1
urllib3==2.2.3
vine==5.1.0
virtualenv==20.26.4
w3lib==2.2.1
wcwidth==0.2.13
webcolors==24.8.0
webencodings==0.5.1
websocket-client==1.8.0
wget==3.2
widgetsnbextension==4.0.13
yarl==1.11.1
zipp==3.20.2
zope.interface==7.0.3
@jeffhataws

Thanks @nandeeka for filing the issue. We will take a look.

@JonathanHenson (Contributor) commented Oct 5, 2024

Hi Nandeeka, I'm taking a look to reproduce. If you have one, could you also provide the contents of the compiler log?

JonathanHenson self-assigned this Oct 5, 2024
@nandeeka (Contributor, issue author) commented Oct 5, 2024

Hi Jonathan,
Where do I see the compiler log? Following the instructions here, I tried printing it to the console with:

export NEURON_RT_LOG_LOCATION=console
export NEURON_RT_LOG_LEVEL=INFO

But this does not seem to have done anything.
Thanks!

@jeffhataws

Hi @nandeeka, will you try adding an additional_compile_opt="--verbose debug" argument to the baremetal decorator?
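
For example, a minimal sketch based on the decorator already used in the kernel above; combining the suggested --verbose debug with the existing --disable-internal-io-dge flag in a single additional_compile_opt string is an assumption here, not a documented requirement:

from neuronxcc.nki import baremetal

# Same decorator as in the original kernel, with --verbose debug appended so the
# compiler log is emitted during compilation.
@baremetal(save_neff_name='file.neff',
           additional_compile_opt=' --disable-internal-io-dge --verbose debug ')
def lora(I_DRAM, PW_DRAM, A_DRAM, SB_DRAM, O_DRAM, K2, K1, K0, M2, M1, M0, N2, N1, N0, R):
  ...  # kernel body unchanged from the listing above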

@nandeeka (Contributor, issue author) commented Oct 5, 2024

This worked. It looks like the error was:

2024-10-05T22:49:01Z ERROR 3808 [job.WalrusDriver.0]: Backend exited with code -6 and stderr: No existing axis k2 found in instruction I-33's parent list
walrus_driver: /local/p4clients/pkgbuild-const/workspace/src/KaenaCompiler/neuronxcc/walrus/ir/lib/IR/BasicBlockHolder.cpp:150: bir::LoopAxis* bir::BasicBlockHolder::findAxis(const string&, bir::Instruction*): Assertion `false && "No existing axis found"' failed.

After inspecting all instructions involving k2, I figured out which one was creating the problem, and I fixed it. I guess my remaining question is, is there any way for me to figure out which instruction was instruction I-33? As kernels get bigger, manually inspecting all relevant instructions becomes more and more challenging.
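
For anyone hitting the same assertion, here is a minimal, hypothetical sketch of the kind of pattern that can trigger it (names and shapes are assumptions, and this is not necessarily the exact fix applied above): nl.loop_reduce's loop_indices must name a loop that still encloses the instruction, so referencing k2 after its loop has closed leaves the backend with no k2 axis to find.

from neuronxcc.nki import baremetal
import neuronxcc.nki.language as nl
import neuronxcc.nki.isa as ni
import numpy as np

# Assumed DRAM shapes: W_DRAM (K2, 128, 128), X_DRAM (K2, 128, 512), O_DRAM (128, 512).
@baremetal(save_neff_name='repro.neff')
def repro(X_DRAM, W_DRAM, O_DRAM, K2):
  # Accumulate a (128, 512) output tile over the k2 loop.
  ACC_SBUF = nl.zeros((1, nl.par_dim(128), 512), dtype=O_DRAM.dtype, buffer=nl.sbuf)

  for k2 in nl.affine_range(K2):
    W_SBUF = nl.load(W_DRAM[k2])   # (128, 128) stationary tile
    X_SBUF = nl.load(X_DRAM[k2])   # (128, 512) moving tile

    PSUM = nl.zeros((nl.par_dim(128), 512), dtype=nl.float32, buffer=nl.psum)
    PSUM += ni.nc_matmul(W_SBUF, X_SBUF)

    # OK: this loop_reduce sits inside the k2 loop that it names.
    ACC_SBUF[0] = nl.loop_reduce(PSUM, op=np.add, loop_indices=[k2], dtype=O_DRAM.dtype)

  # Problematic: the k2 loop has already closed here, so a loop_reduce with
  # loop_indices=[k2] at this point has no enclosing k2 axis, which matches
  # "No existing axis k2 found in instruction ...'s parent list".
  # ACC_SBUF[0] = nl.loop_reduce(PSUM, op=np.add, loop_indices=[k2], dtype=O_DRAM.dtype)

  nl.store(O_DRAM[0:128, 0:512], value=ACC_SBUF[0])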

@JonathanHenson (Contributor)

This actually works with the simulator as is, so we will need to look further into why it is correct at the NKI insertion point but incorrect in the backend:

updated code:

def test_lora(self):
    K, M, N, R = (4096, 4096, 2048, 8)

    K0 = 128
    M0 = 128
    N0 = 512

    M1 = 4
    N1 = 4
    K1 = 8

    K2 = K // (K1 * K0)
    M2 = M // (M1 * M0)
    N2 = N // (N1 * N0)

    assert K2 * K1 * K0 == K
    assert M2 * M1 * M0 == M
    assert N2 * N1 * N0 == N

    PW = np.random.random_sample([M2, K2, M1, K0, K1, M0]).astype(np.float16)
    I = np.random.random_sample([K, N]).astype(np.float16)
    A = np.random.random_sample([K, R]).astype(np.float16)
    SB = np.random.random_sample([R, K]).astype(np.float16)
    O = np.ndarray(shape=[M, N], dtype=np.float16)

    nki.simulate_kernel(lora, I, PW, A, SB, O, K2, K1, K0, M2, M1, M0, N2, N1, N0, R)
    print(O[0,0])
    return I, PW, A, SB, O

output:

4890.0

@JonathanHenson (Contributor)

Quoting @nandeeka: "is there any way for me to figure out which instruction was instruction I-33? As kernels get bigger, manually inspecting all relevant instructions becomes more and more challenging."

I-33 would be the 33rd instruction emitted by the kernel.

As for a better way to see which instruction maps to which line of code, we should be able to correlate it back to the kernel's debug info. I am adding this to our backlog so it is clearer what went wrong.

JonathanHenson added the enhancement (New feature or request) label Oct 7, 2024
JonathanHenson transferred this issue from aws-neuron/aws-neuron-sdk Oct 7, 2024
awsjoshir added enhancement (New feature or request) and removed enhancement (New feature or request) labels Oct 22, 2024
aws-taylor added bug (Something isn't working) and removed enhancement (New feature or request) labels Nov 8, 2024